Compare commits


3 Commits
v25.1 ... v25.4

Author      SHA1        Message  Date
Jay D Dee   66191db93c  v25.4    2025-06-20 20:31:41 -04:00
Jay D Dee   dd99580a4c  v25.3    2025-01-16 12:31:53 -05:00
Jay D Dee   1ed18bf22e  v25.2    2025-01-12 18:58:21 -05:00
94 changed files with 2865 additions and 5034 deletions

View File

@@ -1,31 +1,35 @@
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# using a different path for each CPU arch.
if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif
else
if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
JANSSON_INCLUDES=
EXTRA_INCLUDES =
EXTRA_LIBS =
endif
# Hook for GMP on MacOS which is provided by Homebrew.
# Homebrew has different linkage on x86_64 & ARM64.
# This would need complex expressions, nesting or elseif; none seem to work.
if !HAVE_APPLE
GMP_INCLUDES =
GMP_LIB = -lgmp
endif
if ARM64_APPLE
GMP_INCLUDES = -I/opt/homebrew/include
GMP_LIB = /opt/homebrew/lib/libgmp.a
endif
if X86_64_APPLE
GMP_INCLUDES = -I/usr/local/include
GMP_LIB = /usr/local/lib/libgmp.a
endif
EXTRA_DIST = example-cfg.json nomacro.pl
SUBDIRS = compat
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) $(GMP_INCLUDES) -I.
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
bin_PROGRAMS = cpuminer
@@ -39,6 +43,7 @@ cpuminer_SOURCES = \
sysinfos.c \
algo-gate-api.c\
malloc-huge.c \
simd-utils/simd-constants.c \
algo/argon2d/argon2d-gate.c \
algo/argon2d/blake2/blake2b.c \
algo/argon2d/argon2d/argon2.c \
@@ -288,21 +293,20 @@ cpuminer_SOURCES = \
algo/yespower/yespower-opt.c \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c
disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif
if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif
if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags = -DNOASM
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ $(GMP_LIB)
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
@@ -312,9 +316,6 @@ endif
if HAVE_WINDOWS
# Add -U_WIN32_WINNT to command line CFLAGS to undefine
cpuminer_CFLAGS += -D_WIN32_WINNT=0x0601
# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg

View File

@@ -36,34 +36,18 @@ for compile instructions.
Requirements
------------
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.
Mobile CPUs, such as those in laptop computers, are not recommended because they
aren't designed for the extreme heat of operating at full load for extended
periods of time.
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. WindowsXP 64 bit is YMMV.
FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, IOS and alt OSs like Haiku & ReactOS are not supported.
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblocktemplate using http:// or https://.
Supported Algorithms
--------------------
@@ -71,9 +55,9 @@ Supported Algorithms
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
argon2d250
argon2d500
argon2d4096
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256

View File

@@ -75,6 +75,27 @@ If not what makes it happen or not happen?
Change Log
----------
v25.4
x86_64: improved handling of vector constants used for byte permutations.
x86_64: removed hooks for cancelled AVX10-256.
Minor bug fixes & improvements.
More code cleanup.
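The first v25.4 item, improved handling of vector constants used for byte permutations, shows up in the diffs below as per-call-site _mm256_shuffle_epi8 / _mm512_shuffle_epi8 constants being replaced by shared helpers such as mm256_bswap_32 and mm512_bswap_64. A minimal sketch of what such a 32-bit lane byte swap presumably reduces to on AVX2, using only the shuffle control bytes visible in the removed lines; the function name here is illustrative, not the project's:

    #include <immintrin.h>

    // Sketch only: byte-swap each 32-bit lane of a 256-bit vector.
    // The control bytes are the same ones written inline at every call
    // site before v25.4 (0x0c0d0e0f08090a0b, 0x0405060700010203).
    static inline __m256i bswap32_avx2_sketch( __m256i v )
    {
        const __m256i ctl = _mm256_set_epi64x( 0x0c0d0e0f08090a0bULL,
                                               0x0405060700010203ULL,
                                               0x0c0d0e0f08090a0bULL,
                                               0x0405060700010203ULL );
        return _mm256_shuffle_epi8( v, ctl );
    }

Centralizing the constant in one helper presumably lets the compiler share it instead of materializing it separately in every compress routine, which is what the hunks below remove.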
v25.3
#442, #443: Fixed a regression in Makefile.am.
Removed algo features log display.
Some code cleanup.
v25.2
ARM: Fixed a regression from v25.1 that could cause the build to fail.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with the installed jansson library instead of compiling the included source code.
Windows: removed "_WIN32_WINNT=0x0601", which was a downgrade on Win11.
Changed build.sh shell from bash to sh.
v25.1
MacOS ARM64: m7m algo is now working.

View File

@@ -295,8 +295,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
{
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
@@ -416,8 +416,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
// alias proper
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },

View File

@@ -6,9 +6,7 @@ static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Input ...
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
// Credits
void argon2d_crds_hash( void *output, const void *input )
void argon2d250_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -34,7 +32,7 @@ void argon2d_crds_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -50,7 +48,7 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
do {
be32enc(&edata[19], nonce);
argon2d_crds_hash( hash, edata );
argon2d250_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
@@ -64,18 +62,16 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_argon2d_crds_algo( algo_gate_t* gate )
bool register_argon2d250_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->scanhash = (void*)&scanhash_argon2d250;
gate->hash = (void*)&argon2d250_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
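A minimal usage sketch of the renamed entry point, assuming only the signature shown above and the INPUT_BYTES/OUTPUT_BYTES constants (80-byte header in, 32-byte hash out); the header contents here are placeholders:

    #include <stdint.h>
    #include <string.h>

    void argon2d250_hash( void *output, const void *input );  // declared in argon2d-gate.h

    void argon2d250_example( void )
    {
        uint32_t header[20];   // 80-byte block header (INPUT_BYTES)
        uint32_t hash[8];      // 256-bit result (OUTPUT_BYTES)
        memset( header, 0, sizeof header );
        argon2d250_hash( hash, header );
    }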
// Dynamic
void argon2d_dyn_hash( void *output, const void *input )
void argon2d500_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -101,7 +97,7 @@ void argon2d_dyn_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -118,7 +114,7 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
argon2d_dyn_hash( hash, edata );
argon2d500_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
{
@@ -133,17 +129,15 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_argon2d_dyn_algo( algo_gate_t* gate )
bool register_argon2d500_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->scanhash = (void*)&scanhash_argon2d500;
gate->hash = (void*)&argon2d500_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
// Unitus
int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{

View File

@@ -5,19 +5,19 @@
#include <stdint.h>
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d_crds_algo( algo_gate_t* gate );
bool register_argon2d250_algo( algo_gate_t* gate );
void argon2d_crds_hash( void *state, const void *input );
void argon2d250_hash( void *state, const void *input );
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d_dyn_algo( algo_gate_t* gate );
bool register_argon2d500_algo( algo_gate_t* gate );
void argon2d_dyn_hash( void *state, const void *input );
void argon2d500_hash( void *state, const void *input );
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -6,15 +6,15 @@
#if defined (BLAKE_4WAY)
blake256r14_4way_context blake_4w_ctx;
blake256r14_4x32_context blake_4w_ctx;
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
blake256r14_4x32_context ctx;
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
blake256r14_4x32_update( &ctx, input + (64<<2), 16 );
blake256r14_4x32_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -35,8 +35,8 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
blake256r14_4x32_init( &blake_4w_ctx );
blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
#if defined(BLAKE_8WAY)
blake256r14_8way_context blake_8w_ctx;
blake256r14_8x32_context blake_8w_ctx;
void blakehash_8way( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r14_8way_context ctx;
blake256r14_8x32_context ctx;
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
blake256r14_8way( &ctx, input + (64<<3), 16 );
blake256r14_8way_close( &ctx, vhash );
blake256r14_8x32( &ctx, input + (64<<3), 16 );
blake256r14_8x32_close( &ctx, vhash );
_dintrlv_8x32( state, state+ 32, state+ 64, state+ 96,
state+128, state+160, state+192, state+224,
vhash, 256 );
@@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256r14_8way_init( &blake_8w_ctx );
blake256r14_8way( &blake_8w_ctx, vdata, 64 );
blake256r14_8x32_init( &blake_8w_ctx );
blake256r14_8x32( &blake_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,

View File

@@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
(state)->T1 = T1; \
} while (0)
#if defined(__SSSE3__)
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
}
#else // SSE2
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
M0 = v128_bswap32( buf[0] ); \
@@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
MF = v128_bswap32( buf[15] ); \
}
#endif // SSSE3 else SSE2
#define COMPRESS32_4X32( rounds ) \
{ \
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
ROUND_S_4X32_3;
}
#if defined(__SSSE3__)
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
#else
H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
@@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
#endif
}
#if defined (__AVX2__)
@@ -1291,24 +1244,22 @@ do { \
VD = v256_32( T0 ^ 0x299F31D0 ); \
VE = v256_32( T1 ^ 0x082EFA98 ); \
VF = v256_32( T1 ^ 0xEC4E6C89 ); \
const __m256i shuf_bswap32 = mm256_set2_64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+ 1) ); \
M2 = mm256_bswap_32( *(buf+ 2) ); \
M3 = mm256_bswap_32( *(buf+ 3) ); \
M4 = mm256_bswap_32( *(buf+ 4) ); \
M5 = mm256_bswap_32( *(buf+ 5) ); \
M6 = mm256_bswap_32( *(buf+ 6) ); \
M7 = mm256_bswap_32( *(buf+ 7) ); \
M8 = mm256_bswap_32( *(buf+ 8) ); \
M9 = mm256_bswap_32( *(buf+ 9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
@@ -1401,7 +1352,7 @@ do { \
H7 = mm256_xor3( VF, V7, H7 ); \
}
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m256i *M = (__m256i*)data;
@@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
_mm256_xor_si256( v256_32( CSE ), M[15] ) );
}
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m256i *H = (__m256i*)final_hash;
@@ -1596,17 +1547,14 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
ROUND256_8WAY_3;
}
const __m256i shuf_bswap32 =
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) );
H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) );
H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) );
H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) );
H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) );
H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) );
H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) );
H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) );
}
#endif
@@ -1933,8 +1881,6 @@ do { \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -1951,22 +1897,22 @@ do { \
VD = v512_32( T0 ^ 0x299F31D0 ); \
VE = v512_32( T1 ^ 0x082EFA98 ); \
VF = v512_32( T1 ^ 0xEC4E6C89 ); \
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm512_bswap_32( * buf ); \
M1 = mm512_bswap_32( *(buf+ 1) ); \
M2 = mm512_bswap_32( *(buf+ 2) ); \
M3 = mm512_bswap_32( *(buf+ 3) ); \
M4 = mm512_bswap_32( *(buf+ 4) ); \
M5 = mm512_bswap_32( *(buf+ 5) ); \
M6 = mm512_bswap_32( *(buf+ 6) ); \
M7 = mm512_bswap_32( *(buf+ 7) ); \
M8 = mm512_bswap_32( *(buf+ 8) ); \
M9 = mm512_bswap_32( *(buf+ 9) ); \
MA = mm512_bswap_32( *(buf+10) ); \
MB = mm512_bswap_32( *(buf+11) ); \
MC = mm512_bswap_32( *(buf+12) ); \
MD = mm512_bswap_32( *(buf+13) ); \
ME = mm512_bswap_32( *(buf+14) ); \
MF = mm512_bswap_32( *(buf+15) ); \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
@@ -2063,7 +2009,7 @@ do { \
// is constant for every nonce and only needs to be run once per job. The
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m512i *M = (__m512i*)data;
@@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
}
// Default is 14 rounds, blakecoin & vanilla are 8.
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m512i *H = (__m512i*)final_hash;
@@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
}
// Byte swap final hash
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) );
H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) );
H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) );
H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) );
H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) );
H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) );
H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) );
H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) );
}
#endif
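A hedged sketch of the two-phase pattern the comment above describes for the 16-way code: the round-0 prehash runs once per job, the final rounds run once per batch of nonces. Array sizes and the use of word 3 for the nonces are assumptions based on the blakecoin scanhash diff later in this compare; the prototypes are copied from the blake256 header diffed below:

    #include <immintrin.h>

    // Declarations as they appear in the blake256 header.
    void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
                                           void *data );
    void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
                                         const void *midhash, const void *data,
                                         const int rounds );

    // Sketch only.
    void blakecoin_batch_sketch( __m512i *midstate,         // 16 midstate vectors
                                 __m512i *block,            // 16-word 2nd block, LE
                                 const __m512i *first_hash, // hash of 1st block
                                 __m512i *hash,             // 8-vector output
                                 __m512i nonces )
    {
        // Once per job: everything that does not depend on the nonce.
        blake256_16x32_round0_prehash_le( midstate, first_hash, block );
        // Per batch of 16 nonces: only the nonce word changes, midstate is reused.
        block[3] = nonces;
        blake256_16x32_final_rounds_le( hash, midstate, first_hash, block, 14 );
    }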
// Blake-256 4 way
static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
static void
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
int rounds )
{
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
@@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,
// Blake-256 8 way
static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static void
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
@@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
}
static void
blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
}
static void
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
}
static void
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
}
static void
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close_le( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2622,8 +2563,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
//Blake-256 16 way AVX512
static void
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
@@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
}
static void
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
sc->ptr = ptr;
}
static void
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
}
static void
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
}
static void
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
}
void
blake256_16way_init(void *cc)
blake256_16x32_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}
void
blake256_16way_update(void *cc, const void *data, size_t len)
blake256_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256_16way_close(void *cc, void *dst)
blake256_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
void
blake256_16way_update_le(void *cc, const void *data, size_t len)
blake256_16x32_update_le(void *cc, const void *data, size_t len)
{
blake32_16way_le(cc, data, len);
}
void
blake256_16way_close_le(void *cc, void *dst)
blake256_16x32_close_le(void *cc, void *dst)
{
blake32_16way_close_le(cc, 0, 0, dst, 8);
}
void blake256r14_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}
void
blake256r14_16way_update(void *cc, const void *data, size_t len)
blake256r14_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256r14_16way_close(void *cc, void *dst)
blake256r14_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
void blake256r8_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_16way_init( cc, IV256, 8 );
}
void
blake256r8_16way_update(void *cc, const void *data, size_t len)
blake256r8_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256r8_16way_close(void *cc, void *dst)
blake256r8_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
@@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst)
void
blake256_4x32_init(void *ctx)
{
blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( ctx, IV256, 14 );
}
void
@@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst)
// Blake-256 8 way
void
blake256_8way_init(void *cc)
blake256_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}
void
blake256_8way_update(void *cc, const void *data, size_t len)
blake256_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256_8way_close(void *cc, void *dst)
blake256_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
void
blake256_8way_update_le(void *cc, const void *data, size_t len)
blake256_8x32_update_le(void *cc, const void *data, size_t len)
{
blake32_8way_le(cc, data, len);
}
void
blake256_8way_close_le(void *cc, void *dst)
blake256_8x32_close_le(void *cc, void *dst)
{
blake32_8way_close_le(cc, 0, 0, dst, 8);
}
@@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst)
// 14 rounds Blake, Decred
void blake256r14_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( cc, IV256, 14 );
}
void
@@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst)
#if defined(__AVX2__)
void blake256r14_8way_init(void *cc)
void blake256r14_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}
void
blake256r14_8way_update(void *cc, const void *data, size_t len)
blake256r14_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r14_8way_close(void *cc, void *dst)
blake256r14_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
@@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst)
// 8 rounds Blakecoin, Vanilla
void blake256r8_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
blake32_4x32_init( cc, IV256, 8 );
}
void
@@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst)
#if defined (__AVX2__)
void blake256r8_8way_init(void *cc)
void blake256r8_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_8way_init( cc, IV256, 8 );
}
void
blake256r8_8way_update(void *cc, const void *data, size_t len)
blake256r8_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r8_8way_close(void *cc, void *dst)
blake256r8_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}

View File

@@ -29,13 +29,6 @@ typedef struct
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds );
/*
void blake256_init( blake256_context *sc );
void blake256_update( blake256_context *sc, const void *data, size_t len );
void blake256_close( blake256_context *sc, void *dst );
void blake256_full( blake256_context *sc, void *dst, const void *data,
size_t len );
*/
//////////////////////////////////
//
@@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context;
void blake256_4x32_init(void *ctx);
void blake256_4x32_update(void *ctx, const void *data, size_t len);
void blake256_4x32_close(void *ctx, void *dst);
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds
typedef blake_4x32_small_context blake256r14_4x32_context;
@@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc);
void blake256r8_4x32_update(void *cc, const void *data, size_t len);
void blake256r8_4x32_close(void *cc, void *dst);
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
#define blake_4way_small_context blake256_4x32_context
#define blake256_4way_context blake256_4x32_context
#define blake256_4way_init blake256_4x32_init
#define blake256_4way_update blake256_4x32_update
#define blake256_4way_close blake256_4x32_close
#define blake256_4way_update_le blake256_4x32_update_le
#define blake256_4way_close_le blake256_4x32_close_le
#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le
#define blake256_4way_final_rounds_le blake256_4x32_final_rounds_le
#define blake256r14_4way_context blake256r14_4x32_context
#define blake256r14_4way_init blake256r14_4x32_init
#define blake256r14_4way_update blake256r14_4x32_update
#define blake256r14_4way_close blake256r14_4x32_close
#define blake256r8_4way_context blake256r14_4x32_context
#define blake256r8_4way_init blake256r14_4x32_init
#define blake256r8_4way_update blake256r14_4x32_update
#define blake256r8_4way_close blake256r14_4x32_close
#ifdef __AVX2__
//////////////////////////////
@@ -107,45 +81,28 @@ typedef struct
} blake_8way_small_context;
// Default 14 rounds
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
typedef blake_8way_small_context blake256_8x32_context;
void blake256_8x32_init(void *cc);
void blake256_8x32_update(void *cc, const void *data, size_t len);
void blake256_8x32_close(void *cc, void *dst);
void blake256_8x32_update_le(void *cc, const void *data, size_t len);
void blake256_8x32_close_le(void *cc, void *dst);
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
void blake256r14_8way_init(void *cc);
void blake256r14_8way_update(void *cc, const void *data, size_t len);
void blake256r14_8way_close(void *cc, void *dst);
typedef blake_8way_small_context blake256r14_8x32_context;
void blake256r14_8x32_init(void *cc);
void blake256r14_8x32_update(void *cc, const void *data, size_t len);
void blake256r14_8x32_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);
#define blake_8x32_small_context blake256_8way_context
#define blake_8x32_init blake256_8way_init
#define blake_8x32_update blake256_8way_update
#define blake_8x32_close blake256_8way_close
#define blake_8x32_update_le blake256_8way_update_le
#define blake_8x32_close_le blake256_8way_close_le
#define blake_8x32_round0_prehash_le blake256_8way_round0_prehash
#define blake_8x32_final_rounds_le blake256_8way_final_rounds_le
#define blake256r14_8x32_context blake256r14_8way_context
#define blake256r14_8x32_init blake256r14_8way_init
#define blake256r14_8x32_update blake256r14_8way_update
#define blake256r14_8x32_close blake256r14_8way_close
#define blake256r8_8x32_context blake256r14_8way_context
#define blake256r8_8x32_init blake256r14_8way_init
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close
typedef blake_8way_small_context blake256r8_8x32_context;
void blake256r8_8x32_init(void *cc);
void blake256r8_8x32_update(void *cc, const void *data, size_t len);
void blake256r8_8x32_close(void *cc, void *dst);
#if defined(SIMD512)
@@ -163,46 +120,29 @@ typedef struct
} blake_16way_small_context __attribute__ ((aligned (128)));
// Default 14 rounds
typedef blake_16way_small_context blake256_16way_context;
void blake256_16way_init(void *cc);
void blake256_16way_update(void *cc, const void *data, size_t len);
void blake256_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256_16x32_context;
void blake256_16x32_init(void *cc);
void blake256_16x32_update(void *cc, const void *data, size_t len);
void blake256_16x32_close(void *cc, void *dst);
// Expects data in little endian order, no byte swap needed
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_update_le(void *cc, const void *data, size_t len);
void blake256_16x32_close_le(void *cc, void *dst);
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
void blake256r14_16way_init(void *cc);
void blake256r14_16way_update(void *cc, const void *data, size_t len);
void blake256r14_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256r14_16x32_context;
void blake256r14_16x32_init(void *cc);
void blake256r14_16x32_update(void *cc, const void *data, size_t len);
void blake256r14_16x32_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_16way_small_context blake256r8_16way_context;
void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);
#define blake_16x32_small_context blake256_16way_context
#define blake_16x32_init blake256_16way_init
#define blake_16x32_update blake256_16way_update
#define blake_16x32_close blake256_16way_close
#define blake_16x32_update_le blake256_16way_update_le
#define blake_16x32_close_le blake256_16way_close_le
#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash
#define blake_16x32_final_rounds_le blake256_16way_final_rounds_le
#define blake256r14_16x32_context blake256r14_16way_context
#define blake256r14_16x32_init blake256r14_16way_init
#define blake256r14_16x32_update blake256r14_16way_update
#define blake256r14_16x32_close blake256r14_16way_close
#define blake256r8_16x32_context blake256r8_16way_context
#define blake256r8_16x32_init blake256r8_16way_init
#define blake256r8_16x32_update blake256r8_16way_update
#define blake256r8_16x32_close blake256r8_16way_close
typedef blake_16way_small_context blake256r8_16x32_context;
void blake256r8_16x32_init(void *cc);
void blake256r8_16x32_update(void *cc, const void *data, size_t len);
void blake256r8_16x32_close(void *cc, void *dst);
#endif // AVX512
#endif // AVX2

View File

@@ -14,7 +14,6 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
#if defined(SIMD512)
typedef struct ALIGN( 64 ) {
@@ -30,11 +29,6 @@ void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final
#endif
#if defined(__AVX2__)
@@ -53,11 +47,6 @@ void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final
#endif
#endif
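With the blake2b_8way_* aliases removed, callers now use the 8x64 names directly, as the scanhash diff below shows. A minimal sketch of the sequence, assuming the context type and declarations from the blake2b header diffed above; the 80-byte length matches the scanhash usage:

    // Sketch only; blake2b_8x64_ctx and the function declarations come
    // from the blake2b header diffed above.
    void blake2b_8x64_hash_sketch( void *hash_out, const void *vdata )
    {
        blake2b_8x64_ctx ctx;
        blake2b_8x64_init( &ctx );
        blake2b_8x64_update( &ctx, vdata, 80 );   // 80-byte interleaved headers
        blake2b_8x64_final( &ctx, hash_out );
    }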

View File

@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
blake2b_8x64_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
blake2b_8x64_init( &ctx );
blake2b_8x64_update( &ctx, vdata, 80 );
blake2b_8x64_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
@@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
blake2b_4x64_ctx ctx;
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, input, 80 );
blake2b_4x64_final( &ctx, output );
}
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
@@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
blake2b_4x64_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, vdata, 80 );
blake2b_4x64_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )

View File

@@ -61,6 +61,11 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
const void *input, uint64_t inlen );
#define blake2s_4x32_state blake2s_4way_state
#define blake2s_4x32_init blake2s_4way_init
#define blake2s_4x32_update blake2s_4way_update
#define blake2s_4x32_final blake2s_4way_final
#define blake2s_4x32_full_blocks blake2s_4way_full_blocks
#if defined(__AVX2__)
@@ -81,6 +86,12 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen );
#define blake2s_8x32_state blake2s_8way_state
#define blake2s_8x32_init blake2s_8way_init
#define blake2s_8x32_update blake2s_8way_update
#define blake2s_8x32_final blake2s_8way_final
#define blake2s_8x32_full_blocks blake2s_8way_full_blocks
#endif
#if defined(SIMD512)
@@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
uint64_t inlen );
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
#define blake2s_16x32_state blake2s_16way_state
#define blake2s_16x32_init blake2s_16way_init
#define blake2s_16x32_update blake2s_16way_update
#define blake2s_16x32_final blake2s_16way_final
#endif
#if 0

View File

@@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
VD = v512_64( CB5 ^ T0 ); \
VE = v512_64( CB6 ^ T1 ); \
VF = v512_64( CB7 ^ T1 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm512_bswap_64( *(buf+ 0) ); \
M1 = mm512_bswap_64( *(buf+ 1) ); \
M2 = mm512_bswap_64( *(buf+ 2) ); \
M3 = mm512_bswap_64( *(buf+ 3) ); \
M4 = mm512_bswap_64( *(buf+ 4) ); \
M5 = mm512_bswap_64( *(buf+ 5) ); \
M6 = mm512_bswap_64( *(buf+ 6) ); \
M7 = mm512_bswap_64( *(buf+ 7) ); \
M8 = mm512_bswap_64( *(buf+ 8) ); \
M9 = mm512_bswap_64( *(buf+ 9) ); \
MA = mm512_bswap_64( *(buf+10) ); \
MB = mm512_bswap_64( *(buf+11) ); \
MC = mm512_bswap_64( *(buf+12) ); \
MD = mm512_bswap_64( *(buf+13) ); \
ME = mm512_bswap_64( *(buf+14) ); \
MF = mm512_bswap_64( *(buf+15) ); \
ROUND_B_8WAY(0); \
ROUND_B_8WAY(1); \
ROUND_B_8WAY(2); \
@@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
H7 = mm512_xor3( VF, V7, H7 ); \
}
void blake512_8way_compress( blake_8way_big_context *sc )
void blake512_8x64_compress( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc )
VE = v512_64( CB6 ^ sc->T1 );
VF = v512_64( CB7 ^ sc->T1 );
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm512_bswap_64( sc->buf[ 0] );
M1 = mm512_bswap_64( sc->buf[ 1] );
M2 = mm512_bswap_64( sc->buf[ 2] );
M3 = mm512_bswap_64( sc->buf[ 3] );
M4 = mm512_bswap_64( sc->buf[ 4] );
M5 = mm512_bswap_64( sc->buf[ 5] );
M6 = mm512_bswap_64( sc->buf[ 6] );
M7 = mm512_bswap_64( sc->buf[ 7] );
M8 = mm512_bswap_64( sc->buf[ 8] );
M9 = mm512_bswap_64( sc->buf[ 9] );
MA = mm512_bswap_64( sc->buf[10] );
MB = mm512_bswap_64( sc->buf[11] );
MC = mm512_bswap_64( sc->buf[12] );
MD = mm512_bswap_64( sc->buf[13] );
ME = mm512_bswap_64( sc->buf[14] );
MF = mm512_bswap_64( sc->buf[15] );
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
@@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
}
// won't be used after prehash implemented
void blake512_8way_compress_le( blake_8x64_big_context *sc )
void blake512_8x64_compress_le( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
blake512_8x64_compress( sc );
sc->ptr = 0;
}
@@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
blake512_8x64_compress( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );
sc->ptr = 0;
}
@@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst)
VD = v256_64( CB5 ^ T0 ); \
VE = v256_64( CB6 ^ T1 ); \
VF = v256_64( CB7 ^ T1 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm256_bswap_64( *(buf+ 0) ); \
M1 = mm256_bswap_64( *(buf+ 1) ); \
M2 = mm256_bswap_64( *(buf+ 2) ); \
M3 = mm256_bswap_64( *(buf+ 3) ); \
M4 = mm256_bswap_64( *(buf+ 4) ); \
M5 = mm256_bswap_64( *(buf+ 5) ); \
M6 = mm256_bswap_64( *(buf+ 6) ); \
M7 = mm256_bswap_64( *(buf+ 7) ); \
M8 = mm256_bswap_64( *(buf+ 8) ); \
M9 = mm256_bswap_64( *(buf+ 9) ); \
MA = mm256_bswap_64( *(buf+10) ); \
MB = mm256_bswap_64( *(buf+11) ); \
MC = mm256_bswap_64( *(buf+12) ); \
MD = mm256_bswap_64( *(buf+13) ); \
ME = mm256_bswap_64( *(buf+14) ); \
MF = mm256_bswap_64( *(buf+15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst)
}
void blake512_4way_compress( blake_4x64_big_context *sc )
void blake512_4x64_compress( blake_4x64_big_context *sc )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
VD = v256_64( CB5 ^ sc->T0 );
VE = v256_64( CB6 ^ sc->T1 );
VF = v256_64( CB7 ^ sc->T1 );
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm256_bswap_64( sc->buf[ 0] );
M1 = mm256_bswap_64( sc->buf[ 1] );
M2 = mm256_bswap_64( sc->buf[ 2] );
M3 = mm256_bswap_64( sc->buf[ 3] );
M4 = mm256_bswap_64( sc->buf[ 4] );
M5 = mm256_bswap_64( sc->buf[ 5] );
M6 = mm256_bswap_64( sc->buf[ 6] );
M7 = mm256_bswap_64( sc->buf[ 7] );
M8 = mm256_bswap_64( sc->buf[ 8] );
M9 = mm256_bswap_64( sc->buf[ 9] );
MA = mm256_bswap_64( sc->buf[10] );
MB = mm256_bswap_64( sc->buf[11] );
MC = mm256_bswap_64( sc->buf[12] );
MD = mm256_bswap_64( sc->buf[13] );
ME = mm256_bswap_64( sc->buf[14] );
MF = mm256_bswap_64( sc->buf[15] );
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
@@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}
void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate,
const void *data )
{
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
@@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
midstate[15] = VF;
}
void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
@@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
}
void blake512_4x64_init( blake_4x64_big_context *sc )
void blake512_4x64_init( blake512_4x64_context *sc )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
@@ -1798,7 +1789,7 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
}
// init, update & close
void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
void blake512_4x64_full( blake512_4x64_context *sc, void * dst,
const void *data, size_t len )
{
@@ -1824,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
blake512_4x64_compress( sc );
sc->ptr = 0;
}
@@ -1859,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
blake512_4x64_compress( sc );
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
@@ -1934,29 +1925,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
VE = v128_64( CB6 ^ sc->T1 );
VF = v128_64( CB7 ^ sc->T1 );
#if defined(__SSSE3__)
const v128u64_t shuf_bswap64 = v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 );
M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 );
M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 );
M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 );
M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 );
M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 );
M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 );
M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 );
M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 );
M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 );
M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 );
MA = v128_shuffle8( sc->buf[10], shuf_bswap64 );
MB = v128_shuffle8( sc->buf[11], shuf_bswap64 );
MC = v128_shuffle8( sc->buf[12], shuf_bswap64 );
MD = v128_shuffle8( sc->buf[13], shuf_bswap64 );
ME = v128_shuffle8( sc->buf[14], shuf_bswap64 );
MF = v128_shuffle8( sc->buf[15], shuf_bswap64 );
#else // SSE2 & NEON
M0 = v128_bswap64( sc->buf[ 0] );
M1 = v128_bswap64( sc->buf[ 1] );
M2 = v128_bswap64( sc->buf[ 2] );
@@ -1974,8 +1942,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
ME = v128_bswap64( sc->buf[14] );
MF = v128_bswap64( sc->buf[15] );
#endif
ROUND_B_2X64(0);
ROUND_B_2X64(1);
ROUND_B_2X64(2);

View File

@@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -148,16 +148,16 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
#elif defined (BLAKECOIN_4WAY)
blake256r8_4way_context blakecoin_4w_ctx;
blake256r8_4x32_context blakecoin_4w_ctx;
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
blake256r8_4x32_context ctx;
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
blake256r8_4x32_update( &ctx, input + (64<<2), 16 );
blake256r8_4x32_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -178,8 +178,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
blake256r8_4x32_init( &blakecoin_4w_ctx );
blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

View File

@@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
blake512_4way_context ctx;
blake512_4x64_context ctx;
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, input, 80 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, input, 80 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );

View File

@@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
v128_t *V = (v128_t*)v;
@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
V[3] = v128_swap64( V[3] ); \
V[2] = v128_shufll32( V[2] )
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
#undef BLAKE2S_ROUND
#else
#define G(r,i,a,b,c,d) \
@@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
b = SPH_ROTR32(b ^ c, 7); \
} while(0)
#define ROUND(r) \
#define BLAKE2S_ROUND(r) \
do { \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
#endif
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
for( size_t i = 0; i < 8; ++i )
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
#undef G
#undef ROUND
#undef BLAKE2S_ROUND
return 0;
}
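
The BLAKE2S_ROUND macro that both paths now share expands eight applications of the standard BLAKE2s G quarter-round. A scalar reference of G (function and variable names here are illustrative, not from the source):

#include <stdint.h>

static inline uint32_t rotr32( uint32_t x, unsigned r )
{   return ( x >> r ) | ( x << ( 32 - r ) );   }

/* one BLAKE2s G mixing step; mx and my are the two message words
   selected for this column/diagonal by the sigma schedule */
static inline void blake2s_g_ref( uint32_t *a, uint32_t *b, uint32_t *c,
                                  uint32_t *d, uint32_t mx, uint32_t my )
{
    *a += *b + mx;  *d = rotr32( *d ^ *a, 16 );
    *c += *d;       *b = rotr32( *b ^ *c, 12 );
    *a += *b + my;  *d = rotr32( *d ^ *a,  8 );
    *c += *d;       *b = rotr32( *b ^ *c,  7 );
}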

View File

@@ -39,16 +39,14 @@
#include <stddef.h>
#include "simd-utils.h"
#define SPH_SIZE_bmw256 256
#define SPH_SIZE_bmw512 512
// BMW-256 4 way 32
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct
{
v128_t buf[64];
v128_t H[16];
v128u32_t buf[64];
v128u32_t H[16];
size_t ptr;
uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
@@ -58,13 +56,19 @@ typedef bmw_4way_small_context bmw256_4way_context;
void bmw256_4way_init( bmw256_4way_context *ctx );
void bmw256_4way_update(void *cc, const void *data, size_t len);
#define bmw256_4way bmw256_4way_update
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define bmw256_4x32_context bmw256_4way_context
#define bmw256_4x32_init bmw256_4way_init
#define bmw256_4x32_update bmw256_4way_update
#define bmw256_4x32_close bmw256_4way_close
#endif
#if defined(__AVX2__)
// BMW-256 8 way 32
@@ -85,6 +89,11 @@ void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
#define bmw256_8way bmw256_8way_update
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#define bmw256_8x32_context bmw256_8way_context
#define bmw256_8x32_init bmw256_8way_init
#define bmw256_8x32_update bmw256_8way_update
#define bmw256_8x32_close bmw256_8way_close
#endif
#if defined(SIMD512)
@@ -106,6 +115,11 @@ void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
size_t len );
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
#define bmw256_16x32_context bmw256_16way_context
#define bmw256_16x32_init bmw256_16way_init
#define bmw256_16x32_update bmw256_16way_update
#define bmw256_16x32_close bmw256_16way_close
#endif
// BMW-512 2 way 64

View File

@@ -45,7 +45,7 @@ extern "C"{
#define LPAR (
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
// BMW-256 4 way 32
/*
@@ -284,9 +284,9 @@ static const uint32_t IV256[] = {
v128_xor( M[13], H[13] ) ) )
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
void compress_small( const v128u32_t *M, const v128u32_t H[16], v128u32_t dH[16] )
{
v128u64_t qt[32], xl, xh; \
v128u32_t qt[32], xl, xh; \
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
@@ -428,49 +428,25 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
static const v128u64_t final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
{ 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
{ 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
{ 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
{ 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
{ 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
{ 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
{ 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
{ 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
{ 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
{ 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = v128_64( 0x4041424340414243 );
ctx->H[ 1] = v128_64( 0x4445464744454647 );
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = v128_64( 0x5051525350515253 );
ctx->H[ 5] = v128_64( 0x5455565754555657 );
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = v128_64( 0x6061626360616263 );
ctx->H[ 9] = v128_64( 0x6465666764656667 );
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = v128_64( 0x7071727370717273 );
ctx->H[13] = v128_64( 0x7475767774757677 );
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
// sc->H[i] = v128_32( iv[i] );
ctx->H[ 0] = v128_32( 0x40414243 );
ctx->H[ 1] = v128_32( 0x44454647 );
ctx->H[ 2] = v128_32( 0x48494A4B );
ctx->H[ 3] = v128_32( 0x4C4D4E4F );
ctx->H[ 4] = v128_32( 0x50515253 );
ctx->H[ 5] = v128_32( 0x54555657 );
ctx->H[ 6] = v128_32( 0x58595A5B );
ctx->H[ 7] = v128_32( 0x5C5D5E5F );
ctx->H[ 8] = v128_32( 0x60616263 );
ctx->H[ 9] = v128_32( 0x64656667 );
ctx->H[10] = v128_32( 0x68696A6B );
ctx->H[11] = v128_32( 0x6C6D6E6F );
ctx->H[12] = v128_32( 0x70717273 );
ctx->H[13] = v128_32( 0x74757677 );
ctx->H[14] = v128_32( 0x78797A7B );
ctx->H[15] = v128_32( 0x7C7D7E7F );
ctx->ptr = 0;
ctx->bit_count = 0;
}
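
The new v128_32 constants are only a clearer spelling of the old ones: broadcasting the 32-bit IV word to four lanes produces exactly the same 128-bit pattern as duplicating it inside a 64-bit constant. A stand-alone check (illustrative only):

#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void)
{
    uint32_t lane32[4] = { 0x40414243, 0x40414243, 0x40414243, 0x40414243 };
    uint64_t lane64[2] = { 0x4041424340414243ULL, 0x4041424340414243ULL };
    assert( memcmp( lane32, lane64, sizeof lane32 ) == 0 );   /* identical 128-bit vector */
    return 0;
}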
@@ -478,10 +454,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
v128u32_t *vdata = (v128u32_t*)data;
v128u32_t *buf;
v128u32_t htmp[16];
v128u32_t *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
@@ -503,7 +479,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
ptr += clen;
if ( ptr == buf_size )
{
v128u64_t *ht;
v128u32_t *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
@@ -521,14 +497,14 @@ static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
v128u32_t *buf;
v128u32_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
buf[ ptr>>2 ] = v128_32( 0x00000080 );
ptr += 4;
h = sc->H;
@@ -548,7 +524,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (v128u64_t*)final_s, h1 );
compress_small( buf, (v128u32_t*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_v128( dst, u ) = h1[v];

View File

@@ -39,7 +39,7 @@ static void transform( cubehashParam *sp )
#elif defined(__AVX2__)
register __m256i x0, x1, x2, x3, y0, y1;
register __m256i x0, x1, x2, x3, t0;
x0 = _mm256_load_si256( (__m256i*)sp->x );
x1 = _mm256_load_si256( (__m256i*)sp->x + 1 );
@@ -50,10 +50,10 @@ static void transform( cubehashParam *sp )
{
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = mm256_rol_32( x1, 7 );
y1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( y0, x2 );
x1 = _mm256_xor_si256( y1, x3 );
t0 = mm256_rol_32( x1, 7 );
x1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( t0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap128_64( x2 );
x3 = mm256_swap128_64( x3 );
x2 = _mm256_add_epi32( x0, x2 );
@@ -75,7 +75,7 @@ static void transform( cubehashParam *sp )
#else // AVX, SSE2, NEON
v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
v128_t x0, x1, x2, x3, x4, x5, x6, x7, t0, t1;
x0 = casti_v128( sp->x, 0 );
x1 = casti_v128( sp->x, 1 );
@@ -92,16 +92,12 @@ static void transform( cubehashParam *sp )
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = v128_rol32( y0, 7 );
x1 = v128_rol32( y1, 7 );
x2 = v128_rol32( y2, 7 );
x3 = v128_rol32( y3, 7 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
t0 = v128_rol32( x2, 7 );
t1 = v128_rol32( x3, 7 );
x2 = v128_rol32( x0, 7 );
x3 = v128_rol32( x1, 7 );
x0 = v128_xor( t0, x4 );
x1 = v128_xor( t1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64( x4 );
@@ -112,19 +108,15 @@ static void transform( cubehashParam *sp )
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = v128_rol32( y0, 11 );
x1 = v128_rol32( y1, 11 );
x2 = v128_rol32( y2, 11 );
x3 = v128_rol32( y3, 11 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64_32( x4 );
t0 = v128_rol32( x1, 11 );
x1 = v128_rol32( x0, 11 );
t1 = v128_rol32( x3, 11 );
x3 = v128_rol32( x2, 11 );
x0 = v128_xor( t0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( t1, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64_32( x4 );
x5 = v128_swap64_32( x5 );
x6 = v128_swap64_32( x6 );
x7 = v128_swap64_32( x7 );
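
The rewrite above replaces the four named copies y0..y3 with two temporaries: when a pair of lanes is exchanged while both are rotated, one scratch value per pair is enough. A scalar sketch of that pattern (rotl32 and rotate_swap_pair are illustrative helpers, not miner code):

#include <stdint.h>

static inline uint32_t rotl32( uint32_t x, unsigned r )
{   return ( x << r ) | ( x >> ( 32 - r ) );   }

/* exchange *x0 and *x1 while rotating both, using a single temporary */
static void rotate_swap_pair( uint32_t *x0, uint32_t *x1, unsigned r )
{
    uint32_t t = rotl32( *x1, r );   /* rotated old x1              */
    *x1 = rotl32( *x0, r );          /* x1 now holds rotated old x0 */
    *x0 = t;                         /* x0 now holds rotated old x1 */
}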

View File

@@ -17,7 +17,7 @@ typedef struct {
#else
hashState_groestl groestl;
#endif
sha256_8way_context sha;
sha256_8x32_context sha;
} myrgr_8way_ctx_holder;
myrgr_8way_ctx_holder myrgr_8way_ctx;
@@ -29,7 +29,7 @@ void init_myrgr_8way_ctx()
#else
init_groestl( &myrgr_8way_ctx.groestl, 64 );
#endif
sha256_8way_init( &myrgr_8way_ctx.sha );
sha256_8x32_init( &myrgr_8way_ctx.sha );
}
void myriad_8way_hash( void *output, const void *input )
@@ -96,8 +96,8 @@ void myriad_8way_hash( void *output, const void *input )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
sha256_8way_update( &ctx.sha, vhash, 64 );
sha256_8way_close( &ctx.sha, output );
sha256_8x32_update( &ctx.sha, vhash, 64 );
sha256_8x32_close( &ctx.sha, output );
}
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
@@ -156,7 +156,7 @@ int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
typedef struct {
hashState_groestl groestl;
sha256_4way_context sha;
sha256_4x32_context sha;
} myrgr_4way_ctx_holder;
myrgr_4way_ctx_holder myrgr_4way_ctx;
@@ -164,7 +164,7 @@ myrgr_4way_ctx_holder myrgr_4way_ctx;
void init_myrgr_4way_ctx()
{
init_groestl (&myrgr_4way_ctx.groestl, 64 );
sha256_4way_init( &myrgr_4way_ctx.sha );
sha256_4x32_init( &myrgr_4way_ctx.sha );
}
void myriad_4way_hash( void *output, const void *input )
@@ -189,8 +189,8 @@ void myriad_4way_hash( void *output, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way_update( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, output );
sha256_4x32_update( &ctx.sha, vhash, 64 );
sha256_4x32_close( &ctx.sha, output );
}
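
The intrlv_*/dintrlv_* calls above repack per-lane hashes into the word-interleaved layout each vectored stage expects. A minimal two-lane model of that layout (illustrative helpers, not the real SIMD-width-aware routines):

#include <stdint.h>
#include <assert.h>

static void intrlv_2( uint32_t *v, const uint32_t *a, const uint32_t *b, int words )
{
    for ( int i = 0; i < words; i++ ) { v[2*i] = a[i]; v[2*i+1] = b[i]; }
}

static void dintrlv_2( uint32_t *a, uint32_t *b, const uint32_t *v, int words )
{
    for ( int i = 0; i < words; i++ ) { a[i] = v[2*i]; b[i] = v[2*i+1]; }
}

int main(void)
{
    uint32_t a[4] = { 1, 2, 3, 4 }, b[4] = { 5, 6, 7, 8 };
    uint32_t v[8], x[4], y[4];
    intrlv_2( v, a, b, 4 );      /* lanes side by side -> interleaved */
    dintrlv_2( x, y, v, 4 );     /* and back                          */
    assert( x[0] == 1 && y[3] == 8 );
    return 0;
}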
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,

View File

@@ -1059,7 +1059,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
WRITE_STATE_BIG8( sc );
}
void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
void hamsi_8way_big_final( hamsi512_8x64_context *sc, __m512i *buf )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
@@ -1071,7 +1071,7 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
WRITE_STATE_BIG8( sc );
}
void hamsi512_8way_init( hamsi_8way_big_context *sc )
void hamsi512_8x64_init( hamsi512_8x64_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -1087,7 +1087,7 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
sc->h[7] = v512_64( iv[7] );
}
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -1099,7 +1099,7 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
sc->partial_len = len;
}
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst )
{
__m512i pad[1];
uint32_t ch, cl;
@@ -1944,7 +1944,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst,
////////////
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
void hamsi_big( hamsi512_4x64_context *sc, __m256i *buf, size_t num )
{
DECL_STATE_BIG
uint32_t tmp;
@@ -1968,7 +1968,7 @@ void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
WRITE_STATE_BIG( sc );
}
void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
void hamsi_big_final( hamsi512_4x64_context *sc, __m256i *buf )
{
__m256i m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_BIG
@@ -1979,7 +1979,7 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
WRITE_STATE_BIG( sc );
}
void hamsi512_4way_init( hamsi_4way_big_context *sc )
void hamsi512_4x64_init( hamsi512_4x64_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -1994,7 +1994,7 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->h[7] = v256_64( iv[7] );
}
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
@@ -2006,7 +2006,7 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
sc->partial_len = len;
}
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst )
{
__m256i pad[1];
uint32_t ch, cl;

View File

@@ -72,17 +72,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
typedef hamsi_4way_big_context hamsi512_4x64_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
void hamsi512_4x64_init( hamsi512_4x64_context *sc );
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst );
#define hamsi512_4x64_context hamsi512_4way_context
#define hamsi512_4x64_init hamsi512_4way_init
#define hamsi512_4x64_update hamsi512_4way_update
#define hamsi512_4x64_close hamsi512_4way_close
#define hamsi512_4way_context hamsi512_4x64_context
#define hamsi512_4way_init hamsi512_4x64_init
#define hamsi512_4way_update hamsi512_4x64_update
#define hamsi512_4way_close hamsi512_4x64_close
// Hamsi-512 8x32
@@ -115,17 +115,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
typedef hamsi_8way_big_context hamsi512_8x64_context;
void hamsi512_8way_init( hamsi512_8way_context *sc );
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
void hamsi512_8x64_init( hamsi512_8x64_context *sc );
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst );
#define hamsi512_8x64_context hamsi512_8way_context
#define hamsi512_8x64_init hamsi512_8way_init
#define hamsi512_8x64_update hamsi512_8way_update
#define hamsi512_8x64_close hamsi512_8way_close
#define hamsi512_8way_context hamsi512_8x64_context
#define hamsi512_8way_init hamsi512_8x64_init
#define hamsi512_8way_update hamsi512_8x64_update
#define hamsi512_8way_close hamsi512_8x64_close
// Hamsi-512 16x32
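
The header change above flips which spelling is canonical: the lane-width names (hamsi512_4x64_*, hamsi512_8x64_*) now carry the declarations and the old *_4way/*_8way names survive only as aliases, so existing call sites keep compiling. A toy illustration of the pattern (names hypothetical):

typedef struct { int dummy; } hash512_4x64_context;      /* canonical type  */
#define hash512_4way_context  hash512_4x64_context       /* legacy alias    */

void hash512_4x64_init( hash512_4x64_context *sc );      /* canonical API   */
#define hash512_4way_init  hash512_4x64_init             /* legacy alias    */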

View File

@@ -82,12 +82,15 @@ typedef struct {
typedef haval_4way_context haval256_5_4way_context;
void haval256_5_4way_init( void *cc );
void haval256_5_4way_update( void *cc, const void *data, size_t len );
//#define haval256_5_4way haval256_5_4way_update
void haval256_5_4way_close( void *cc, void *dst );
#define haval256_4x32_context haval256_5_4way_context
#define haval256_4x32_init haval256_5_4way_init
#define haval256_4x32_update haval256_5_4way_update
#define haval256_4x32_close haval256_5_4way_close
#if defined(__AVX2__)
typedef struct {
@@ -100,11 +103,14 @@ typedef struct {
typedef haval_8way_context haval256_5_8way_context;
void haval256_5_8way_init( void *cc );
void haval256_5_8way_update( void *cc, const void *data, size_t len );
void haval256_5_8way_close( void *cc, void *dst );
#define haval256_8x32_context haval256_5_8way_context
#define haval256_8x32_init haval256_5_8way_init
#define haval256_8x32_update haval256_5_8way_update
#define haval256_8x32_close haval256_5_8way_close
#endif // AVX2
#if defined(SIMD512)
@@ -119,11 +125,14 @@ typedef struct {
typedef haval_16way_context haval256_5_16way_context;
void haval256_5_16way_init( void *cc );
void haval256_5_16way_update( void *cc, const void *data, size_t len );
void haval256_5_16way_close( void *cc, void *dst );
#define haval256_16x32_context haval256_5_16way_context
#define haval256_16x32_init haval256_5_16way_init
#define haval256_16x32_update haval256_5_16way_update
#define haval256_16x32_close haval256_5_16way_close
#endif // AVX512
#ifdef __cplusplus

View File

@@ -190,7 +190,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
memcpy_512( dst, kc->w, m512_len );
}
void keccak256_8way_init( void *kc )
void keccak256_8x64_init( void *kc )
{
keccak64_8way_init( kc, 256 );
}

View File

@@ -9,7 +9,7 @@
void sha3d_hash_8way(void *state, const void *input)
{
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
keccak256_8way_context ctx;
keccak256_8x64_context ctx;
keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, input, 80 );
@@ -69,7 +69,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
void sha3d_hash_4way(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4x64_context ctx;
keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, input, 80 );

View File

@@ -273,8 +273,6 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
uint32_t hash[8*4] __attribute((aligned(128)));
__m512i* chainv = state->chainv;
__m512i t[2];
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
/*---- blank round with m=0 ----*/
rnd512_4way( state, NULL );
@@ -289,10 +287,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
casti_m512i( b,0 ) = _mm512_shuffle_epi8(
casti_m512i( hash,0 ), shuff_bswap32 );
casti_m512i( b,1 ) = _mm512_shuffle_epi8(
casti_m512i( hash,1 ), shuff_bswap32 );
casti_m512i( b,0 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
casti_m512i( b,1 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
rnd512_4way( state, NULL );
@@ -306,10 +302,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
casti_m512i( b,2 ) = _mm512_shuffle_epi8(
casti_m512i( hash,0 ), shuff_bswap32 );
casti_m512i( b,3 ) = _mm512_shuffle_epi8(
casti_m512i( hash,1 ), shuff_bswap32 );
casti_m512i( b,2 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
casti_m512i( b,3 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
}
int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
@@ -349,16 +343,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
__m512i msg[2];
int i;
int blocks = (int)len >> 5;
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
state->rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}
@@ -367,7 +359,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[0] = mm512_bswap_32( vdata[0] );
buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
}
return 0;
@@ -434,16 +426,14 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}
@@ -451,7 +441,7 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
rnd512_4way( state, msg );
}
@@ -479,16 +469,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}
@@ -496,7 +484,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
rnd512_4way( state, msg );
}
@@ -775,8 +763,6 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
uint32 hash[8*2] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t0, t1;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_2way( state, NULL );
@@ -791,10 +777,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
rnd512_2way( state, NULL );
@@ -809,10 +793,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
@@ -847,15 +829,13 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
__m256i msg[2];
int i;
int blocks = (int)len >> 5;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state-> rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
@@ -864,7 +844,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[0] = mm256_bswap_32( vdata[0] );
buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
}
return 0;
@@ -916,16 +896,14 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
@@ -933,7 +911,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
rnd512_2way( state, msg );
}
@@ -961,16 +939,14 @@ int luffa_2way_update_close( luffa_2way_context *state,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
@@ -978,7 +954,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
rnd512_2way( state, msg );
}
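
The shuffle constant deleted throughout this file selects bytes 3,2,1,0 of every 32-bit lane, i.e. a per-lane byte swap; the mm256_bswap_32/mm512_bswap_32 helpers that replace it do the same work under a clearer name. A minimal stand-alone check of the old mask's effect (SSSE3 required, illustrative only):

#include <stdint.h>
#include <assert.h>
#include <immintrin.h>      /* build with -mssse3 or higher */

int main(void)
{
    const __m128i mask = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
    const __m128i v    = _mm_set_epi32( 0x44434241, 0x34333231, 0x24232221, 0x14131211 );
    uint32_t out[4];
    _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi8( v, mask ) );
    assert( out[0] == 0x11121314 );   /* every 32-bit lane is byte swapped */
    assert( out[3] == 0x41424344 );
    return 0;
}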

View File

@@ -26,9 +26,9 @@
#if defined (ALLIUM_16WAY)
typedef union {
keccak256_8way_context keccak;
keccak256_8x64_context keccak;
cube_4way_2buf_context cube;
skein256_8way_context skein;
skein256_8x64_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
#else
@@ -60,7 +60,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -70,12 +70,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashB, 32 );
keccak256_8way_close( &ctx.keccak, vhashB);
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhashA, 32 );
keccak256_8x64_close( &ctx.keccak, vhashA);
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhashB, 32 );
keccak256_8x64_close( &ctx.keccak, vhashB);
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -153,12 +153,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashA, 32 );
skein256_8way_close( &ctx.skein, vhashA );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashB, 32 );
skein256_8way_close( &ctx.skein, vhashB );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhashA, 32 );
skein256_8x64_close( &ctx.skein, vhashA );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhashB, 32 );
skein256_8x64_close( &ctx.skein, vhashB );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -251,7 +251,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -273,9 +273,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
#elif defined (ALLIUM_8WAY)
typedef union {
keccak256_4way_context keccak;
keccak256_4x64_context keccak;
cube_2way_context cube;
skein256_4way_context skein;
skein256_4x64_context skein;
#if defined(__VAES__)
groestl256_2way_context groestl;
#else
@@ -298,19 +298,19 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
blake256_8x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
keccak256_4way_close( &ctx.keccak, vhashA );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashB, 32 );
keccak256_4way_close( &ctx.keccak, vhashB );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhashA, 32 );
keccak256_4x64_close( &ctx.keccak, vhashA );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhashB, 32 );
keccak256_4x64_close( &ctx.keccak, vhashB );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -350,12 +350,12 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashA, 32 );
skein256_4way_close( &ctx.skein, vhashA );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashB, 32 );
skein256_4way_close( &ctx.skein, vhashB );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhashA, 32 );
skein256_4x64_close( &ctx.skein, vhashA );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhashB, 32 );
skein256_4x64_close( &ctx.skein, vhashB );
#if defined(__VAES__)
@@ -433,7 +433,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
n+ 3, n+ 2, n+ 1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -483,7 +483,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
uint64_t *hash3 = (uint64_t*)hash+12;
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
blake256_4x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );
intrlv_2x64( vhashA, hash0, hash1, 256 );
@@ -588,7 +588,7 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );
// Partialy prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -616,7 +616,6 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
//
// 1 way
typedef struct
{
blake256_context blake;

View File

@@ -14,12 +14,12 @@ bool lyra2h_4way_thread_init()
return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2h_4way_blake_mid;
static __thread blake256_4x32_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
blake256_4x32_init( &l2h_4way_blake_mid );
blake256_4x32_update( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
@@ -29,11 +29,11 @@ void lyra2h_4way_hash( void *state, const void *input )
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
blake256_4x32_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
blake256_4x32_update( &ctx_blake, input + (64*4), 16 );
blake256_4x32_close( &ctx_blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

View File

@@ -7,25 +7,24 @@
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV2_16WAY)
typedef struct {
blake256_16way_context blake;
keccak256_8way_context keccak;
blake256_16x32_context blake;
keccak256_8x64_context keccak;
cubehashParam cube;
skein256_8way_context skein;
bmw256_16way_context bmw;
skein256_8x64_context skein;
bmw256_16x32_context bmw;
} lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_16way_ctx_holder l2v2_16way_ctx;
bool init_lyra2rev2_16way_ctx()
{
keccak256_8way_init( &l2v2_16way_ctx.keccak );
keccak256_8x64_init( &l2v2_16way_ctx.keccak );
cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_16way_ctx.skein );
bmw256_16way_init( &l2v2_16way_ctx.bmw );
skein256_8x64_init( &l2v2_16way_ctx.skein );
bmw256_16x32_init( &l2v2_16way_ctx.bmw );
return true;
}
@@ -51,8 +50,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16x32_update( &ctx.blake, input + (64<<4), 16 );
blake256_16x32_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7,
@@ -62,17 +61,17 @@ void lyra2rev2_16way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
keccak256_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
keccak256_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
@@ -122,21 +121,20 @@ void lyra2rev2_16way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
skein256_8x64_update( &ctx.skein, vhash, 32 );
skein256_8x64_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12,
hash13, hash14, hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhash, 32 );
skein256_8x64_close( &ctx.skein, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
@@ -160,8 +158,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
bmw256_16x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
@@ -186,8 +184,8 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
blake256_16way_init( &l2v2_16way_ctx.blake );
blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
blake256_16x32_init( &l2v2_16way_ctx.blake );
blake256_16x32_update( &l2v2_16way_ctx.blake, vdata, 64 );
do
{
@@ -214,21 +212,21 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_4way_context keccak;
blake256_8x32_context blake;
keccak256_4x64_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_8way_context bmw;
skein256_4x64_context skein;
bmw256_8x32_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
bool init_lyra2rev2_8way_ctx()
{
keccak256_4way_init( &l2v2_8way_ctx.keccak );
keccak256_4x64_init( &l2v2_8way_ctx.keccak );
cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_8way_ctx.skein );
bmw256_8way_init( &l2v2_8way_ctx.bmw );
skein256_4x64_init( &l2v2_8way_ctx.skein );
bmw256_8x32_init( &l2v2_8way_ctx.bmw );
return true;
}
@@ -246,20 +244,20 @@ void lyra2rev2_8way_hash( void *state, const void *input )
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
blake256_8x32_update( &ctx.blake, input + (64<<3), 16 );
blake256_8x32_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -282,13 +280,13 @@ void lyra2rev2_8way_hash( void *state, const void *input )
LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
skein256_4x64_update( &ctx.skein, vhash, 32 );
skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhash, 32 );
skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -303,8 +301,8 @@ void lyra2rev2_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
bmw256_8x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
@@ -328,8 +326,8 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v2_8way_ctx.blake );
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
blake256_8x32_init( &l2v2_8way_ctx.blake );
blake256_8x32_update( &l2v2_8way_ctx.blake, vdata, 64 );
do
{
@@ -356,21 +354,21 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_4WAY)
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
blake256_4x32_context blake;
keccak256_4x64_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_4way_context bmw;
skein256_4x64_context skein;
bmw256_4x32_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
bool init_lyra2rev2_4way_ctx()
{
keccak256_4way_init( &l2v2_4way_ctx.keccak );
keccak256_4x64_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
bmw256_4way_init( &l2v2_4way_ctx.bmw );
skein256_4x64_init( &l2v2_4way_ctx.skein );
bmw256_4x32_init( &l2v2_4way_ctx.bmw );
return true;
}
@@ -385,13 +383,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
blake256_4x32_update( &ctx.blake, input + (64<<2), 16 );
blake256_4x32_close( &ctx.blake, vhash );
rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way_update( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
keccak256_4x64_update( &ctx.keccak, vhash64, 32 );
keccak256_4x64_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -410,8 +408,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way_update( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
skein256_4x64_update( &ctx.skein, vhash64, 32 );
skein256_4x64_close( &ctx.skein, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -426,8 +424,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
bmw256_4x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -451,8 +449,8 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
blake256_4x32_init( &l2v2_4way_ctx.blake );
blake256_4x32_update( &l2v2_4way_ctx.blake, vdata, 64 );
do
{

View File

@@ -9,18 +9,18 @@
#if defined (LYRA2REV3_16WAY)
typedef struct {
blake256_16way_context blake;
blake256_16x32_context blake;
cube_4way_context cube;
bmw256_16way_context bmw;
bmw256_16x32_context bmw;
} lyra2v3_16way_ctx_holder;
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
bool init_lyra2rev3_16way_ctx()
{
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16x32_init( &l2v3_16way_ctx.blake );
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
bmw256_16way_init( &l2v3_16way_ctx.bmw );
bmw256_16x32_init( &l2v3_16way_ctx.bmw );
return true;
}
@@ -46,8 +46,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64*16), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16x32_update( &ctx.blake, input + (64*16), 16 );
blake256_16x32_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -120,8 +120,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
bmw256_16x32_close( &ctx.bmw, state );
}
@@ -145,8 +145,8 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
blake256_16x32_init( &l2v3_16way_ctx.blake );
blake256_16x32_update( &l2v3_16way_ctx.blake, vdata, 64 );
do
{
@@ -178,18 +178,18 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV3_8WAY)
typedef struct {
blake256_8way_context blake;
blake256_8x32_context blake;
cubehashParam cube;
bmw256_8way_context bmw;
bmw256_8x32_context bmw;
} lyra2v3_8way_ctx_holder;
static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;
bool init_lyra2rev3_8way_ctx()
{
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8x32_init( &l2v3_8way_ctx.blake );
cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
bmw256_8way_init( &l2v3_8way_ctx.bmw );
bmw256_8x32_init( &l2v3_8way_ctx.bmw );
return true;
}
@@ -207,8 +207,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64*8), 16 );
blake256_8way_close( &ctx.blake, vhash );
blake256_8x32_update( &ctx.blake, input + (64*8), 16 );
blake256_8x32_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -243,8 +243,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
bmw256_8x32_close( &ctx.bmw, state );
}
@@ -269,8 +269,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
blake256_8x32_init( &l2v3_8way_ctx.blake );
blake256_8x32_update( &l2v3_8way_ctx.blake, vdata, 64 );
do
{
@@ -300,19 +300,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
#if defined (LYRA2REV3_4WAY)
typedef struct {
blake256_4way_context blake;
blake256_4x32_context blake;
cubehashParam cube;
bmw256_4way_context bmw;
bmw256_4x32_context bmw;
} lyra2v3_4way_ctx_holder;
//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;
bool init_lyra2rev3_4way_ctx()
{
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4x32_init( &l2v3_4way_ctx.blake );
cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
bmw256_4way_init( &l2v3_4way_ctx.bmw );
bmw256_4x32_init( &l2v3_4way_ctx.bmw );
return true;
}
@@ -326,8 +325,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
blake256_4way_update( &ctx.blake, input + (64*4), 16 );
blake256_4way_close( &ctx.blake, vhash );
blake256_4x32_update( &ctx.blake, input + (64*4), 16 );
blake256_4x32_close( &ctx.blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -349,8 +348,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
bmw256_4x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
@@ -374,8 +373,8 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
blake256_4x32_init( &l2v3_4way_ctx.blake );
blake256_4x32_update( &l2v3_4way_ctx.blake, vdata, 64 );
do
{

View File

@@ -45,7 +45,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -139,7 +139,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -180,7 +180,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_8x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -246,7 +246,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -279,12 +279,12 @@ bool lyra2z_4way_thread_init()
return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2z_4way_blake_mid;
static __thread blake256_4x32_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
blake256_4way_init( &l2z_4way_blake_mid );
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
blake256_4x32_init( &l2z_4way_blake_mid );
blake256_4x32_update( &l2z_4way_blake_mid, input, 64 );
}
void lyra2z_4way_hash( void *hash, const void *midstate_vars,
@@ -295,15 +295,8 @@ void lyra2z_4way_hash( void *hash, const void *midstate_vars,
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
// blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
/*
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
*/
blake256_4x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -357,7 +350,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );
// Partialy prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -454,11 +447,9 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;

View File

@@ -45,7 +45,7 @@ static const uint64_t blake2b_IV[8] =
#if defined(SIMD512)
#define G2W_4X64(a,b,c,d) \
#define G2W(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
c = _mm512_add_epi64( c, d ); \
@@ -56,27 +56,15 @@ static const uint64_t blake2b_IV[8] =
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
G2W( s0, s1, s2, s3 ); \
s0 = mm512_shufll256_64( s0 ); \
s3 = mm512_swap256_128( s3); \
s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shuflr256_64( s2 ); \
G2W_4X64( s0, s1, s2, s3 ); \
G2W( s0, s1, s2, s3 ); \
s0 = mm512_shuflr256_64( s0 ); \
s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shufll256_64( s2 );
/*
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
s3 = mm512_shufll256_64( s3 ); \
s1 = mm512_shuflr256_64( s1); \
s2 = mm512_swap256_128( s2 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s3 = mm512_shuflr256_64( s3 ); \
s1 = mm512_shufll256_64( s1 ); \
s2 = mm512_swap256_128( s2 );
*/
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -95,7 +83,7 @@ static const uint64_t blake2b_IV[8] =
#if defined(__AVX2__)
#define G_4X64(a,b,c,d) \
#define G_AVX2(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
@@ -107,27 +95,15 @@ static const uint64_t blake2b_IV[8] =
// Pivoting about s1 instead of s0 reduces latency.
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
G_AVX2( s0, s1, s2, s3 ); \
s0 = mm256_shufll_64( s0 ); \
s3 = mm256_swap_128( s3); \
s3 = mm256_swap_128( s3 ); \
s2 = mm256_shuflr_64( s2 ); \
G_4X64( s0, s1, s2, s3 ); \
G_AVX2( s0, s1, s2, s3 ); \
s0 = mm256_shuflr_64( s0 ); \
s3 = mm256_swap_128( s3 ); \
s2 = mm256_shufll_64( s2 );
/*
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s3 = mm256_shufll_64( s3 ); \
s1 = mm256_shuflr_64( s1); \
s2 = mm256_swap_128( s2 ); \
G_4X64( s0, s1, s2, s3 ); \
s3 = mm256_shuflr_64( s3 ); \
s1 = mm256_shufll_64( s1 ); \
s2 = mm256_swap_128( s2 );
*/
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -148,7 +124,7 @@ static const uint64_t blake2b_IV[8] =
// process 2 columns in parallel
// returns void, all args updated
#define G_2X64(a,b,c,d) \
#define G_128(a,b,c,d) \
a = v128_add64( a, b ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
@@ -161,16 +137,16 @@ static const uint64_t blake2b_IV[8] =
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
v128u64_t t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
G_128( s0, s2, s4, s6 ); \
G_128( s1, s3, s5, s7 ); \
t = v128_alignr64( s7, s6, 1 ); \
s6 = v128_alignr64( s6, s7, 1 ); \
s7 = t; \
t = v128_alignr64( s2, s3, 1 ); \
s2 = v128_alignr64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
G_128( s0, s2, s5, s6 ); \
G_128( s1, s3, s4, s7 ); \
t = v128_alignr64( s6, s7, 1 ); \
s6 = v128_alignr64( s7, s6, 1 ); \
s7 = t; \

View File

@@ -18,11 +18,14 @@ typedef struct {
} panama_4way_context __attribute__ ((aligned (64)));
void panama_4way_init( void *cc );
void panama_4way_update( void *cc, const void *data, size_t len );
void panama_4way_close( void *cc, void *dst );
#define panama_4x32_context panama_4way_context
#define panama_4x32_init panama_4way_init
#define panama_4x32_update panama_4way_update
#define panama_4x32_close panama_4way_close
#if defined(__AVX2__)
typedef struct {
@@ -34,10 +37,13 @@ typedef struct {
} panama_8way_context __attribute__ ((aligned (128)));
void panama_8way_init( void *cc );
void panama_8way_update( void *cc, const void *data, size_t len );
void panama_8way_close( void *cc, void *dst );
#define panama_8x32_context panama_8way_context
#define panama_8x32_init panama_8way_init
#define panama_8x32_update panama_8way_update
#define panama_8x32_close panama_8way_close
#endif
#endif

View File

@@ -31,20 +31,20 @@
union _hmq1725_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -81,7 +81,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
__m512i* vhB = (__m512i*)vhashB;
__m512i* vhC = (__m512i*)vhashC;
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
bmw512_8x64_full( &ctx.bmw, vhash, input, 80 );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -141,26 +141,26 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
if ( likely( vh_mask & 0xff ) )
skein512_8way_full( &ctx.skein, vhashB, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhashB, vhash, 64 );
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
if ( ( vh_mask & 0xff ) != 0xff )
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
blake512_8x64_full( &ctx.blake, vhashA, vhash, 64 );
// B
if ( vh_mask & 0xff )
bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhashB, vhash, 64 );
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -176,16 +176,16 @@ extern void hmq1725_8way_hash(void *state, const void *input)
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
{
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhashA );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhashA );
}
if ( likely( vh_mask & 0xff ) )
{
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhashB );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -251,9 +251,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
if ( likely( vh_mask & 0xff ) )
{
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhash, 64 );
haval256_8x32_close( &ctx.haval, vhash );
memset( &vhash[8<<3], 0, 32<<3 );
rintrlv_8x32_8x64( vhashB, vhash, 512 );
}
@@ -296,7 +296,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
#endif
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
blake512_8x64_full( &ctx.blake, vhash, vhash, 64 );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
@@ -351,9 +351,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -429,9 +429,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
rintrlv_8x64_8x32( vhashA, vhash, 512 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhashA, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhashA, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -474,9 +474,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
if ( likely( vh_mask & 0xff ) )
{
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhashB );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -509,9 +509,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
#endif
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
@@ -522,9 +522,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
{
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhash, 64 );
haval256_8x32_close( &ctx.haval, vhash );
memset( &vhash[8<<3], 0, 32<<3 );
rintrlv_8x32_8x64( vhashA, vhash, 512 );
}
@@ -551,9 +551,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
hash7 );
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, state );
bmw512_8x64_init( &ctx.bmw );
bmw512_8x64_update( &ctx.bmw, vhash, 64 );
bmw512_8x64_close( &ctx.bmw, state );
}
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
@@ -605,12 +605,12 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
union _hmq1725_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa2;
cubehashParam cube;
@@ -620,12 +620,12 @@ union _hmq1725_4way_context_overlay
shavite512_2way_context shavite2;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
#if defined(__VAES__)
groestl512_2way_context groestl2;
echo_2way_context echo2;
@@ -652,9 +652,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, input, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, input, 80 );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -686,17 +686,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
// B
if ( h_mask & 0xffffffff )
skein512_4way_full( &ctx.skein, vhashB, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhashB, vhash, 64 );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
// second fork, A = blake parallel, B = bmw parallel.
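The A/B comments mark hmq1725's data-dependent forks: a bit of each lane's previous hash selects which algorithm that lane takes next. In the vectorised code a branch is computed only if at least one lane needs it (results for the other lanes are simply discarded) and the per-lane outputs are then recombined with the blend helpers. A hedged sketch with illustrative names:

   // sketch only, names illustrative, mask polarity as per mm256/mm512_blend_hash_*
   mask = fork_bit_per_lane( vh );               // one selector bit per lane
   if ( any_lane_selects_A( mask ) ) hash_A( vhashA, vhash, 64 );
   if ( any_lane_selects_B( mask ) ) hash_B( vhashB, vhash, 64 );
   blend_per_lane( vh, vhashA, vhashB, mask );   // keep A or B independently per lane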
@@ -704,13 +704,13 @@ extern void hmq1725_4way_hash(void *state, const void *input)
h_mask = _mm256_movemask_epi8( vh_mask );
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
blake512_4way_full( &ctx.blake, vhashA, vhash, 64 );
blake512_4x64_full( &ctx.blake, vhashA, vhash, 64 );
if ( h_mask & 0xffffffff )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -733,16 +733,16 @@ extern void hmq1725_4way_hash(void *state, const void *input)
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhashA );
}
if ( h_mask & 0xffffffff )
{
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -778,9 +778,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
// B
if ( h_mask & 0xffffffff )
{
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhash, 64 );
haval256_4x32_close( &ctx.haval, vhash );
memset( &vhash[8<<2], 0, 32<<2 );
rintrlv_4x32_4x64( vhashB, vhash, 512 );
}
@@ -813,7 +813,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
#endif
blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
blake512_4x64_full( &ctx.blake, vhash, vhash, 64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -845,9 +845,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -890,9 +890,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -921,9 +921,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
if ( h_mask & 0xffffffff )
{
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhashB );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -950,9 +950,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
#endif
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
// A = haval parallel, B = Whirlpool serial
@@ -964,9 +964,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
{
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhash, 64 );
haval256_4x32_close( &ctx.haval, vhash );
memset( &vhash[8<<2], 0, 32<<2 );
rintrlv_4x32_4x64( vhashA, vhash, 512 );
}
@@ -984,9 +984,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, state );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, state );
}
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,

View File

@@ -13,7 +13,7 @@
#if defined(LBRY_16WAY)
static __thread sha256_16way_context sha256_16w_mid;
static __thread sha256_16x32_context sha256_16w_mid;
void lbry_16way_hash( void* output, const void* input )
{
@@ -36,17 +36,17 @@ void lbry_16way_hash( void* output, const void* input )
uint32_t _ALIGN(64) h13[32];
uint32_t _ALIGN(64) h14[32];
uint32_t _ALIGN(64) h15[32];
sha256_16way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_8way_context ctx_sha512;
ripemd160_16way_context ctx_ripemd;
sha256_16x32_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_8x64_context ctx_sha512;
ripemd160_16x32_context ctx_ripemd;
memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
sha256_16x32_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16x32_init( &ctx_sha256 );
sha256_16x32_update( &ctx_sha256, vhashA, 32 );
sha256_16x32_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 8-way 64 bit twice.
dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
@@ -54,13 +54,13 @@ void lbry_16way_hash( void* output, const void* input )
intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashA, 32 );
sha512_8way_close( &ctx_sha512, vhashA );
sha512_8x64_init( &ctx_sha512 );
sha512_8x64_update( &ctx_sha512, vhashA, 32 );
sha512_8x64_close( &ctx_sha512, vhashA );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashB, 32 );
sha512_8way_close( &ctx_sha512, vhashB );
sha512_8x64_init( &ctx_sha512 );
sha512_8x64_update( &ctx_sha512, vhashB, 32 );
sha512_8x64_close( &ctx_sha512, vhashB );
// back to 16-way 32 bit
dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
@@ -68,22 +68,22 @@ void lbry_16way_hash( void* output, const void* input )
intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
h8, h9, h10, h11, h12, h13, h14, h15, 512 );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA, 32 );
ripemd160_16way_close( &ctx_ripemd, vhashB );
ripemd160_16x32_init( &ctx_ripemd );
ripemd160_16x32_update( &ctx_ripemd, vhashA, 32 );
ripemd160_16x32_close( &ctx_ripemd, vhashB );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 );
ripemd160_16way_close( &ctx_ripemd, vhashC );
ripemd160_16x32_init( &ctx_ripemd );
ripemd160_16x32_update( &ctx_ripemd, vhashA+(8<<4), 32 );
ripemd160_16x32_close( &ctx_ripemd, vhashC );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashB, 20 );
sha256_16way_update( &ctx_sha256, vhashC, 20 );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16x32_init( &ctx_sha256 );
sha256_16x32_update( &ctx_sha256, vhashB, 20 );
sha256_16x32_update( &ctx_sha256, vhashC, 20 );
sha256_16x32_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, output );
sha256_16x32_init( &ctx_sha256 );
sha256_16x32_update( &ctx_sha256, vhashA, 32 );
sha256_16x32_close( &ctx_sha256, output );
}
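The interleave/de-interleave churn above exists because SHA-256 and RIPEMD-160 operate on 32-bit words (16 or 8 lanes) while SHA-512 operates on 64-bit words (8 or 4 lanes), so the lane layout has to be rebuilt between stages. In outline, for the 16-way path (the 8-way path below is the same at half width; the ellipses stand for the full argument lists):

   // 16 lanes x 32-bit -> 2 groups of 8 lanes x 64-bit -> back to 16 lanes x 32-bit
   dintrlv_16x32( h0, ..., h15, vhashA, 256 );   // split into per-lane buffers
   intrlv_8x64( vhashA, h0, ..., h7, 256 );      // repack lanes 0-7 for sha512_8x64
   intrlv_8x64( vhashB, h8, ..., h15, 256 );     // repack lanes 8-15
   // ... two sha512_8x64 passes ...
   intrlv_16x32( vhashA, h0, ..., h15, 512 );    // repack for ripemd160_16x32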
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
@@ -115,8 +115,8 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
sha256_16way_init( &sha256_16w_mid );
sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
sha256_16x32_init( &sha256_16w_mid );
sha256_16x32_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
do
{
@@ -144,7 +144,7 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
#elif defined(LBRY_8WAY)
static __thread sha256_8way_context sha256_8w_mid;
static __thread sha256_8x32_context sha256_8w_mid;
void lbry_8way_hash( void* output, const void* input )
{
@@ -159,52 +159,52 @@ void lbry_8way_hash( void* output, const void* input )
uint32_t _ALIGN(32) h5[32];
uint32_t _ALIGN(32) h6[32];
uint32_t _ALIGN(32) h7[32];
sha256_8way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_4way_context ctx_sha512;
ripemd160_8way_context ctx_ripemd;
sha256_8x32_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_4x64_context ctx_sha512;
ripemd160_8x32_context ctx_ripemd;
memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
sha256_8x32_close( &ctx_sha256, vhashA );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8x32_init( &ctx_sha256 );
sha256_8x32_update( &ctx_sha256, vhashA, 32 );
sha256_8x32_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 4-way 64 bit twice.
dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way_update( &ctx_sha512, vhashA, 32 );
sha512_4way_close( &ctx_sha512, vhashA );
sha512_4x64_init( &ctx_sha512 );
sha512_4x64_update( &ctx_sha512, vhashA, 32 );
sha512_4x64_close( &ctx_sha512, vhashA );
sha512_4way_init( &ctx_sha512 );
sha512_4way_update( &ctx_sha512, vhashB, 32 );
sha512_4way_close( &ctx_sha512, vhashB );
sha512_4x64_init( &ctx_sha512 );
sha512_4x64_update( &ctx_sha512, vhashB, 32 );
sha512_4x64_close( &ctx_sha512, vhashB );
// back to 8-way 32 bit
dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way_update( &ctx_ripemd, vhashA, 32 );
ripemd160_8way_close( &ctx_ripemd, vhashB );
ripemd160_8x32_init( &ctx_ripemd );
ripemd160_8x32_update( &ctx_ripemd, vhashA, 32 );
ripemd160_8x32_close( &ctx_ripemd, vhashB );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 );
ripemd160_8way_close( &ctx_ripemd, vhashC );
ripemd160_8x32_init( &ctx_ripemd );
ripemd160_8x32_update( &ctx_ripemd, vhashA+(8<<3), 32 );
ripemd160_8x32_close( &ctx_ripemd, vhashC );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhashB, 20 );
sha256_8way_update( &ctx_sha256, vhashC, 20 );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8x32_init( &ctx_sha256 );
sha256_8x32_update( &ctx_sha256, vhashB, 20 );
sha256_8x32_update( &ctx_sha256, vhashC, 20 );
sha256_8x32_close( &ctx_sha256, vhashA );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, output );
sha256_8x32_init( &ctx_sha256 );
sha256_8x32_update( &ctx_sha256, vhashA, 32 );
sha256_8x32_close( &ctx_sha256, output );
}
int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
@@ -235,8 +235,8 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
sha256_8way_init( &sha256_8w_mid );
sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
sha256_8x32_init( &sha256_8w_mid );
sha256_8x32_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
do
{

View File

@@ -57,7 +57,7 @@ do{ \
#define ROUND2(a, b, c, d, e, f, s, r, k) \
RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_4way_round( ripemd160_4way_context *sc )
static void ripemd160_4x32_round( ripemd160_4x32_context *sc )
{
const __m128i *in = (__m128i*)sc->buf;
__m128i *h = (__m128i*)sc->val;
@@ -249,7 +249,7 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc )
h[0] = tmp;
}
void ripemd160_4way_init( ripemd160_4way_context *sc )
void ripemd160_4x32_init( ripemd160_4x32_context *sc )
{
sc->val[0] = _mm_set1_epi64x( 0x6745230167452301 );
sc->val[1] = _mm_set1_epi64x( 0xEFCDAB89EFCDAB89 );
@@ -259,7 +259,7 @@ void ripemd160_4way_init( ripemd160_4way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data,
size_t len )
{
__m128i *vdata = (__m128i*)data;
@@ -281,7 +281,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
len -= clen;
if ( ptr == block_size )
{
ripemd160_4way_round( sc );
ripemd160_4x32_round( sc );
ptr = 0;
}
clow = sc->count_low;
@@ -292,7 +292,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
}
}
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
@@ -306,7 +306,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_4way_round( sc );
ripemd160_4x32_round( sc );
memset_zero_128( sc->buf, pad>>2 );
}
else
@@ -317,7 +317,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
low = low << 3;
sc->buf[ pad>>2 ] = _mm_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
ripemd160_4x32_round( sc );
for (u = 0; u < 5; u ++)
casti_v128u32( dst, u ) = sc->val[u];
}
@@ -357,7 +357,7 @@ do{ \
#define ROUND2_8W(a, b, c, d, e, f, s, r, k) \
RR_8W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_8way_round( ripemd160_8way_context *sc )
static void ripemd160_8x32_round( ripemd160_8x32_context *sc )
{
const __m256i *in = (__m256i*)sc->buf;
__m256i *h = (__m256i*)sc->val;
@@ -550,7 +550,7 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc )
}
void ripemd160_8way_init( ripemd160_8way_context *sc )
void ripemd160_8x32_init( ripemd160_8x32_context *sc )
{
sc->val[0] = _mm256_set1_epi64x( 0x6745230167452301 );
sc->val[1] = _mm256_set1_epi64x( 0xEFCDAB89EFCDAB89 );
@@ -560,7 +560,7 @@ void ripemd160_8way_init( ripemd160_8way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
@@ -582,7 +582,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
len -= clen;
if ( ptr == block_size )
{
ripemd160_8way_round( sc );
ripemd160_8x32_round( sc );
ptr = 0;
}
clow = sc->count_low;
@@ -593,7 +593,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
}
}
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
@@ -607,7 +607,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_8way_round( sc );
ripemd160_8x32_round( sc );
memset_zero_256( sc->buf, pad>>2 );
}
else
@@ -618,7 +618,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
low = low << 3;
sc->buf[ pad>>2 ] = _mm256_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm256_set1_epi32( high );
ripemd160_8way_round( sc );
ripemd160_8x32_round( sc );
for (u = 0; u < 5; u ++)
casti_m256i( dst, u ) = sc->val[u];
}
@@ -629,7 +629,6 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
// RIPEMD-160 16 way
#define F16W_1(x, y, z) \
_mm512_xor_si512( _mm512_xor_si512( x, y ), z )
@@ -659,7 +658,7 @@ do{ \
#define ROUND2_16W(a, b, c, d, e, f, s, r, k) \
RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_16way_round( ripemd160_16way_context *sc )
static void ripemd160_16x32_round( ripemd160_16x32_context *sc )
{
const __m512i *in = (__m512i*)sc->buf;
__m512i *h = (__m512i*)sc->val;
@@ -851,7 +850,7 @@ static void ripemd160_16way_round( ripemd160_16way_context *sc )
h[0] = tmp;
}
void ripemd160_16way_init( ripemd160_16way_context *sc )
void ripemd160_16x32_init( ripemd160_16x32_context *sc )
{
sc->val[0] = _mm512_set1_epi64( 0x6745230167452301 );
sc->val[1] = _mm512_set1_epi64( 0xEFCDAB89EFCDAB89 );
@@ -861,7 +860,7 @@ void ripemd160_16way_init( ripemd160_16way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -883,7 +882,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
len -= clen;
if ( ptr == block_size )
{
ripemd160_16way_round( sc );
ripemd160_16x32_round( sc );
ptr = 0;
}
clow = sc->count_low;
@@ -894,7 +893,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
}
}
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
@@ -908,7 +907,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_16way_round( sc );
ripemd160_16x32_round( sc );
memset_zero_512( sc->buf, pad>>2 );
}
else
@@ -919,7 +918,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
low = low << 3;
sc->buf[ pad>>2 ] = _mm512_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
ripemd160_16way_round( sc );
ripemd160_16x32_round( sc );
for (u = 0; u < 5; u ++)
casti_m512i( dst, u ) = sc->val[u];
}

View File

@@ -12,12 +12,12 @@ typedef struct
__m128i buf[64>>2];
__m128i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (64))) ripemd160_4way_context;
} __attribute__ ((aligned (64))) ripemd160_4x32_context;
void ripemd160_4way_init( ripemd160_4way_context *sc );
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
void ripemd160_4x32_init( ripemd160_4x32_context *sc );
void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data,
size_t len );
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst );
#if defined (__AVX2__)
@@ -26,12 +26,12 @@ typedef struct
__m256i buf[64>>2];
__m256i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_8way_context;
} __attribute__ ((aligned (128))) ripemd160_8x32_context;
void ripemd160_8way_init( ripemd160_8way_context *sc );
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
void ripemd160_8x32_init( ripemd160_8x32_context *sc );
void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data,
size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst );
#if defined(SIMD512)
@@ -40,12 +40,12 @@ typedef struct
__m512i buf[64>>2];
__m512i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_16way_context;
} __attribute__ ((aligned (128))) ripemd160_16x32_context;
void ripemd160_16way_init( ripemd160_16way_context *sc );
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
void ripemd160_16x32_init( ripemd160_16x32_context *sc );
void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data,
size_t len );
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst );
#endif // AVX512
#endif // __AVX2__

View File

@@ -597,6 +597,45 @@ static void blake2s_compress(blake2s_state *S, const void *buf) {
v[13] = S->t[1] ^ blake2s_IV[5];
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if defined(__SSE2__) || defined(__ARM_NEON)
v128_t *V = (v128_t*)v;
#define ROUND( r ) \
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
V[2] = v128_add32( V[2], V[3] ); \
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
V[2] = v128_add32( V[2], V[3] ); \
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
V[0] = v128_shufll32( V[0] ); \
V[3] = v128_swap64( V[3] ); \
V[2] = v128_shuflr32( V[2] ); \
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
V[2] = v128_add32( V[2], V[3] ); \
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
V[2] = v128_add32( V[2], V[3] ); \
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
V[0] = v128_shuflr32( V[0] ); \
V[3] = v128_swap64( V[3] ); \
V[2] = v128_shufll32( V[2] )
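// Illustrative summary (not from the original source): each V[i] holds one row of
// the 4x4 BLAKE2s state, so the add/xor/rotate sequence above is the column step
// applied to all four columns at once; the lane shuffles then diagonalise the
// state (rotating rows 0, 2 and 3 while leaving row 1 in place), the second
// sequence is the diagonal step, and the final shuffles undo the diagonalisation.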
#else
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -619,6 +658,9 @@ static void blake2s_compress(blake2s_state *S, const void *buf) {
G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
} while(0)
#endif
ROUND(0);
ROUND(1);
ROUND(2);

View File

@@ -274,9 +274,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
#endif // SHA
static const uint32_t keypad_4way[ 4*12 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
@@ -339,7 +336,7 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = {
};
*/
static inline void sha256_4way_init_state( void *state )
static inline void sha256_4x32_init_state( void *state )
{
casti_v128( state, 0 ) = v128_32( 0x6A09E667 );
casti_v128( state, 1 ) = v128_32( 0xBB67AE85 );
@@ -362,21 +359,21 @@ static inline void HMAC_SHA256_80_init_4way( const uint32_t *key,
memcpy( pad, key + 4*16, 4*16 );
memcpy( pad + 4*4, keypad_4way, 4*48 );
sha256_4way_transform_le( (v128_t*)ihash, (v128_t*)pad,
sha256_4x32_transform_le( (v128_t*)ihash, (v128_t*)pad,
(const v128_t*)tstate );
sha256_4way_init_state( tstate );
sha256_4x32_init_state( tstate );
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c;
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)pad,
sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)pad,
(const v128_t*)tstate );
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
for ( ; i < 4*16; i++ ) pad[i] = 0x36363636;
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)pad,
sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)pad,
(const v128_t*)tstate );
}
@@ -389,7 +386,7 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
uint32_t _ALIGN(16) obuf[4 * 16];
int i, j;
sha256_4way_transform_le( (v128_t*)istate, (v128_t*)salt,
sha256_4x32_transform_le( (v128_t*)istate, (v128_t*)salt,
(const v128_t*)tstate );
memcpy(ibuf, salt + 4 * 16, 4 * 16);
@@ -403,10 +400,10 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
ibuf[4 * 4 + 2] = i + 1;
ibuf[4 * 4 + 3] = i + 1;
sha256_4way_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
sha256_4x32_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
(const v128_t*)istate );
sha256_4way_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
sha256_4x32_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
(const v128_t*)ostate );
for ( j = 0; j < 4 * 8; j++ )
@@ -421,9 +418,9 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
uint32_t _ALIGN(64) buf[4 * 16];
int i;
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)salt,
sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)salt,
(const v128_t*)tstate );
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
(const v128_t*)tstate );
final[ 0] = v128_32( 0x00000001 );
@@ -434,20 +431,20 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
= v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128();
final[15] = v128_32 ( 0x00000620 );
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)final,
sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)final,
(const v128_t*)tstate );
memcpy(buf, tstate, 4 * 32);
memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)buf,
sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)buf,
(const v128_t*)ostate );
for ( i = 0; i < 4 * 8; i++ )
output[i] = bswap_32( ostate[i] );
}
#ifdef HAVE_SHA256_8WAY
#if defined(__AVX2__)
/*
static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
@@ -470,7 +467,7 @@ static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
};
*/
static inline void sha256_8way_init_state( void *state )
static inline void sha256_8x32_init_state( void *state )
{
casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 );
casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 );
@@ -494,21 +491,21 @@ static inline void HMAC_SHA256_80_init_8way( const uint32_t *key,
memset( pad + 8*5, 0x00, 8*40 );
for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280;
sha256_8way_transform_le( (__m256i*)ihash, (__m256i*)pad,
sha256_8x32_transform_le( (__m256i*)ihash, (__m256i*)pad,
(const __m256i*)tstate );
sha256_8way_init_state( tstate );
sha256_8x32_init_state( tstate );
for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c;
sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)pad,
sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)pad,
(const __m256i*)tstate );
for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
for ( ; i < 8*16; i++ ) pad[i] = 0x36363636;
sha256_8way_transform_le( (__m256i*)tstate, (__m256i*)pad,
sha256_8x32_transform_le( (__m256i*)tstate, (__m256i*)pad,
(const __m256i*)tstate );
}
@@ -521,7 +518,7 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate,
uint32_t _ALIGN(32) obuf[8 * 16];
int i, j;
sha256_8way_transform_le( (__m256i*)istate, (__m256i*)salt,
sha256_8x32_transform_le( (__m256i*)istate, (__m256i*)salt,
(const __m256i*)tstate );
memcpy( ibuf, salt + 8*16, 8*16 );
@@ -544,10 +541,10 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate,
ibuf[8 * 4 + 6] = i + 1;
ibuf[8 * 4 + 7] = i + 1;
sha256_8way_transform_le( (__m256i*)obuf, (__m256i*)ibuf,
sha256_8x32_transform_le( (__m256i*)obuf, (__m256i*)ibuf,
(const __m256i*)istate );
sha256_8way_transform_le( (__m256i*)ostate2, (__m256i*)obuf,
sha256_8x32_transform_le( (__m256i*)ostate2, (__m256i*)obuf,
(const __m256i*)ostate );
for ( j = 0; j < 8*8; j++ )
@@ -562,9 +559,9 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
uint32_t _ALIGN(128) buf[ 8*16 ];
int i;
sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)salt,
sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)salt,
(const __m256i*)tstate );
sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16),
sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16),
(const __m256i*)tstate );
final[ 0] = _mm256_set1_epi32( 0x00000001 );
@@ -575,7 +572,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
= _mm256_setzero_si256();
final[15] = _mm256_set1_epi32 ( 0x00000620 );
sha256_8way_transform_le( (__m256i*)tstate, final,
sha256_8x32_transform_le( (__m256i*)tstate, final,
(const __m256i*)tstate );
memcpy( buf, tstate, 8*32 );
@@ -583,18 +580,18 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
memset( buf + 8*9, 0x00, 8*24 );
for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300;
sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)buf,
sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)buf,
(const __m256i*)ostate );
for (i = 0; i < 8 * 8; i++)
output[i] = bswap_32(ostate[i]);
}
#endif /* HAVE_SHA256_8WAY */
#endif //AVX2
#if defined(SIMD512)
static inline void sha256_16way_init_state( void *state )
static inline void sha256_16x32_init_state( void *state )
{
casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 );
casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 );
@@ -618,21 +615,21 @@ static inline void HMAC_SHA256_80_init_16way( const uint32_t *key,
memset( pad + 16*5, 0x00, 16*40 );
for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280;
sha256_16way_transform_le( (__m512i*)ihash, (__m512i*)pad,
sha256_16x32_transform_le( (__m512i*)ihash, (__m512i*)pad,
(const __m512i*)tstate );
sha256_16way_init_state( tstate );
sha256_16x32_init_state( tstate );
for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c;
sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)pad,
sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)pad,
(const __m512i*)tstate );
for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
for ( ; i < 16*16; i++ ) pad[i] = 0x36363636;
sha256_16way_transform_le( (__m512i*)tstate, (__m512i*)pad,
sha256_16x32_transform_le( (__m512i*)tstate, (__m512i*)pad,
(const __m512i*)tstate );
}
@@ -645,7 +642,7 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate,
uint32_t _ALIGN(128) ostate2[ 16*8 ];
int i, j;
sha256_16way_transform_le( (__m512i*)istate, (__m512i*)salt,
sha256_16x32_transform_le( (__m512i*)istate, (__m512i*)salt,
(const __m512i*)tstate );
memcpy( ibuf, salt + 16*16, 16*16 );
@@ -676,10 +673,10 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate,
ibuf[ 16*4 + 14 ] = i + 1;
ibuf[ 16*4 + 15 ] = i + 1;
sha256_16way_transform_le( (__m512i*)obuf, (__m512i*)ibuf,
sha256_16x32_transform_le( (__m512i*)obuf, (__m512i*)ibuf,
(const __m512i*)istate );
sha256_16way_transform_le( (__m512i*)ostate2, (__m512i*)obuf,
sha256_16x32_transform_le( (__m512i*)ostate2, (__m512i*)obuf,
(const __m512i*)ostate );
for ( j = 0; j < 16*8; j++ )
@@ -694,9 +691,9 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
uint32_t _ALIGN(128) buf[ 16*16 ];
int i;
sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)salt,
sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)salt,
(const __m512i*)tstate );
sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16),
sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16),
(const __m512i*)tstate );
final[ 0] = _mm512_set1_epi32( 0x00000001 );
@@ -707,7 +704,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
= _mm512_setzero_si512();
final[15] = _mm512_set1_epi32 ( 0x00000620 );
sha256_16way_transform_le( (__m512i*)tstate, final,
sha256_16x32_transform_le( (__m512i*)tstate, final,
(const __m512i*)tstate );
memcpy( buf, tstate, 16*32 );
@@ -715,7 +712,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
memset( buf + 16*9, 0x00, 16*24 );
for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300;
sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)buf,
sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)buf,
(const __m512i*)ostate );
for ( i = 0; i < 16*8; i++ )
@@ -724,25 +721,10 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#endif
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
#endif
#include "scrypt-core-4way.h"
/*
#if ( SCRYPT_THROUGHPUT == 1 )
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{
@@ -752,15 +734,12 @@ static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, scratchbuf, N ); // working
// scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N);
scrypt_core_1way( X, scratchbuf, N );
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true;
}
*/
#endif
#if ( SCRYPT_THROUGHPUT == 8 )
@@ -1201,20 +1180,6 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
********************/
/*
scrypt_core_3way( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+ 96, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+256, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+352, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
*/
if ( work_restart[thrid].restart ) return 0;
@@ -1321,8 +1286,7 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1;
}
#else
// SSE2
#elif defined(__SSE2__) || defined(__ARM_NEON)
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thrid )
@@ -1481,7 +1445,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
@@ -1491,31 +1455,31 @@ bool register_scrypt_algo( algo_gate_t* gate )
opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO, "Scrypt parameters: N= %d, R= 1", opt_param_n );
// scrypt_throughput is defined at compile time and is used in place of
// MAX_WAYS to reduce memory usage.
#if defined(SIMD512)
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
// scrypt_throughput = 2;
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
#elif defined(__AVX2__)
// scrypt_throughput = 8;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else
// scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#endif
switch ( SCRYPT_THROUGHPUT )
{
case 16: // AVX512
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
break;
case 2: // SHA256
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
break;
case 8: // AVX2
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
break;
case 4: // SSE2, NEON
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
break;
default:
scratchbuf_size = opt_param_n; // 1 way
}
char t_units[4] = {0};
char d_units[4] = {0};
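As a worked example of the sizing above, with the default opt_param_n = 1024 (so the N <= 0x4000 branches are taken):

   SCRYPT_THROUGHPUT 16 (AVX512):    1024 * 4 * 128 = 524288 bytes (512 KiB)
   SCRYPT_THROUGHPUT  8 (AVX2):      1024 * 2 * 128 = 262144 bytes (256 KiB)
   SCRYPT_THROUGHPUT  4 (SSE2/NEON): 1024 * 4 * 128 = 524288 bytes (512 KiB)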

View File

@@ -31,7 +31,7 @@
#include "hmac-sha256-hash-4way.h"
#include "compat.h"
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
// HMAC 4-way SSE2
/**
@@ -62,30 +62,30 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_4way_init( &ctx->ictx );
sha256_4way_update( &ctx->ictx, K, Klen );
sha256_4way_close( &ctx->ictx, khash );
sha256_4x32_init( &ctx->ictx );
sha256_4x32_update( &ctx->ictx, K, Klen );
sha256_4x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_4way_init( &ctx->ictx );
sha256_4x32_init( &ctx->ictx );
memset( pad, 0x36, 64*4 );
for ( i = 0; i < Klen; i++ )
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->ictx, pad, 64 );
sha256_4x32_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_4way_init( &ctx->octx );
sha256_4x32_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4x32_update( &ctx->octx, pad, 64 );
}
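For reference, the construction these 4/8/16-lane routines parallelise is ordinary HMAC-SHA256, which matches the ipad/opad handling above (notation only, not code):

   K'    = SHA256(K) if len(K) > 64 bytes, otherwise K zero-padded to 64 bytes
   inner = SHA256( (K' xor 0x36...36) || message )
   HMAC(K, message) = SHA256( (K' xor 0x5c...5c) || inner )

which is exactly the ictx/octx split kept in the context struct.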
/* Add bytes to the HMAC-SHA256 operation. */
@@ -94,7 +94,7 @@ hmac_sha256_4way_update( hmac_sha256_4way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_4way_update( &ctx->ictx, in, len );
sha256_4x32_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
@@ -104,13 +104,13 @@ hmac_sha256_4way_close( hmac_sha256_4way_context *ctx, void *digest )
unsigned char ihash[32*4] __attribute__ ((aligned (64)));
/* Finish the inner SHA256 operation. */
sha256_4way_close( &ctx->ictx, ihash );
sha256_4x32_close( &ctx->ictx, ihash );
/* Feed the inner hash to the outer SHA256 operation. */
sha256_4way_update( &ctx->octx, ihash, 32 );
sha256_4x32_update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
sha256_4way_close( &ctx->octx, digest );
sha256_4x32_close( &ctx->octx, digest );
}
/**
@@ -126,7 +126,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
hmac_sha256_4way_context PShctx, hctx;
uint8_t _ALIGN(128) T[32*4];
uint8_t _ALIGN(128) U[32*4];
__m128i ivec;
v128u32_t ivec;
size_t i, clen;
uint64_t j;
int k;
@@ -139,7 +139,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
for ( i = 0; i * 32 < dkLen; i++ )
{
/* Generate INT(i + 1). */
ivec = _mm_set1_epi32( bswap_32( i+1 ) );
ivec = v128_32( bswap_32( i+1 ) );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy( &hctx, &PShctx, sizeof(hmac_sha256_4way_context) );
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
casti_v128u32( T, k ) = v128_xor( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
/* Copy as many bytes as necessary into buf. */
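The loop above is the standard PBKDF2 block function; in RFC 2898 notation each 32-byte output block is

   T_i = U_1 xor U_2 xor ... xor U_c
   U_1 = PRF( P, S || INT(i) ),   U_j = PRF( P, U_(j-1) )

with PRF = HMAC-SHA256 computed four lanes at a time here (scrypt itself calls PBKDF2 with an iteration count of 1, so T_i reduces to U_1).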
@@ -199,30 +199,30 @@ hmac_sha256_8way_init( hmac_sha256_8way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_8way_init( &ctx->ictx );
sha256_8way_update( &ctx->ictx, K, Klen );
sha256_8way_close( &ctx->ictx, khash );
sha256_8x32_init( &ctx->ictx );
sha256_8x32_update( &ctx->ictx, K, Klen );
sha256_8x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_8way_init( &ctx->ictx );
sha256_8x32_init( &ctx->ictx );
memset( pad, 0x36, 64*8);
for ( i = 0; i < Klen/4; i++ )
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
casti_m256i( K, i ) );
sha256_8way_update( &ctx->ictx, pad, 64 );
sha256_8x32_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_8way_init( &ctx->octx );
sha256_8x32_init( &ctx->octx );
memset( pad, 0x5c, 64*8 );
for ( i = 0; i < Klen/4; i++ )
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
casti_m256i( K, i ) );
sha256_8way_update( &ctx->octx, pad, 64 );
sha256_8x32_update( &ctx->octx, pad, 64 );
}
void
@@ -230,7 +230,7 @@ hmac_sha256_8way_update( hmac_sha256_8way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_8way_update( &ctx->ictx, in, len );
sha256_8x32_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
@@ -240,13 +240,13 @@ hmac_sha256_8way_close( hmac_sha256_8way_context *ctx, void *digest )
unsigned char ihash[32*8] __attribute__ ((aligned (128)));
/* Finish the inner SHA256 operation. */
sha256_8way_close( &ctx->ictx, ihash );
sha256_8x32_close( &ctx->ictx, ihash );
/* Feed the inner hash to the outer SHA256 operation. */
sha256_8way_update( &ctx->octx, ihash, 32 );
sha256_8x32_update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
sha256_8way_close( &ctx->octx, digest );
sha256_8x32_close( &ctx->octx, digest );
}
/**
@@ -332,21 +332,21 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_16way_init( &ctx->ictx );
sha256_16way_update( &ctx->ictx, K, Klen );
sha256_16way_close( &ctx->ictx, khash );
sha256_16x32_init( &ctx->ictx );
sha256_16x32_update( &ctx->ictx, K, Klen );
sha256_16x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_16way_init( &ctx->ictx );
sha256_16x32_init( &ctx->ictx );
memset( pad, 0x36, 64*16 );
for ( i = 0; i < Klen; i++ )
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
casti_m512i( K, i ) );
sha256_16way_update( &ctx->ictx, pad, 64 );
sha256_16x32_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_16way_init( &ctx->octx );
@@ -354,7 +354,7 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
for ( i = 0; i < Klen/4; i++ )
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
casti_m512i( K, i ) );
sha256_16way_update( &ctx->octx, pad, 64 );
sha256_16x32_update( &ctx->octx, pad, 64 );
}
void
@@ -362,7 +362,7 @@ hmac_sha256_16way_update( hmac_sha256_16way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_16way_update( &ctx->ictx, in, len );
sha256_16x32_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
@@ -372,13 +372,13 @@ hmac_sha256_16way_close( hmac_sha256_16way_context *ctx, void *digest )
unsigned char ihash[32*16] __attribute__ ((aligned (128)));
/* Finish the inner SHA256 operation. */
sha256_16way_close( &ctx->ictx, ihash );
sha256_16x32_close( &ctx->ictx, ihash );
/* Feed the inner hash to the outer SHA256 operation. */
sha256_16way_update( &ctx->octx, ihash, 32 );
sha256_16x32_update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
sha256_16way_close( &ctx->octx, digest );
sha256_16x32_close( &ctx->octx, digest );
}
/**

View File

@@ -1,6 +1,6 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* Copyright 2020 JayDDee@gmail.com
* Copyright 2020 JayDDee246@gmail.com
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -38,11 +38,12 @@
#include "simd-utils.h"
#include "sha256-hash.h"
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct _hmac_sha256_4way_context
{
sha256_4way_context ictx;
sha256_4way_context octx;
sha256_4x32_context ictx;
sha256_4x32_context octx;
} hmac_sha256_4way_context;
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -67,8 +68,8 @@ void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t,
typedef struct _hmac_sha256_8way_context
{
sha256_8way_context ictx;
sha256_8way_context octx;
sha256_8x32_context ictx;
sha256_8x32_context octx;
} hmac_sha256_8way_context;
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -88,8 +89,8 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
typedef struct _hmac_sha256_16way_context
{
sha256_16way_context ictx;
sha256_16way_context octx;
sha256_16x32_context ictx;
sha256_16x32_context octx;
} hmac_sha256_16way_context;
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );

View File

@@ -30,6 +30,7 @@ static const uint32_t K256[64] =
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#if defined(__SSE2__) || defined(__ARM_NEON)
// SHA-256 4 way SSE2
#define CHs(X, Y, Z) \
@@ -309,142 +310,6 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
v128_store( state_out + 7, H );
}
# if 0
// Working correctly but still slower
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target )
{
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
v128_t W[16]; v128_memcpy( W, data, 16 );
A = v128_load( state_in );
B = v128_load( state_in+1 );
C = v128_load( state_in+2 );
D = v128_load( state_in+3 );
E = v128_load( state_in+4 );
F = v128_load( state_in+5 );
G = v128_load( state_in+6 );
H = v128_load( state_in+7 );
const v128_t IV7 = H;
const v128_t IV6 = G;
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] );
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 3, 48 );
SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, 48 );
SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
T0 = v128_add32( v128_32( K256[58] ),
v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = v128_add32( B, T0 );
T1 = v128_add32( v128_32( K256[59] ),
v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = v128_add32( A, T1 );
T2 = v128_add32( v128_32( K256[60] ),
v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = v128_add32( H, T2 );
targ = v128_32( target[7] );
hash = v128_bswap32( v128_add32( H, IV7 ) );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( likely(
0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) ))
return 0;
t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) );
// round 58 part 2
F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );
// round 61 part 1
W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = v128_add32( v128_32( K256[61] ),
v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = v128_add32( G, T0 );
if ( t6_mask )
{
targ = v128_and( vmask, v128_32( target[6] ) );
hash = v128_bswap32( v128_add32( G, IV6 ) );
if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) ))
return 0;
else
{
flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( 0 != ( t6_mask & ( flip ^ v128_movmask32(
v128_cmpgt32( hash, targ ) ) ) ) )
return 0;
else if ( target[6] == 0x80000000 )
{
if ( 0 == ( t6_mask & v128_movmask32(
v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) )
return 0;
}
}
}
// rounds 59 to 61 part 2
E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) );
D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) );
C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );
// rounds 62 & 63
W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 48 );
state_out[0] = v128_add32( state_in[0], A );
state_out[1] = v128_add32( state_in[1], B );
state_out[2] = v128_add32( state_in[2], C );
state_out[3] = v128_add32( state_in[3], D );
state_out[4] = v128_add32( state_in[4], E );
state_out[5] = v128_add32( state_in[5], F );
state_out[6] = v128_add32( state_in[6], G );
state_out[7] = v128_add32( state_in[7], H );
return 1;
}
#endif
void sha256_4x32_init( sha256_4x32_context *sc )
{
sc->count_high = sc->count_low = 0;
@@ -529,29 +394,31 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
sha256_4x32_close( &ctx, dst );
}
#endif // SSE2 || NEON
#if defined(__AVX2__)
// SHA-256 8 way
#define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \
mm256_ror_32( x, 13 ) ), \
mm256_ror_32( x, 22 ) )
mm256_xor3( mm256_ror_32( x, 2 ), \
mm256_ror_32( x, 13 ), \
mm256_ror_32( x, 22 ) )
#define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \
mm256_ror_32( x, 11 ) ), \
mm256_ror_32( x, 25 ) )
mm256_xor3( mm256_ror_32( x, 6 ), \
mm256_ror_32( x, 11 ), \
mm256_ror_32( x, 25 ) )
#define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \
mm256_ror_32( x, 18 ) ), \
_mm256_srli_epi32( x, 3 ) )
mm256_xor3( mm256_ror_32( x, 7 ), \
mm256_ror_32( x, 18 ), \
_mm256_srli_epi32( x, 3 ) )
#define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \
mm256_ror_32( x, 19 ) ), \
_mm256_srli_epi32( x, 10 ) )
mm256_xor3( mm256_ror_32( x, 17 ), \
mm256_ror_32( x, 19 ), \
_mm256_srli_epi32( x, 10 ) )
#define SHA256_8WAY_MEXP( a, b, c, d ) \
mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d );
@@ -574,13 +441,8 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
// With AVX512VL, ternary logic optimizations are available.
// If not, optimize by forwarding the result of X^Y in MAJ to the next round
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
#if defined(VL256)
// AVX512 or AVX10-256
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
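
For context on the comment above: _mm256_ternarylogic_epi32 takes an 8-bit truth table, and 0xca encodes CH(X,Y,Z) = (X & Y) | (~X & Z). The generic truth-table immediates for the other SHA-256 helpers are 0xe8 for MAJ(X,Y,Z) = (X & Y) | (X & Z) | (Y & Z) and 0x96 for X ^ Y ^ Z; only the CH case is shown in this hunk, so the other two values are stated here for reference rather than quoted from this file. Without ternary logic, the forwarding trick works because the round rotation makes this round's X^Y equal to the next round's Y^Z. A scalar sketch with illustrative names:

   // MAJ(X,Y,Z) = Y ^ ((X ^ Y) & (Y ^ Z)). y_xor_z is forwarded from the
   // previous round; x_xor_y is produced here and becomes the next round's
   // y_xor_z, so each XOR is computed only once.
   static inline uint32_t maj_forwarded( uint32_t X, uint32_t Y,
                                         uint32_t y_xor_z, uint32_t *x_xor_y )
   {
      *x_xor_y = X ^ Y;
      return Y ^ ( *x_xor_y & y_xor_z );
   }

Before the first round the forwarded value is seeded with B ^ C, as the deleted 4x32 code above shows with Y_xor_Z = v128_xor( B, C ).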
@@ -745,7 +607,7 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
}
// accepts LE input data
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8x32_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i W[16];
@@ -754,7 +616,7 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
}
// Accepts BE input data, need to bswap
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
void sha256_8x32_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i W[16];
@@ -764,7 +626,7 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
}
// Aggressive prehashing, LE byte order
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H, T1;
@@ -813,7 +675,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
_mm256_store_si256( state_mid + 7, H );
}
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X )
{
__m256i A, B, C, D, E, F, G, H;
@@ -914,14 +776,12 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
_mm256_store_si256( state_out + 7, H );
}
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
__m256i vmask, targ, hash;
__m256i W[16]; memcpy_256( W, data, 16 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
uint8_t flip, t6_mask;
A = _mm256_load_si256( state_in );
@@ -1012,7 +872,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
// Got H, test it.
targ = v256_32( target[7] );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
hash = mm256_bswap_32( _mm256_add_epi32( H, IV7 ) );
if ( target[7] )
{
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
@@ -1035,7 +895,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
{
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
hash = mm256_bswap_32( _mm256_add_epi32( G, IV6 ) );
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
@@ -1083,8 +943,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
return 1;
}
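
The "_short" transforms above abort as soon as no lane can still meet the target: they complete only the rounds needed to produce hash word 7 (and word 6 when word 7 ties), compare against the corresponding target words, and skip the remaining rounds otherwise. A per-lane scalar sketch of that test, assuming the bswap_32() helper used elsewhere in this codebase:

   // Returns 0 when this lane cannot beat the target (abort the hash),
   // non-zero when the full hash is still worth completing and checking.
   static inline int target_word_check( uint32_t h7, uint32_t h6,
                                        const uint32_t *target )
   {
      uint32_t w7 = bswap_32( h7 );       // hash words are compared big-endian
      if ( w7 > target[7] ) return 0;     // most significant word too large
      if ( w7 < target[7] ) return 1;     // definitely below, keep going
      return bswap_32( h6 ) <= target[6]; // tie on word 7: decide on word 6
   }

The vector versions do the same comparison across all lanes at once and only fall through to the remaining rounds when at least one lane survives.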
void sha256_8way_init( sha256_8way_context *sc )
void sha256_8x32_init( sha256_8x32_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = v256_32( sha256_iv[0] );
@@ -1100,7 +959,7 @@ void sha256_8way_init( sha256_8way_context *sc )
// need to handle odd byte length for yespower.
// Assume only last update is odd.
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
void sha256_8x32_update( sha256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
@@ -1121,7 +980,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
len -= clen;
if ( ptr == buf_size )
{
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -1132,7 +991,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
}
}
void sha256_8way_close( sha256_8way_context *sc, void *dst )
void sha256_8x32_close( sha256_8x32_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
@@ -1146,7 +1005,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
memset_zero_256( sc->buf, pad >> 2 );
}
else
@@ -1159,17 +1018,17 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
mm256_block_bswap_32( dst, sc->val );
}
void sha256_8way_full( void *dst, const void *data, size_t len )
void sha256_8x32_full( void *dst, const void *data, size_t len )
{
sha256_8way_context ctx;
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, data, len );
sha256_8way_close( &ctx, dst );
sha256_8x32_context ctx;
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, data, len );
sha256_8x32_close( &ctx, dst );
}
#if defined(SIMD512)
@@ -1302,7 +1161,7 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W,
}
// accepts LE input data
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
void sha256_16x32_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i W[16];
@@ -1311,7 +1170,7 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
}
// Accepts BE input data, need to bswap
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
void sha256_16x32_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i W[16];
@@ -1321,7 +1180,7 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
}
// Aggressive prehashing, LE byte order
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
void sha256_16x32_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H, T1;
@@ -1369,7 +1228,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
_mm512_store_si512( state_mid + 7, H );
}
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
{
__m512i A, B, C, D, E, F, G, H;
@@ -1470,15 +1329,13 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
// returns 0 if hash aborted early and invalid,
// returns 1 for completed hash with at least one valid candidate.
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target )
{
__m512i A, B, C, D, E, F, G, H, hash, targ;
__m512i T0, T1, T2;
__m512i W[16]; memcpy_512( W, data, 16 );
__mmask16 t6_mask;
const __m512i bswap_shuf = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in+1 );
@@ -1588,7 +1445,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
H = _mm512_add_epi32( H, T2 );
// got H, test it against target[7]
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
hash = mm512_bswap_32( _mm512_add_epi32( H , IV7 ) );
targ = v512_32( target[7] );
if ( target[7] )
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
@@ -1608,7 +1465,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// got G, test it against target[6] if indicated
if ( (uint16_t)t6_mask )
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
hash = mm512_bswap_32( _mm512_add_epi32( G, IV6 ) );
targ = v512_32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
return 0;
@@ -1644,7 +1501,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
return 1;
}
void sha256_16way_init( sha256_16way_context *sc )
void sha256_16x32_init( sha256_16x32_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = v512_32( sha256_iv[0] );
@@ -1657,7 +1514,7 @@ void sha256_16way_init( sha256_16way_context *sc )
sc->val[7] = v512_32( sha256_iv[7] );
}
void sha256_16way_update( sha256_16way_context *sc, const void *data,
void sha256_16x32_update( sha256_16x32_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -1679,7 +1536,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
len -= clen;
if ( ptr == buf_size )
{
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -1690,7 +1547,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
}
}
void sha256_16way_close( sha256_16way_context *sc, void *dst )
void sha256_16x32_close( sha256_16x32_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
@@ -1704,7 +1561,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
memset_zero_512( sc->buf, pad >> 2 );
}
else
@@ -1717,17 +1574,17 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
mm512_block_bswap_32( dst, sc->val );
}
void sha256_16way_full( void *dst, const void *data, size_t len )
void sha256_16x32_full( void *dst, const void *data, size_t len )
{
sha256_16way_context ctx;
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, data, len );
sha256_16way_close( &ctx, dst );
sha256_16x32_context ctx;
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, data, len );
sha256_16x32_close( &ctx, dst );
}
#undef CH

View File

@@ -180,20 +180,9 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );
// Temporary API during naming transition
#define sha256_8way_context sha256_8x32_context
#define sha256_8way_init sha256_8x32_init
#define sha256_8way_update sha256_8x32_update
#define sha256_8way_close sha256_8x32_close
#define sha256_8way_full sha256_8x32_full
#define sha256_8way_transform_le sha256_8x32_transform_le
#define sha256_8way_transform_be sha256_8x32_transform_be
#define sha256_8way_prehash_3rounds sha256_8x32_prehash_3rounds
#define sha256_8way_final_rounds sha256_8x32_final_rounds
#define sha256_8way_transform_le_short sha256_8x32_transform_le_short
#endif // AVX2
#if defined(__SSE2__) || defined(__ARM_NEON)
// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON
typedef struct
@@ -219,16 +208,5 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target );
// Temporary API during naming transition
#define sha256_4way_context sha256_4x32_context
#define sha256_4way_init sha256_4x32_init
#define sha256_4way_update sha256_4x32_update
#define sha256_4way_close sha256_4x32_close
#define sha256_4way_full sha256_4x32_full
#define sha256_4way_transform_le sha256_4x32_transform_le
#define sha256_4way_transform_be sha256_4x32_transform_be
#define sha256_4way_prehash_3rounds sha256_4x32_prehash_3rounds
#define sha256_4way_final_rounds sha256_4x32_final_rounds
#define sha256_4way_transform_le_short sha256_4x32_transform_le_short
#endif
#endif // SSE2 || NEON
#endif // SHA256_HASH_H__

View File

@@ -32,8 +32,6 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_transform_le( mstatea, pdata, sha256_iv );
@@ -69,10 +67,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) =
_mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 );
casti_v128( hasha, 1 ) =
_mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -81,10 +77,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) =
_mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 );
casti_v128( hashb, 1 ) =
_mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
@@ -204,8 +198,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256_iv );
@@ -231,7 +223,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for second hash
istate[0] = v512_32( sha256_iv[0] );
@@ -250,15 +242,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16way_transform_le_short(
sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16x32_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -299,8 +290,6 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -325,22 +314,22 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8x32_transform_le( mstate1, vdata, istate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8x32_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
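
The 3-round prehash used here works because the nonce occupies bytes 12-15 of the second 64-byte block, i.e. message word W[3] (pdata[19]). SHA-256 round t consumes W[t] for t < 16, so rounds 0-2 depend only on nonce-free data and can be hashed once per scan, leaving only the final rounds to be redone for each nonce vector inside the loop.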

View File

@@ -12,7 +12,7 @@
#define SHA256D_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA256D_4WAY 1
#endif

View File

@@ -17,7 +17,6 @@
#elif defined (__SSE2__) || defined(__ARM_NEON)
#define SHA256DT_4X32 1
#endif
// else ref, should never happen
static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
{
@@ -205,8 +204,6 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256dt_iv );
@@ -258,8 +255,7 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
for ( int lane = 0; lane < 16; lane++ )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -298,8 +294,6 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -339,7 +333,7 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -406,7 +400,6 @@ int scanhash_sha256dt_4x32( struct work *work, const uint32_t max_nonce,
do
{
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
// sha256_4x32_transform_le( block, vdata+16, mhash1 );
sha256_4x32_transform_le( hash32, block, iv );
for ( int lane = 0; lane < 4; lane++ )

View File

@@ -7,28 +7,28 @@
#if defined(SHA256T_16WAY)
static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
static __thread sha256_16x32_context sha256_ctx16 __attribute__ ((aligned (64)));
void sha256q_16way_hash( void* output, const void* input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (64)));
sha256_16way_context ctx;
sha256_16x32_context ctx;
memcpy( &ctx, &sha256_ctx16, sizeof ctx );
sha256_16way_update( &ctx, input + (64<<4), 16 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_update( &ctx, input + (64<<4), 16 );
sha256_16x32_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, output );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, output );
}
int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
@@ -51,8 +51,8 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
sha256_16way_init( &sha256_ctx16 );
sha256_16way_update( &sha256_ctx16, vdata, 64 );
sha256_16x32_init( &sha256_ctx16 );
sha256_16x32_update( &sha256_ctx16, vdata, 64 );
do
{
@@ -80,28 +80,28 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
static __thread sha256_8x32_context sha256_ctx8 __attribute__ ((aligned (64)));
void sha256q_8way_hash( void* output, const void* input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
sha256_8way_context ctx;
sha256_8x32_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_update( &ctx, input + (64<<3), 16 );
sha256_8x32_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, output );
}
int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
@@ -123,8 +123,8 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
sha256_8way_init( &sha256_ctx8 );
sha256_8way_update( &sha256_ctx8, vdata, 64 );
sha256_8x32_init( &sha256_ctx8 );
sha256_8x32_update( &sha256_ctx8, vdata, 64 );
do
{
@@ -152,28 +152,28 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
#if defined(SHA256T_4WAY)
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
static __thread sha256_4x32_context sha256_ctx4 __attribute__ ((aligned (64)));
void sha256q_4way_hash( void* output, const void* input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx;
sha256_4x32_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
sha256_4way_update( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_update( &ctx, input + (64<<2), 16 );
sha256_4x32_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, output );
}
int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
@@ -205,8 +205,8 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
0 };
v128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
sha256_4x32_init( &sha256_ctx4 );
sha256_4x32_update( &sha256_ctx4, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{

View File

@@ -35,8 +35,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256_iv );
@@ -62,7 +60,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = v512_32( sha256_iv[0] );
@@ -81,18 +79,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
sha256_16way_transform_le( block, block, istate );
sha256_16x32_transform_le( block, block, istate );
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
if ( sha256_16x32_transform_le_short( hash32, block, istate, ptarget ) )
{
for ( int lane = 0; lane < 16; lane++ )
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -301,8 +298,6 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -327,29 +322,29 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8x32_transform_le( mstate1, vdata, istate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_8way_transform_le( block, block, istate );
sha256_8x32_transform_le( block, block, istate );
// 3. 32 byte hash from 2.
if ( unlikely( sha256_8way_transform_le_short(
if ( unlikely( sha256_8x32_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -419,8 +414,8 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
do
{
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
sha256_4way_transform_le( block, block, iv );
sha256_4way_transform_le( hash32, block, iv );
sha256_4x32_transform_le( block, block, iv );
sha256_4x32_transform_le( hash32, block, iv );
for ( int lane = 0; lane < 4; lane++ )
{

View File

@@ -83,15 +83,13 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i MSG, TMP;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
@@ -103,7 +101,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
TMSG0 = mm256_bswap_64( TMSG0 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128 (MSG ) );
@@ -113,7 +111,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
TMSG1 = mm256_bswap_64( TMSG1 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -124,7 +122,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
TMSG2 = mm256_bswap_64( TMSG2 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -135,7 +133,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
TMSG3 = mm256_bswap_64( TMSG3 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -735,8 +733,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m512i shuff_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = v512_64( 0x80 );
@@ -750,10 +746,8 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
else
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
v512_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
v512_64( sc->count << 3 ), shuff_bswap64 );
sc->buf[ pad >> 3 ] = v512_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v512_64( bswap_64( sc->count << 3 ) );
sha512_8x64_round( sc, sc->buf, sc->val );
mm512_block_bswap_64( dst, sc->val );
@@ -957,8 +951,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m256i shuff_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = v256_64( 0x80 );
@@ -972,10 +964,8 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
else
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
v256_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
v256_64( sc->count << 3 ), shuff_bswap64 );
sc->buf[ pad >> 3 ] = v256_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v256_64( bswap_64( sc->count << 3 ) );
sha512_4x64_round( sc, sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val );
@@ -1138,8 +1128,8 @@ void sha512_2x64_close( sha512_2x64_context *sc, void *dst )
else
v128_memset_zero( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = v128_bswap64( v128_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
sc->buf[ pad >> 3 ] = v128_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_64( bswap_64( sc->count << 3 ) );
sha512_2x64_round( sc, sc->buf, sc->val );
v128_block_bswap64( castp_v128u64( dst ), sc->val );
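
A note on the length encoding visible in these hunks: sc->count holds the message length in bytes, while SHA-512 padding stores the length in bits as a 128-bit big-endian value, so

   bits         = count * 8
   low 64 bits  = count << 3     (bits mod 2^64)
   high 64 bits = count >> 61    (bits / 2^64)

which is why the close functions above write bswap_64( sc->count >> 61 ) followed by bswap_64( sc->count << 3 ) into the last two words of the pad block.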

View File

@@ -36,7 +36,6 @@ typedef struct
uint64_t count;
bool initialized;
} sha512_8x64_context __attribute__ ((aligned (128)));
#define sha512_8way_context sha512_8x64_context
void sha512_8x64_init( sha512_8x64_context *sc);
void sha512_8x64_update( sha512_8x64_context *sc, const void *data,
@@ -45,10 +44,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst );
void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
size_t len );
#define sha512_8way_init sha512_8x64_init
#define sha512_8way_update sha512_8x64_update
#define sha512_8way_close sha512_8x64_close
#endif // AVX512
#if defined (__AVX2__)
@@ -62,7 +57,6 @@ typedef struct
uint64_t count;
bool initialized;
} sha512_4x64_context __attribute__ ((aligned (64)));
#define sha512_4way_context sha512_4x64_context
void sha512_4x64_init( sha512_4x64_context *sc);
void sha512_4x64_update( sha512_4x64_context *sc, const void *data,
@@ -71,10 +65,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst );
void sha512_4x64_ctx( sha512_4x64_context *sc, void *dst, const void *data,
size_t len );
#define sha512_4way_init sha512_4x64_init
#define sha512_4way_update sha512_4x64_update
#define sha512_4way_close sha512_4x64_close
#endif // AVX2
typedef struct

View File

@@ -14,7 +14,7 @@
#if defined(SHA512256D_8WAY)
static void sha512256d_8way_init( sha512_8way_context *ctx )
static void sha512256d_8x64_init( sha512_8x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
@@ -33,7 +33,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
{
uint64_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
sha512_8way_context ctx;
sha512_8x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*8]);
uint32_t *pdata = work->data;
@@ -53,13 +53,13 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do
{
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, vdata, 80 );
sha512_8way_close( &ctx, hash );
sha512256d_8x64_init( &ctx );
sha512_8x64_update( &ctx, vdata, 80 );
sha512_8x64_close( &ctx, hash );
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, hash, 32 );
sha512_8way_close( &ctx, hash );
sha512256d_8x64_init( &ctx );
sha512_8x64_update( &ctx, hash, 32 );
sha512_8x64_close( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
@@ -82,7 +82,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
#elif defined(SHA512256D_4WAY)
static void sha512256d_4way_init( sha512_4way_context *ctx )
static void sha512256d_4x64_init( sha512_4x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
@@ -101,7 +101,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
{
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
sha512_4way_context ctx;
sha512_4x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*4]);
uint32_t *pdata = work->data;
@@ -119,13 +119,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, vdata, 80 );
sha512_4way_close( &ctx, hash );
sha512256d_4x64_init( &ctx );
sha512_4x64_update( &ctx, vdata, 80 );
sha512_4x64_close( &ctx, hash );
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, hash, 32 );
sha512_4way_close( &ctx, hash );
sha512256d_4x64_init( &ctx );
sha512_4x64_update( &ctx, hash, 32 );
sha512_4x64_close( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )

View File

@@ -430,9 +430,9 @@ do { \
} while (0)
static void
shabal_16way_init( void *cc, unsigned size )
shabal_16x32_init( void *cc, unsigned size )
{
shabal_16way_context *sc = (shabal_16way_context*)cc;
shabal_16x32_context *sc = (shabal_16x32_context*)cc;
if ( size == 512 )
{ // copy immediate constants directly to working registers later.
@@ -494,9 +494,9 @@ shabal_16way_init( void *cc, unsigned size )
}
static void
shabal_16way_core( void *cc, const unsigned char *data, size_t len )
shabal_16x32_core( void *cc, const unsigned char *data, size_t len )
{
shabal_16way_context *sc = (shabal_16way_context*)cc;
shabal_16x32_context *sc = (shabal_16x32_context*)cc;
__m512i *buf;
__m512i *vdata = (__m512i*)data;
const int buf_size = 64;
@@ -544,10 +544,10 @@ shabal_16way_core( void *cc, const unsigned char *data, size_t len )
}
static void
shabal_16way_close( void *cc, unsigned ub, unsigned n, void *dst,
shabal_16x32_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_16way_context *sc = (shabal_16way_context*)cc;
shabal_16x32_context *sc = (shabal_16x32_context*)cc;
__m512i *buf;
const int buf_size = 64;
size_t ptr;
@@ -590,52 +590,39 @@ shabal_16way_close( void *cc, unsigned ub, unsigned n, void *dst,
}
void
shabal256_16way_init( void *cc )
shabal256_16x32_init( void *cc )
{
shabal_16way_init(cc, 256);
shabal_16x32_init(cc, 256);
}
void
shabal256_16way_update( void *cc, const void *data, size_t len )
shabal256_16x32_update( void *cc, const void *data, size_t len )
{
shabal_16way_core( cc, data, len );
shabal_16x32_core( cc, data, len );
}
void
shabal256_16way_close( void *cc, void *dst )
shabal256_16x32_close( void *cc, void *dst )
{
shabal_16way_close(cc, 0, 0, dst, 8);
shabal_16x32_close(cc, 0, 0, dst, 8);
}
void
shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
shabal512_16x32_init(void *cc)
{
shabal_16way_close(cc, ub, n, dst, 8);
shabal_16x32_init(cc, 512);
}
void
shabal512_16way_init(void *cc)
shabal512_16x32_update(void *cc, const void *data, size_t len)
{
shabal_16way_init(cc, 512);
shabal_16x32_core(cc, data, len);
}
void
shabal512_16way_update(void *cc, const void *data, size_t len)
shabal512_16x32_close(void *cc, void *dst)
{
shabal_16way_core(cc, data, len);
}
void
shabal512_16way_close(void *cc, void *dst)
{
shabal_16way_close(cc, 0, 0, dst, 16);
}
void
shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_16way_close(cc, ub, n, dst, 16);
shabal_16x32_close(cc, 0, 0, dst, 16);
}
#endif
@@ -1031,9 +1018,9 @@ do { \
} while (0)
static void
shabal_8way_init( void *cc, unsigned size )
shabal_8x32_init( void *cc, unsigned size )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
shabal_8x32_context *sc = (shabal_8x32_context*)cc;
if ( size == 512 )
{ // copy immediate constants directly to working registers later.
@@ -1095,9 +1082,9 @@ shabal_8way_init( void *cc, unsigned size )
}
static void
shabal_8way_core( void *cc, const unsigned char *data, size_t len )
shabal_8x32_core( void *cc, const unsigned char *data, size_t len )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
shabal_8x32_context *sc = (shabal_8x32_context*)cc;
__m256i *buf;
__m256i *vdata = (__m256i*)data;
const int buf_size = 64;
@@ -1146,10 +1133,10 @@ shabal_8way_core( void *cc, const unsigned char *data, size_t len )
}
static void
shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
shabal_8x32_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
shabal_8x32_context *sc = (shabal_8x32_context*)cc;
__m256i *buf;
const int buf_size = 64;
size_t ptr;
@@ -1192,52 +1179,39 @@ shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
}
void
shabal256_8way_init( void *cc )
shabal256_8x32_init( void *cc )
{
shabal_8way_init(cc, 256);
shabal_8x32_init(cc, 256);
}
void
shabal256_8way_update( void *cc, const void *data, size_t len )
shabal256_8x32_update( void *cc, const void *data, size_t len )
{
shabal_8way_core( cc, data, len );
shabal_8x32_core( cc, data, len );
}
void
shabal256_8way_close( void *cc, void *dst )
shabal256_8x32_close( void *cc, void *dst )
{
shabal_8way_close(cc, 0, 0, dst, 8);
shabal_8x32_close(cc, 0, 0, dst, 8);
}
void
shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
shabal512_8x32_init(void *cc)
{
shabal_8way_close(cc, ub, n, dst, 8);
shabal_8x32_init(cc, 512);
}
void
shabal512_8way_init(void *cc)
shabal512_8x32_update(void *cc, const void *data, size_t len)
{
shabal_8way_init(cc, 512);
shabal_8x32_core(cc, data, len);
}
void
shabal512_8way_update(void *cc, const void *data, size_t len)
shabal512_8x32_close(void *cc, void *dst)
{
shabal_8way_core(cc, data, len);
}
void
shabal512_8way_close(void *cc, void *dst)
{
shabal_8way_close(cc, 0, 0, dst, 16);
}
void
shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_8way_close(cc, ub, n, dst, 16);
shabal_8x32_close(cc, 0, 0, dst, 16);
}
#endif // AVX2
@@ -1674,9 +1648,9 @@ static const sph_u32 C_init_512[] = {
*/
static void
shabal_4way_init( void *cc, unsigned size )
shabal_4x32_init( void *cc, unsigned size )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
shabal_4x32_context *sc = (shabal_4x32_context*)cc;
if ( size == 512 )
{ // copy immediate constants directly to working registers later.
@@ -1786,9 +1760,9 @@ shabal_4way_init( void *cc, unsigned size )
}
static void
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
shabal_4x32_core( void *cc, const unsigned char *data, size_t len )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
shabal_4x32_context *sc = (shabal_4x32_context*)cc;
v128_t *buf;
v128_t *vdata = (v128_t*)data;
const int buf_size = 64;
@@ -1838,10 +1812,10 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
}
static void
shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
shabal_4x32_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
shabal_4x32_context *sc = (shabal_4x32_context*)cc;
v128_t *buf;
const int buf_size = 64;
size_t ptr;
@@ -1884,52 +1858,39 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
}
void
shabal256_4way_init( void *cc )
shabal256_4x32_init( void *cc )
{
shabal_4way_init(cc, 256);
shabal_4x32_init(cc, 256);
}
void
shabal256_4way_update( void *cc, const void *data, size_t len )
shabal256_4x32_update( void *cc, const void *data, size_t len )
{
shabal_4way_core( cc, data, len );
shabal_4x32_core( cc, data, len );
}
void
shabal256_4way_close( void *cc, void *dst )
shabal256_4x32_close( void *cc, void *dst )
{
shabal_4way_close(cc, 0, 0, dst, 8);
shabal_4x32_close(cc, 0, 0, dst, 8);
}
void
shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
shabal512_4x32_init(void *cc)
{
shabal_4way_close(cc, ub, n, dst, 8);
shabal_4x32_init(cc, 512);
}
void
shabal512_4way_init(void *cc)
shabal512_4x32_update(void *cc, const void *data, size_t len)
{
shabal_4way_init(cc, 512);
shabal_4x32_core(cc, data, len);
}
void
shabal512_4way_update(void *cc, const void *data, size_t len)
shabal512_4x32_close(void *cc, void *dst)
{
shabal_4way_core(cc, data, len);
}
void
shabal512_4way_close(void *cc, void *dst)
{
shabal_4way_close(cc, 0, 0, dst, 16);
}
void
shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_4way_close(cc, ub, n, dst, 16);
shabal_4x32_close(cc, 0, 0, dst, 16);
}
#endif

View File

@@ -4,10 +4,6 @@
#include <stddef.h>
#include "simd-utils.h"
#define SPH_SIZE_shabal256 256
#define SPH_SIZE_shabal512 512
#if defined(SIMD512)
typedef struct {
@@ -16,22 +12,27 @@ typedef struct {
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_16way_context __attribute__ ((aligned (64)));
} shabal_16x32_context __attribute__ ((aligned (64)));
typedef shabal_16way_context shabal256_16way_context;
typedef shabal_16way_context shabal512_16way_context;
typedef shabal_16x32_context shabal256_16x32_context;
typedef shabal_16x32_context shabal512_16x32_context;
void shabal256_16way_init( void *cc );
void shabal256_16way_update( void *cc, const void *data, size_t len );
void shabal256_16way_close( void *cc, void *dst );
void shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal256_16x32_init( void *cc );
void shabal256_16x32_update( void *cc, const void *data, size_t len );
void shabal256_16x32_close( void *cc, void *dst );
void shabal512_16way_init( void *cc );
void shabal512_16way_update( void *cc, const void *data, size_t len );
void shabal512_16way_close( void *cc, void *dst );
void shabal512_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_16x32_init( void *cc );
void shabal512_16x32_update( void *cc, const void *data, size_t len );
void shabal512_16x32_close( void *cc, void *dst );
#define shabal256_16way_context shabal256_16x32_context
#define shabal256_16way_init shabal256_16x32_init
#define shabal256_16way_update shabal256_16x32_update
#define shabal256_16way_close shabal256_16x32_close
#define shabal512_16way_context shabal512_16x32_context
#define shabal512_16way_init shabal512_16x32_init
#define shabal512_16way_update shabal512_16x32_update
#define shabal512_16way_close shabal512_16x32_close
#endif
@@ -43,22 +44,27 @@ typedef struct {
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_8way_context __attribute__ ((aligned (64)));
} shabal_8x32_context __attribute__ ((aligned (64)));
typedef shabal_8way_context shabal256_8way_context;
typedef shabal_8way_context shabal512_8way_context;
typedef shabal_8x32_context shabal256_8x32_context;
typedef shabal_8x32_context shabal512_8x32_context;
void shabal256_8way_init( void *cc );
void shabal256_8way_update( void *cc, const void *data, size_t len );
void shabal256_8way_close( void *cc, void *dst );
void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal256_8x32_init( void *cc );
void shabal256_8x32_update( void *cc, const void *data, size_t len );
void shabal256_8x32_close( void *cc, void *dst );
void shabal512_8way_init( void *cc );
void shabal512_8way_update( void *cc, const void *data, size_t len );
void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_8x32_init( void *cc );
void shabal512_8x32_update( void *cc, const void *data, size_t len );
void shabal512_8x32_close( void *cc, void *dst );
#define shabal256_8way_context shabal256_8x32_context
#define shabal256_8way_init shabal256_8x32_init
#define shabal256_8way_update shabal256_8x32_update
#define shabal256_8way_close shabal256_8x32_close
#define shabal512_8way_context shabal512_8x32_context
#define shabal512_8way_init shabal512_8x32_init
#define shabal512_8way_update shabal512_8x32_update
#define shabal512_8way_close shabal512_8x32_close
#endif
@@ -70,59 +76,29 @@ typedef struct {
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_4way_context;
} shabal_4x32_context;
typedef shabal_4way_context shabal256_4way_context;
typedef shabal_4way_context shabal512_4way_context;
typedef shabal_4x32_context shabal256_4x32_context;
typedef shabal_4x32_context shabal512_4x32_context;
void shabal256_4way_init( void *cc );
void shabal256_4way_update( void *cc, const void *data, size_t len );
void shabal256_4way_close( void *cc, void *dst );
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal256_4x32_init( void *cc );
void shabal256_4x32_update( void *cc, const void *data, size_t len );
void shabal256_4x32_close( void *cc, void *dst );
void shabal512_4way_init( void *cc );
void shabal512_4way_update( void *cc, const void *data, size_t len );
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_4x32_init( void *cc );
void shabal512_4x32_update( void *cc, const void *data, size_t len );
void shabal512_4x32_close( void *cc, void *dst );
#define shabal256_4way_context shabal256_4x32_context
#define shabal256_4way_init shabal256_4x32_init
#define shabal256_4way_update shabal256_4x32_update
#define shabal256_4way_close shabal256_4x32_close
#define shabal512_4way_context shabal512_4x32_context
#define shabal512_4way_init shabal512_4x32_init
#define shabal512_4way_update shabal512_4x32_update
#define shabal512_4way_close shabal512_4x32_close
#endif
// SSE or NEON
/* No __mullo_pi32
typedef struct
{
v64_t buf[16] __attribute__ ((aligned (64)));
v64_t A[12], B[16], C[16];
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_2x32_context;
typedef shabal_2x32_context shabal256_2x32_context;
typedef shabal_2x32_context shabal512_2x32_context;
void shabal256_2x32_init( void *cc );
void shabal256_2x32_update( void *cc, const void *data, size_t len );
void shabal256_2x32_close( void *cc, void *dst );
void shabal256_2x32_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_2x32_init( shabal512_2x32_context *cc );
void shabal512_2x32_update( shabal512_2x32_context *cc, const void *data,
size_t len );
void shabal512_2x32_close( shabal512_2x32_context *cc, void *dst );
void shabal512_2x32_addbits_and_close( shabal512_2x32_context *cc,
unsigned ub, unsigned n, void *dst );
void shabal512_2x32_ctx( shabal512_2x32_context *cc, void *dst,
const void *data, size_t len );
void shabal512_2x32( shabal512_2x32_context *dst, const void *data,
size_t len );
*/
#endif

View File

@@ -6,23 +6,23 @@
#if defined (SKEIN_8WAY)
static __thread skein512_8way_context skein512_8way_ctx
static __thread skein512_8x64_context skein512_8x64_ctx
__attribute__ ((aligned (64)));
void skeinhash_8way( void *state, const void *input )
{
uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx_skein;
memcpy( &ctx_skein, &skein512_8way_ctx, sizeof( ctx_skein ) );
skein512_8x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_8x64_ctx, sizeof( ctx_skein ) );
uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
sha256_8way_context ctx_sha256;
sha256_8x32_context ctx_sha256;
skein512_8way_final16( &ctx_skein, vhash64, input + (64*8) );
skein512_8x64_final16( &ctx_skein, vhash64, input + (64*8) );
rintrlv_8x64_8x32( vhash32, vhash64, 512 );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhash32, 64 );
sha256_8way_close( &ctx_sha256, state );
sha256_8x32_init( &ctx_sha256 );
sha256_8x32_update( &ctx_sha256, vhash32, 64 );
sha256_8x32_close( &ctx_sha256, state );
}
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
@@ -46,7 +46,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
skein512_8way_prehash64( &skein512_8way_ctx, vdata );
skein512_8x64_prehash64( &skein512_8x64_ctx, vdata );
do
{
skeinhash_8way( hash, vdata );
@@ -73,14 +73,14 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
#elif defined (SKEIN_4WAY)
static __thread skein512_4way_context skein512_4way_ctx
static __thread skein512_4x64_context skein512_4x64_ctx
__attribute__ ((aligned (64)));
void skeinhash_4way( void *state, const void *input )
{
uint64_t vhash64[8*4] __attribute__ ((aligned (128)));
skein512_4way_context ctx_skein;
memcpy( &ctx_skein, &skein512_4way_ctx, sizeof( ctx_skein ) );
skein512_4x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_4x64_ctx, sizeof( ctx_skein ) );
#if defined(__SHA__)
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
@@ -88,10 +88,10 @@ void skeinhash_4way( void *state, const void *input )
uint32_t hash3[16] __attribute__ ((aligned (64)));
#else
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx_sha256;
sha256_4x32_context ctx_sha256;
#endif
skein512_4way_final16( &ctx_skein, vhash64, input + (64*4) );
skein512_4x64_final16( &ctx_skein, vhash64, input + (64*4) );
#if defined(__SHA__)
@@ -107,9 +107,9 @@ void skeinhash_4way( void *state, const void *input )
#else
rintrlv_4x64_4x32( vhash32, vhash64, 512 );
sha256_4way_init( &ctx_sha256 );
sha256_4way_update( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, state );
sha256_4x32_init( &ctx_sha256 );
sha256_4x32_update( &ctx_sha256, vhash32, 64 );
sha256_4x32_close( &ctx_sha256, state );
#endif
}
@@ -132,7 +132,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &skein512_4way_ctx, vdata );
skein512_4x64_prehash64( &skein512_4x64_ctx, vdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );

View File

@@ -513,7 +513,7 @@ do { \
#if defined(SIMD512)
void skein256_8way_init( skein256_8way_context *sc )
void skein256_8x64_init( skein256_8x64_context *sc )
{
sc->h0 = _mm512_set1_epi64( 0xCCD044A12FDB3E13 );
sc->h1 = _mm512_set1_epi64( 0xE83590301A79A9EB );
@@ -527,7 +527,7 @@ void skein256_8way_init( skein256_8way_context *sc )
sc->ptr = 0;
}
void skein512_8way_init( skein512_8way_context *sc )
void skein512_8x64_init( skein512_8x64_context *sc )
{
sc->h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
sc->h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
@@ -542,7 +542,7 @@ void skein512_8way_init( skein512_8way_context *sc )
}
static void
skein_big_core_8way( skein512_8way_context *sc, const void *data,
skein_big_core_8x64( skein512_8x64_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -587,7 +587,7 @@ skein_big_core_8way( skein512_8way_context *sc, const void *data,
}
static void
skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
skein_big_close_8x64( skein512_8x64_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_len )
{
__m512i *buf;
@@ -621,7 +621,7 @@ skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
memcpy_512( dst, buf, out_len >> 3 );
}
void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
void skein512_8x64_full( skein512_8x64_context *sc, void *out, const void *data,
size_t len )
{
__m512i h0, h1, h2, h3, h4, h5, h6, h7;
@@ -698,7 +698,7 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
}
void
skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
skein512_8x64_prehash64( skein512_8x64_context *sc, const void *data )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf = sc->buf;
@@ -732,7 +732,7 @@ skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
}
void
skein512_8way_final16( skein512_8way_context *sc, void *output,
skein512_8x64_final16( skein512_8x64_context *sc, void *output,
const void *data )
{
__m512i *in = (__m512i*)data;
@@ -778,34 +778,34 @@ skein512_8way_final16( skein512_8way_context *sc, void *output,
void
skein256_8way_update(void *cc, const void *data, size_t len)
skein256_8x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_8way(cc, data, len);
skein_big_core_8x64(cc, data, len);
}
void
skein256_8way_close(void *cc, void *dst)
skein256_8x64_close(void *cc, void *dst)
{
skein_big_close_8way(cc, 0, 0, dst, 32);
skein_big_close_8x64(cc, 0, 0, dst, 32);
}
void
skein512_8way_update(void *cc, const void *data, size_t len)
skein512_8x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_8way(cc, data, len);
skein_big_core_8x64(cc, data, len);
}
void
skein512_8way_close(void *cc, void *dst)
skein512_8x64_close(void *cc, void *dst)
{
skein_big_close_8way(cc, 0, 0, dst, 64);
skein_big_close_8x64(cc, 0, 0, dst, 64);
}
#endif // AVX512
#if defined(__AVX2__)
void skein256_4way_init( skein256_4way_context *sc )
void skein256_4x64_init( skein256_4x64_context *sc )
{
sc->h0 = _mm256_set1_epi64x( 0xCCD044A12FDB3E13 );
sc->h1 = _mm256_set1_epi64x( 0xE83590301A79A9EB );
@@ -819,7 +819,7 @@ void skein256_4way_init( skein256_4way_context *sc )
sc->ptr = 0;
}
void skein512_4way_init( skein512_4way_context *sc )
void skein512_4x64_init( skein512_4x64_context *sc )
{
sc->h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
sc->h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
@@ -835,7 +835,7 @@ void skein512_4way_init( skein512_4way_context *sc )
// Do not use for 128 bit data length
static void
skein_big_core_4way( skein512_4way_context *sc, const void *data,
skein_big_core_4x64( skein512_4x64_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
@@ -882,7 +882,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
}
static void
skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
skein_big_close_4x64( skein512_4x64_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_len )
{
__m256i *buf;
@@ -920,7 +920,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
}
void
skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
skein512_4x64_full( skein512_4x64_context *sc, void *out, const void *data,
size_t len )
{
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
@@ -995,7 +995,7 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
}
void
skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
skein512_4x64_prehash64( skein512_4x64_context *sc, const void *data )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf = sc->buf;
@@ -1029,7 +1029,7 @@ skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
}
void
skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data )
skein512_4x64_final16( skein512_4x64_context *sc, void *out, const void *data )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf = sc->buf;
@@ -1073,29 +1073,29 @@ skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data )
// Broken for 80 bytes, use prehash.
void
skein256_4way_update(void *cc, const void *data, size_t len)
skein256_4x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_4way(cc, data, len);
skein_big_core_4x64(cc, data, len);
}
void
skein256_4way_close(void *cc, void *dst)
skein256_4x64_close(void *cc, void *dst)
{
skein_big_close_4way(cc, 0, 0, dst, 32);
skein_big_close_4x64(cc, 0, 0, dst, 32);
}
// Broken for 80 & 128 bytes, use prehash or full
void
skein512_4way_update(void *cc, const void *data, size_t len)
skein512_4x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_4way(cc, data, len);
skein_big_core_4x64(cc, data, len);
}
void
skein512_4way_close(void *cc, void *dst)
skein512_4x64_close(void *cc, void *dst)
{
skein_big_close_4way(cc, 0, 0, dst, 64);
skein_big_close_4x64(cc, 0, 0, dst, 64);
}
#endif // AVX2
@@ -1231,7 +1231,7 @@ void skein512_2x64_init( skein512_2x64_context *sc )
}
static void
skein_big_core_2way( skein512_2x64_context *sc, const void *data,
skein_big_core_2x64( skein512_2x64_context *sc, const void *data,
size_t len )
{
v128u64_t *vdata = (v128u64_t*)data;
@@ -1278,7 +1278,7 @@ skein_big_core_2way( skein512_2x64_context *sc, const void *data,
}
static void
skein_big_close_2way( skein512_2x64_context *sc, unsigned ub, unsigned n,
skein_big_close_2x64( skein512_2x64_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_len )
{
v128u64_t *buf;
@@ -1471,13 +1471,13 @@ skein512_2x64_final16( skein512_2x64_context *sc, void *out, const void *data )
void
skein256_2x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_2way(cc, data, len);
skein_big_core_2x64(cc, data, len);
}
void
skein256_2x64_close(void *cc, void *dst)
{
skein_big_close_2way(cc, 0, 0, dst, 32);
skein_big_close_2x64(cc, 0, 0, dst, 32);
}
@@ -1485,13 +1485,12 @@ skein256_2x64_close(void *cc, void *dst)
void
skein512_2x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_2way(cc, data, len);
skein_big_core_2x64(cc, data, len);
}
void
skein512_2x64_close(void *cc, void *dst)
{
skein_big_close_2way(cc, 0, 0, dst, 64);
skein_big_close_2x64(cc, 0, 0, dst, 64);
}
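
A minimal sketch (hypothetical helper, not part of the diff), assuming an SSE2/NEON build where the 2x64 Skein declarations from the header apply: it runs one 64-byte block per lane through the renamed streaming API; for 80- or 128-byte inputs the comments on the wider paths recommend the prehash or _full entry points instead.

// Sketch only: two interleaved lanes of Skein-512 over one 64-byte block each.
// "skein512_2lanes" is a hypothetical name; init/update/close are the renamed
// 2x64 functions declared in this tree's skein headers.
static void skein512_2lanes( void *interleaved_digest,
                             const void *interleaved_msg )
{
    skein512_2x64_context ctx;
    skein512_2x64_init( &ctx );
    skein512_2x64_update( &ctx, interleaved_msg, 64 );   // 64 bytes per lane
    skein512_2x64_close( &ctx, interleaved_digest );     // 2 x 512-bit digests
}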

View File

@@ -52,24 +52,36 @@ typedef struct
__m512i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
uint64_t bcount;
} skein_8way_big_context __attribute__ ((aligned (128)));
} skein_8x64_big_context __attribute__ ((aligned (128)));
typedef skein_8way_big_context skein512_8way_context;
typedef skein_8way_big_context skein256_8way_context;
typedef skein_8x64_big_context skein512_8x64_context;
typedef skein_8x64_big_context skein256_8x64_context;
void skein512_8way_full( skein512_8way_context *sc, void *out,
void skein512_8x64_full( skein512_8x64_context *sc, void *out,
const void *data, size_t len );
void skein512_8way_init( skein512_8way_context *sc );
void skein512_8way_update( void *cc, const void *data, size_t len );
void skein512_8way_close( void *cc, void *dst );
void skein512_8x64_init( skein512_8x64_context *sc );
void skein512_8x64_update( void *cc, const void *data, size_t len );
void skein512_8x64_close( void *cc, void *dst );
void skein512_8way_prehash64( skein512_8way_context *sc, const void *data );
void skein512_8way_final16( skein512_8way_context *sc, void *out,
void skein512_8x64_prehash64( skein512_8x64_context *sc, const void *data );
void skein512_8x64_final16( skein512_8x64_context *sc, void *out,
const void *data );
void skein256_8way_init( skein256_8way_context *sc );
void skein256_8way_update( void *cc, const void *data, size_t len );
void skein256_8way_close( void *cc, void *dst );
void skein256_8x64_init( skein256_8x64_context *sc );
void skein256_8x64_update( void *cc, const void *data, size_t len );
void skein256_8x64_close( void *cc, void *dst );
#define skein512_8way_context skein512_8x64_context
#define skein512_8way_full skein512_8x64_full
#define skein512_8way_init skein512_8x64_init
#define skein512_8way_update skein512_8x64_update
#define skein512_8way_close skein512_8x64_close
#define skein512_8way_prehash64 skein512_8x64_prehash64
#define skein512_8way_final16 skein512_8x64_final16
#define skein256_8way_context skein256_8x64_context
#define skein256_8way_init skein256_8x64_init
#define skein256_8way_update skein256_8x64_update
#define skein256_8way_close skein256_8x64_close
#endif // AVX512
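
A minimal sketch (hypothetical wrapper), assuming an AVX-512 build and the 8x64 declarations above: the 80-byte block header is split so the invariant first 64 bytes go through prehash64 once and only the nonce-bearing last 16 bytes are reprocessed by final16. The scanhash loops in this diff use the same pattern but hoist the prehash out of the nonce loop and memcpy the saved context each iteration.

// Sketch only: "skein512_80byte_8lanes" is a hypothetical name; the input is an
// 8x64-interleaved 80-byte block, so the per-lane 64-byte boundary sits at
// byte offset 64*8 in the interleaved buffer.
static void skein512_80byte_8lanes( void *out, const void *interleaved80 )
{
    skein512_8x64_context ctx;
    skein512_8x64_prehash64( &ctx, interleaved80 );              // bytes 0..63
    skein512_8x64_final16( &ctx, out,
                           (const char*)interleaved80 + 64*8 );  // bytes 64..79
}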
@@ -81,25 +93,35 @@ typedef struct
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
uint64_t bcount;
} skein_4way_big_context __attribute__ ((aligned (128)));
} skein_4x64_big_context __attribute__ ((aligned (128)));
typedef skein_4way_big_context skein512_4way_context;
typedef skein_4way_big_context skein256_4way_context;
typedef skein_4x64_big_context skein512_4x64_context;
typedef skein_4x64_big_context skein256_4x64_context;
void skein512_4way_init( skein512_4way_context *sc );
void skein512_4way_full( skein512_4way_context *sc, void *out,
void skein512_4x64_init( skein512_4x64_context *sc );
void skein512_4x64_full( skein512_4x64_context *sc, void *out,
const void *data, size_t len );
void skein512_4way_update( void *cc, const void *data, size_t len );
void skein512_4way_close( void *cc, void *dst );
void skein256_4way_init( skein256_4way_context *sc );
void skein256_4way_update( void *cc, const void *data, size_t len );
void skein256_4way_close( void *cc, void *dst );
void skein512_4way_prehash64( skein512_4way_context *sc, const void *data );
void skein512_4way_final16( skein512_4way_context *sc, void *out,
void skein512_4x64_update( void *cc, const void *data, size_t len );
void skein512_4x64_close( void *cc, void *dst );
void skein512_4x64_prehash64( skein512_4x64_context *sc, const void *data );
void skein512_4x64_final16( skein512_4x64_context *sc, void *out,
const void *data );
void skein256_4x64_init( skein256_4x64_context *sc );
void skein256_4x64_update( void *cc, const void *data, size_t len );
void skein256_4x64_close( void *cc, void *dst );
#define skein512_4way_context skein512_4x64_context
#define skein512_4way_full skein512_4x64_full
#define skein512_4way_init skein512_4x64_init
#define skein512_4way_update skein512_4x64_update
#define skein512_4way_close skein512_4x64_close
#define skein512_4way_prehash64 skein512_4x64_prehash64
#define skein512_4way_final16 skein512_4x64_final16
#define skein256_4way_context skein256_4x64_context
#define skein256_4way_init skein256_4x64_init
#define skein256_4way_update skein256_4x64_update
#define skein256_4way_close skein256_4x64_close
#endif
@@ -109,10 +131,10 @@ typedef struct
v128u64_t h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
uint64_t bcount;
} skein_2way_big_context __attribute__ ((aligned (128)));
} skein_2x64_big_context __attribute__ ((aligned (128)));
typedef skein_2way_big_context skein512_2x64_context;
typedef skein_2way_big_context skein256_2x64_context;
typedef skein_2x64_big_context skein512_2x64_context;
typedef skein_2x64_big_context skein256_2x64_context;
void skein512_2x64_init( skein512_2x64_context *sc );
void skein512_2x64_full( skein512_2x64_context *sc, void *out,

View File

@@ -21,17 +21,17 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
skein512_8way_context ctx;
skein512_8x64_context ctx;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
skein512_8way_prehash64( &ctx, vdata );
skein512_8x64_prehash64( &ctx, vdata );
do
{
skein512_8way_final16( &ctx, hash, vdata + (16*8) );
skein512_8way_full( &ctx, hash, hash, 64 );
skein512_8x64_final16( &ctx, hash, vdata + (16*8) );
skein512_8x64_full( &ctx, hash, hash, 64 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hashq3[ lane ] <= targq3 && !bench ) )
@@ -71,16 +71,16 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
skein512_4way_context ctx;
skein512_4x64_context ctx;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &ctx, vdata );
skein512_4x64_prehash64( &ctx, vdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
skein512_4way_final16( &ctx, hash, vdata + (16*4) );
skein512_4way_full( &ctx, hash, hash, 64 );
skein512_4x64_final16( &ctx, hash, vdata + (16*4) );
skein512_4x64_full( &ctx, hash, hash, 64 );
for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )

View File

@@ -189,7 +189,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
v128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = swab32(pdata[17]);
uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
{
hex_getAlgoString( (const uint32_t*) (&edata[1]), x16r_hash_order );

View File

@@ -31,18 +31,18 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
{
case JH:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16r_ctx.jh );
jh512_8way_update( &x16r_ctx.jh, vdata, 64 );
jh512_8x64_init( &x16r_ctx.jh );
jh512_8x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
keccak512_8way_init( &x16r_ctx.keccak );
keccak512_8way_update( &x16r_ctx.keccak, vdata, 72 );
keccak512_8x64_init( &x16r_ctx.keccak );
keccak512_8x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16r_ctx.skein );
skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
skein512_8x64_init( &x16r_ctx.skein );
skein512_8x64_update( &x16r_ctx.skein, vdata, 64 );
break;
case LUFFA:
{
@@ -78,8 +78,8 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16r_ctx.hamsi );
hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 );
hamsi512_8x64_init( &x16r_ctx.hamsi );
hamsi512_8x64_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -90,8 +90,8 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
shabal512_8way_init( &x16r_ctx.shabal );
shabal512_8way_update( &x16r_ctx.shabal, vdata2, 64 );
shabal512_8x32_init( &x16r_ctx.shabal );
shabal512_8x32_update( &x16r_ctx.shabal, vdata2, 64 );
rintrlv_8x32_8x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
@@ -146,27 +146,27 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
{
case BLAKE:
if ( i == 0 )
blake512_8way_full( &ctx.blake, vhash, input, size );
blake512_8x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_full( &ctx.blake, vhash, vhash, size );
blake512_8x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
bmw512_8x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
bmw512_8x64_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
bmw512_8x64_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
bmw512_8x64_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -191,43 +191,43 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
break;
case JH:
if ( i == 0 )
jh512_8way_update( &ctx.jh, input + (64<<3), 16 );
jh512_8x64_update( &ctx.jh, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, size );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
if ( i == 0 )
keccak512_8way_update( &ctx.keccak, input + (72<<3), 8 );
keccak512_8x64_update( &ctx.keccak, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, size );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, size );
}
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case SKEIN:
if ( i == 0 )
skein512_8way_update( &ctx.skein, input + (64<<3), 16 );
skein512_8x64_update( &ctx.skein, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, size );
skein512_8x64_init( &ctx.skein );
skein512_8x64_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -333,15 +333,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
hamsi512_8x64_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -388,13 +388,13 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
if ( i == 0 )
shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 );
shabal512_8x32_update( &ctx.shabal, vhash + (16<<3), 16 );
else
{
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, size );
}
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -438,16 +438,16 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
}
break;
case SHA_512:
sha512_8way_init( &ctx.sha512 );
sha512_8x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_8way_update( &ctx.sha512, input, size );
sha512_8x64_update( &ctx.sha512, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
sha512_8way_update( &ctx.sha512, vhash, size );
sha512_8x64_update( &ctx.sha512, vhash, size );
}
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -556,17 +556,17 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
{
case JH:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16r_ctx.jh );
jh512_4way_update( &x16r_ctx.jh, vdata, 64 );
jh512_4x64_init( &x16r_ctx.jh );
jh512_4x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
keccak512_4way_init( &x16r_ctx.keccak );
keccak512_4way_update( &x16r_ctx.keccak, vdata, 72 );
keccak512_4x64_init( &x16r_ctx.keccak );
keccak512_4x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
skein512_4x64_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
@@ -599,8 +599,8 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16r_ctx.hamsi );
hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 );
hamsi512_4x64_init( &x16r_ctx.hamsi );
hamsi512_4x64_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -610,8 +610,8 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata2, pdata );
shabal512_4way_init( &x16r_ctx.shabal );
shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 );
shabal512_4x32_init( &x16r_ctx.shabal );
shabal512_4x32_update( &x16r_ctx.shabal, vdata2, 64 );
rintrlv_4x32_4x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
@@ -652,24 +652,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
{
case BLAKE:
if ( i == 0 )
blake512_4way_full( &ctx.blake, vhash, input, size );
blake512_4x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way_full( &ctx.blake, vhash, vhash, size );
blake512_4x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
bmw512_4x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way_update( &ctx.bmw, input, size );
bmw512_4x64_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way_update( &ctx.bmw, vhash, size );
bmw512_4x64_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case GROESTL:
@@ -689,35 +689,35 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
break;
case JH:
if ( i == 0 )
jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
jh512_4x64_update( &ctx.jh, input + (64<<2), 16 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, size );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_close( &ctx.jh, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case KECCAK:
if ( i == 0 )
keccak512_4way_update( &ctx.keccak, input + (72<<2), 8 );
keccak512_4x64_update( &ctx.keccak, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, size );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case SKEIN:
if ( i == 0 )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
skein512_4x64_final16( &ctx.skein, vhash, input + (64*4) );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_full( &ctx.skein, vhash, vhash, size );
skein512_4x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
@@ -809,14 +809,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
hamsi512_4x64_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
@@ -845,13 +845,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
if ( i == 0 )
shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 );
shabal512_4x32_update( &ctx.shabal, vhash + (16<<2), 16 );
else
{
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, size );
}
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
break;
case WHIRLPOOL:
@@ -878,16 +878,16 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
}
break;
case SHA_512:
sha512_4way_init( &ctx.sha512 );
sha512_4x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_4way_update( &ctx.sha512, input, size );
sha512_4x64_update( &ctx.sha512, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, size );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, size );
}
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
}

View File

@@ -125,19 +125,19 @@ bool register_x21s__algo( algo_gate_t* gate );
union _x16r_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sha512_8x64_context sha512;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -170,8 +170,8 @@ int scanhash_x16r_8way( struct work *, uint32_t ,
union _x16r_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
@@ -181,17 +181,17 @@ union _x16r_4way_context_overlay
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sha512_4x64_context sha512;
} __attribute__ ((aligned (64)));
#define _x16r_4x64_context_overlay _x16r_4way_context_overlay

View File

@@ -20,7 +20,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
v128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
@@ -28,7 +28,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
x16r_hash_order, swab32( pdata[17] ), timeHash );
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
x16r_prehash( edata, pdata, x16r_hash_order );
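
For context, a minimal sketch (hypothetical helper and values) of the ntime masking used above: clearing the low seven bits of the byte-swapped timestamp groups it into 128-second windows, so the hash order and time hash only need recomputing when the window changes.

#include <stdint.h>

// Sketch only: swap32 stands in for whatever byte-swap helper the build uses.
static uint32_t swap32( uint32_t x )
{
    return (x >> 24) | ((x >> 8) & 0x0000ff00)
         | ((x << 8) & 0x00ff0000) | (x << 24);
}

static int time_window_changed( uint32_t prev_masked, uint32_t ntime_le )
{
    return ( swap32( ntime_le ) & 0xffffff80 ) != prev_masked;  // 128 s windows
}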

View File

@@ -14,19 +14,19 @@
union _x16rv2_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cubehashParam cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sha512_8x64_context sha512;
sph_tiger_context tiger;
#if defined(__VAES__)
groestl512_4way_context groestl;
@@ -76,29 +76,29 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
blake512_8x64_init( &ctx.blake );
if ( i == 0 )
blake512_8way_full( &ctx.blake, vhash, input, size );
blake512_8x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_full( &ctx.blake, vhash, vhash, size );
blake512_8x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
bmw512_8x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
bmw512_8x64_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
bmw512_8x64_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
bmw512_8x64_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -123,15 +123,15 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case JH:
if ( i == 0 )
jh512_8way_update( &ctx.jh, input + (64<<3), 16 );
jh512_8x64_update( &ctx.jh, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, size );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -165,30 +165,30 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
}
for ( int i = (24/4); i < (64/4); i++ )
@@ -197,23 +197,23 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case SKEIN:
if ( i == 0 )
skein512_8way_update( &ctx.skein, input + (64<<3), 16 );
skein512_8x64_update( &ctx.skein, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, size );
skein512_8x64_init( &ctx.skein );
skein512_8x64_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -395,16 +395,16 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
hamsi512_8x64_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -451,13 +451,13 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
if ( i == 0 )
shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 );
shabal512_8x32_update( &ctx.shabal, vhash + (16<<3), 16 );
else
{
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, size );
}
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -562,9 +562,9 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -623,8 +623,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
{
case JH:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16rv2_ctx.jh );
jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
jh512_8x64_init( &x16rv2_ctx.jh );
jh512_8x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
@@ -637,8 +637,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16rv2_ctx.skein );
skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
skein512_8x64_init( &x16rv2_ctx.skein );
skein512_8x64_update( &x16rv2_ctx.skein, vdata, 64 );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -649,8 +649,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
hamsi512_8x64_init( &x16rv2_ctx.hamsi );
hamsi512_8x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -661,8 +661,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
shabal512_8way_init( &x16rv2_ctx.shabal );
shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 );
shabal512_8x32_init( &x16rv2_ctx.shabal );
shabal512_8x32_update( &x16rv2_ctx.shabal, vdata2, 64 );
rintrlv_8x32_8x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
@@ -701,8 +701,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
union _x16rv2_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
@@ -712,17 +712,17 @@ union _x16rv2_4way_context_overlay
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sha512_4x64_context sha512;
sph_tiger_context tiger;
};
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
@@ -761,24 +761,24 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
{
case BLAKE:
if ( i == 0 )
blake512_4way_full( &ctx.blake, vhash, input, size );
blake512_4x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way_full( &ctx.blake, vhash, vhash, size );
blake512_4x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
bmw512_4x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way_update( &ctx.bmw, input, size );
bmw512_4x64_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way_update( &ctx.bmw, vhash, size );
bmw512_4x64_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case GROESTL:
@@ -798,14 +798,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case JH:
if ( i == 0 )
jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
jh512_4x64_update( &ctx.jh, input + (64<<2), 16 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, size );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_close( &ctx.jh, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case KECCAK:
@@ -842,20 +842,20 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
skein512_4x64_final16( &ctx.skein, vhash, input + (64*4) );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, size );
skein512_4way_close( &ctx.skein, vhash );
skein512_4x64_init( &ctx.skein );
skein512_4x64_update( &ctx.skein, vhash, size );
skein512_4x64_close( &ctx.skein, vhash );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
@@ -976,14 +976,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
hamsi512_4x64_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
@@ -1012,13 +1012,13 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
if ( i == 0 )
shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 );
shabal512_4x32_update( &ctx.shabal, vhash + (16<<2), 16 );
else
{
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, size );
}
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
break;
case WHIRLPOOL:
@@ -1078,9 +1078,9 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
@@ -1133,8 +1133,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
{
case JH:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16rv2_ctx.jh );
jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
jh512_4x64_init( &x16rv2_ctx.jh );
jh512_4x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
@@ -1146,7 +1146,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
skein512_4x64_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -1156,8 +1156,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
hamsi512_4x64_init( &x16rv2_ctx.hamsi );
hamsi512_4x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -1167,8 +1167,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata32, pdata );
shabal512_4way_init( &x16rv2_ctx.shabal );
shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
shabal512_4x32_init( &x16rv2_ctx.shabal );
shabal512_4x32_update( &x16rv2_ctx.shabal, vdata32, 64 );
rintrlv_4x32_4x64( vdata, vdata32, 640 );
break;
case WHIRLPOOL:

View File

@@ -168,7 +168,7 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
uint32_t ntime = bswap_32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )

View File

@@ -21,10 +21,10 @@ static __thread uint64_t* x21s_8way_matrix;
union _x21s_8way_context_overlay
{
haval256_5_8way_context haval;
haval256_8x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sha256_8way_context sha256;
sha256_8x32_context sha256;
} __attribute__ ((aligned (64)));
typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;
@@ -50,9 +50,9 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhash, 64 );
haval256_8x32_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
@@ -122,9 +122,9 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhash, 64 );
sha256_8way_close( &ctx.sha256, output );
sha256_8x32_init( &ctx.sha256 );
sha256_8x32_update( &ctx.sha256, vhash, 64 );
sha256_8x32_close( &ctx.sha256, output );
return 1;
}
@@ -202,11 +202,11 @@ static __thread uint64_t* x21s_4way_matrix;
union _x21s_4way_context_overlay
{
haval256_5_4way_context haval;
haval256_4x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if !defined(__SHA__)
sha256_4way_context sha256;
sha256_4x32_context sha256;
#endif
} __attribute__ ((aligned (64)));
@@ -228,9 +228,9 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhash, 64 );
haval256_4x32_close( &ctx.haval, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -279,9 +279,9 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
#else
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way_init( &ctx.sha256 );
sha256_4way_update( &ctx.sha256, vhash, 64 );
sha256_4way_close( &ctx.sha256, vhash );
sha256_4x32_init( &ctx.sha256 );
sha256_4x32_update( &ctx.sha256, vhash, 64 );
sha256_4x32_close( &ctx.sha256, vhash );
dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 );
#endif

View File

@@ -78,7 +78,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
uint32_t ntime = bswap_32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )

View File

@@ -31,20 +31,20 @@
union _sonoa_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -75,9 +75,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
// 1
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8x64_full( &ctx.blake, vhash, input, 80 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -107,15 +107,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -189,7 +189,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
if ( work_restart[thr_id].restart ) return 0;
// 2
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -219,15 +219,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -298,14 +298,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
if ( work_restart[thr_id].restart ) return 0;
// 3
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -335,17 +335,17 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_init( &ctx.skein );
skein512_8x64_update( &ctx.skein, vhash, 64 );
skein512_8x64_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -416,9 +416,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -438,7 +438,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -468,15 +468,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -547,9 +547,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -566,15 +566,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 512 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
#if defined(__VAES__)
@@ -633,13 +633,13 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
if ( work_restart[thr_id].restart ) return 0;
// 5
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
rintrlv_8x64_8x32( vhashA, vhash, 512 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhashA, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhashA, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
#if defined(__VAES__)
@@ -669,15 +669,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -748,9 +748,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -767,9 +767,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -789,7 +789,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -819,15 +819,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -898,9 +898,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -917,9 +917,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -936,9 +936,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -958,7 +958,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -988,15 +988,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -1067,9 +1067,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -1086,9 +1086,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -1105,15 +1105,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, 512 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, state );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, state );
return 1;
}
@@ -1122,8 +1122,8 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
union _sonoa_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo512_2way_context echo;
@@ -1131,19 +1131,19 @@ union _sonoa_4way_context_overlay
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
};
typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
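
The *_context_overlay unions above and below exist because the pipeline stages run strictly one after another, so only one hash context is ever live at a time; overlaying them in a union keeps the stack footprint at the size of the largest context instead of the sum of all of them. A minimal sketch of the idea, assuming the project's headers; the union and function names here are hypothetical, only the per-stage call signatures are taken from this diff.

union tiny_overlay                      /* hypothetical two-member overlay */
{
    jh512_4x64_context     jh;
    keccak512_4x64_context keccak;
};

static void two_stages( void *vhash )   /* illustrative only */
{
    union tiny_overlay ctx;
    jh512_4x64_init( &ctx.jh );
    jh512_4x64_update( &ctx.jh, vhash, 64 );
    jh512_4x64_close( &ctx.jh, vhash );       /* jh is finished, its storage can be reused */
    keccak512_4x64_init( &ctx.keccak );       /* keccak overlays the same bytes */
    keccak512_4x64_update( &ctx.keccak, vhash, 64 );
    keccak512_4x64_close( &ctx.keccak, vhash );
}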
@@ -1161,11 +1161,11 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
// 1
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4x64_full( &ctx.blake, vhash, input, 80 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1189,15 +1189,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1241,9 +1241,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
if ( work_restart[thr_id].restart ) return 0;
// 2
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1267,15 +1267,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1316,16 +1316,16 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
if ( work_restart[thr_id].restart ) return 0;
// 3
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1349,15 +1349,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1398,9 +1398,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1413,9 +1413,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
// 4
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1439,15 +1439,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1488,9 +1488,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1501,15 +1501,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
rintrlv_4x32_4x64( vhashB, vhash, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhashB, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
#if defined(__VAES__)
@@ -1545,15 +1545,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
// 5
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
rintrlv_4x64_4x32( vhashB, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhashB, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
#if defined(__VAES__)
@@ -1580,15 +1580,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1629,9 +1629,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1642,9 +1642,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -1658,9 +1658,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1684,15 +1684,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1733,9 +1733,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1746,9 +1746,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -1759,9 +1759,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1775,9 +1775,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1801,15 +1801,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1850,9 +1850,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1863,9 +1863,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -1876,15 +1876,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashB, 64 );
haval256_4x32_close( &ctx.haval, state );
return 1;
}

View File

@@ -31,11 +31,11 @@
union _x17_16way_context_overlay
{
blake512_8way_context blake;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_2buf_context cube;
#if defined(__VAES__)
@@ -48,17 +48,17 @@ union _x17_16way_context_overlay
hashState_echo echo;
#endif
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_16way_context shabal;
shabal512_16x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_16way_context haval;
sha512_8x64_context sha512;
haval256_16x32_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_16way_context_overlay x17_16way_context_overlay;
static __thread __m512i x17_16way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64)));
int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
int thr_id )
@@ -85,13 +85,10 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
uint64_t hash15[8] __attribute__ ((aligned (32)));
x17_16way_context_overlay ctx;
memcpy( &ctx.blake, &blake512_8way_ctx, sizeof (blake512_8way_ctx) );
blake512_8way_final_le( &blake512_8way_ctx, vhashA, nonceA,
memcpy( &ctx.blake, &blake512_8x64_ctx, sizeof (blake512_8x64_ctx) );
blake512_8x64_final_le( &blake512_8x64_ctx, vhashA, nonceA,
x17_16way_midstate );
blake512_8way_final_le( &ctx.blake, vhashB, nonceB,
blake512_8x64_final_le( &ctx.blake, vhashB, nonceB,
x17_16way_midstate );
bmw512_8x64_full( &ctx.bmw, vhashA, vhashA, 64 );
@@ -140,22 +137,22 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
#endif
skein512_8way_full( &ctx.skein, vhashA, vhashA, 64 );
skein512_8way_full( &ctx.skein, vhashB, vhashB, 64 );
skein512_8x64_full( &ctx.skein, vhashA, vhashA, 64 );
skein512_8x64_full( &ctx.skein, vhashB, vhashB, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhashA, 64 );
jh512_8way_close( &ctx.jh, vhashA );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhashB, 64 );
jh512_8way_close( &ctx.jh, vhashB );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhashA, 64 );
jh512_8x64_close( &ctx.jh, vhashA );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhashB, 64 );
jh512_8x64_close( &ctx.jh, vhashB );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhashA, 64 );
keccak512_8way_close( &ctx.keccak, vhashA );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhashB, 64 );
keccak512_8way_close( &ctx.keccak, vhashB );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhashA, 64 );
keccak512_8x64_close( &ctx.keccak, vhashA );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhashB, 64 );
keccak512_8x64_close( &ctx.keccak, vhashB );
//
rintrlv_8x64_4x128( vhashC, vhashD, vhashA, 512 );
@@ -310,18 +307,17 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
*/
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8way_close( &ctx.hamsi, vhashA );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhashA );
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07, vhashA );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashB, 64 );
hamsi512_8way_close( &ctx.hamsi, vhashB );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhashB, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhashB );
dintrlv_8x64_512( hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15, vhashB );
fugue512_full( &ctx.fugue, hash00, hash00, 64 );
fugue512_full( &ctx.fugue, hash01, hash01, 64 );
fugue512_full( &ctx.fugue, hash02, hash02, 64 );
@@ -344,9 +340,9 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
shabal512_16way_init( &ctx.shabal );
shabal512_16way_update( &ctx.shabal, vhashA, 64 );
shabal512_16way_close( &ctx.shabal, vhashA );
shabal512_16x32_init( &ctx.shabal );
shabal512_16x32_update( &ctx.shabal, vhashA, 64 );
shabal512_16x32_close( &ctx.shabal, vhashA );
dintrlv_16x32_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07,
@@ -375,12 +371,12 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhashA, 64 );
sha512_8way_close( &ctx.sha512, vhashA );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhashB, 64 );
sha512_8way_close( &ctx.sha512, vhashB );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhashA, 64 );
sha512_8x64_close( &ctx.sha512, vhashA );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhashB, 64 );
sha512_8x64_close( &ctx.sha512, vhashB );
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07, vhashA );
@@ -391,9 +387,9 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
haval256_5_16way_init( &ctx.haval );
haval256_5_16way_update( &ctx.haval, vhashA, 64 );
haval256_5_16way_close( &ctx.haval, state );
haval256_16x32_init( &ctx.haval );
haval256_16x32_update( &ctx.haval, vhashA, 64 );
haval256_16x32_close( &ctx.haval, state );
return 1;
}
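
The blake512 calls in this function pair with the scanhash below: blake512_8x64_prehash_le hashes the nonce-independent part of the 80-byte block header once per work unit and stores the result in the thread-local midstate, so blake512_8x64_final_le only has to process the final block for each batch of nonces. A minimal sketch of that split, assuming AVX-512 and the project's headers; the loop structure and variable names are illustrative, while the two blake512 signatures match the calls in this diff.

static __thread __m512i midstate[16] __attribute__((aligned(64)));
static __thread blake512_8x64_context blake_ctx __attribute__((aligned(64)));

static void scan_sketch( void *vdata, void *vhash, __m512i nonces )
{
    /* once per work unit: hash everything that does not depend on the nonce */
    blake512_8x64_prehash_le( &blake_ctx, midstate, vdata );
    /* per batch of 8 nonces: finish blake512 from the stored midstate */
    blake512_8x64_final_le( &blake_ctx, vhash, nonces, midstate );
    /* ... the remaining x17 stages then run on vhash as shown above ... */
}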
@@ -425,7 +421,7 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
blake512_8x64_prehash_le( &blake512_8x64_ctx, x17_16way_midstate, vdata );
nonceA = _mm512_add_epi32( casti_m512i( vdata, 9 ),
_mm512_set_epi64( 7, 6, 5, 4, 3, 2, 1, 0 ) );
@@ -456,11 +452,11 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
union _x17_8way_context_overlay
{
blake512_8way_context blake;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_2buf_context cube;
#if defined(__VAES__)
@@ -473,17 +469,17 @@ union _x17_8way_context_overlay
hashState_echo echo;
#endif
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64)));
int x17_8x64_hash( void *state, const void *input, int thr_id )
{
@@ -500,7 +496,7 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
uint64_t hash7[8] __attribute__ ((aligned (32)));
x17_8way_context_overlay ctx;
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
blake512_8x64_final_le( &blake512_8x64_ctx, vhash, casti_m512i( input, 9 ),
x17_8way_midstate );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
@@ -533,15 +529,15 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -611,9 +607,9 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -629,9 +625,9 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -648,15 +644,15 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, 512 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, state );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, state );
return 1;
}
@@ -690,7 +686,7 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
blake512_8x64_prehash_le( &blake512_8x64_ctx, x17_8way_midstate, vdata );
do
{
@@ -717,7 +713,7 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
union _x17_4way_context_overlay
{
blake512_4way_context blake;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
@@ -726,24 +722,24 @@ union _x17_4way_context_overlay
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
};
typedef union _x17_4way_context_overlay x17_4way_context_overlay;
static __thread __m256i x17_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
static __thread blake512_4x64_context blake512_4x64_ctx __attribute__((aligned(64)));
int x17_4x64_hash( void *state, const void *input, int thr_id )
{
@@ -756,11 +752,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
uint64_t hash3[8] __attribute__ ((aligned (32)));
x17_4way_context_overlay ctx;
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
blake512_4x64_final_le( &blake512_4x64_ctx, vhash, casti_m256i( input, 9 ),
x17_4way_midstate );
// blake512_4way_full( &ctx.blake, vhash, input, 80 );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
@@ -789,13 +783,13 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -836,9 +830,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -849,9 +843,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -862,15 +856,15 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashB, 64 );
haval256_4x32_close( &ctx.haval, state );
return 1;
}
@@ -903,7 +897,7 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x17_4way_midstate, vdata );
blake512_4x64_prehash_le( &blake512_4x64_ctx, x17_4way_midstate, vdata );
do
{

View File

@@ -6,10 +6,8 @@
#if defined(SIMD512)
#define X17_8WAY 1
// #define X17_16X32 1
#elif defined(__AVX2__) && defined(__AES__)
#define X17_4WAY 1
#define X17_8X32 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X17_2X64 1
#endif
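
These compile-time width macros select which of the implementations in this diff gets built: SIMD512 (AVX-512) enables the 8x64 path, the AVX2/AES branch the 4-lane variants, and plain SSE2 or NEON falls back to 2x64. Below is a hedged sketch of the kind of dispatch they drive; the macro-to-function pairing is assumed and the *_impl aliases are hypothetical, only x17_8x64_hash, scanhash_x17_8x64, x17_4x64_hash and scanhash_x17_4x64 are taken from this diff.

/* Illustrative dispatch only, not the project's actual gate wiring. */
#if defined(X17_8WAY)
  #define x17_hash_impl      x17_8x64_hash       /* AVX-512: 8 lanes of 64-bit words */
  #define x17_scanhash_impl  scanhash_x17_8x64
#elif defined(X17_4WAY)
  #define x17_hash_impl      x17_4x64_hash       /* AVX2: 4 lanes of 64-bit words */
  #define x17_scanhash_impl  scanhash_x17_4x64
#endif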

View File

@@ -31,20 +31,20 @@
union _xevan_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -73,10 +73,10 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
const int dataLen = 128;
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8x64_full( &ctx.blake, vhash, input, 80 );
memset( &vhash[8<<3], 0, 64<<3 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, dataLen );
#if defined(__VAES__)
@@ -106,15 +106,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_8x64_full( &ctx.skein, vhash, vhash, dataLen );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, dataLen );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, dataLen );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -185,9 +185,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -204,9 +204,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, dataLen );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -223,23 +223,23 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, dataLen );
sha512_8x64_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, vhashA );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, dataLen );
haval256_8x32_close( &ctx.haval, vhashA );
rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 );
memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );
blake512_8way_full( &ctx.blake, vhash, vhash, dataLen );
blake512_8x64_full( &ctx.blake, vhash, vhash, dataLen );
bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, dataLen );
#if defined(__VAES__)
@@ -269,15 +269,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_8x64_full( &ctx.skein, vhash, vhash, dataLen );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, dataLen );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, dataLen );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -348,9 +348,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -367,9 +367,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, dataLen );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -386,15 +386,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, dataLen );
sha512_8x64_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, output );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, dataLen );
haval256_8x32_close( &ctx.haval, output );
return 1;
}
@@ -403,28 +403,28 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
union _xevan_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo_2way_context echo;
#else
hashState_groestl groestl;
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
};
typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;
@@ -440,12 +440,12 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
const int dataLen = 128;
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4x64_full( &ctx.blake, vhash, input, 80 );
memset( &vhash[8<<2], 0, 64<<2 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, dataLen );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -469,15 +469,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_4x64_full( &ctx.skein, vhash, vhash, dataLen );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, dataLen );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, dataLen );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -518,9 +518,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -532,9 +532,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
// Parallel 4way 32 bit
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, dataLen );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -546,27 +546,27 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, dataLen );
sha512_4x64_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, vhashA );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashA, dataLen );
haval256_4x32_close( &ctx.haval, vhashA );
rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 );
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
blake512_4way_init( &ctx.blake );
blake512_4way_update( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
blake512_4x64_init( &ctx.blake );
blake512_4x64_update( &ctx.blake, vhash, dataLen );
blake512_4x64_close(&ctx.blake, vhash);
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, dataLen );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -590,15 +590,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_4x64_full( &ctx.skein, vhash, vhash, dataLen );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, dataLen );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, dataLen );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -639,9 +639,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -652,9 +652,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, dataLen );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -665,15 +665,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, dataLen );
sha512_4x64_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, output );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashA, dataLen );
haval256_4x32_close( &ctx.haval, output );
return 1;
}

View File

@@ -32,24 +32,24 @@
union _x22i_8way_ctx_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if !defined(X22I_8WAY_SHA)
sha256_8way_context sha256;
sha256_8x32_context sha256;
#endif
#if defined(__VAES__)
groestl512_4way_context groestl;
@@ -88,9 +88,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
unsigned char hashA7[64] __attribute__((aligned(32))) = {0};
x22i_8way_ctx_overlay ctx;
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8x64_full( &ctx.blake, vhash, input, 80 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -120,15 +120,15 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
if ( work_restart[thrid].restart ) return 0;
@@ -219,9 +219,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -238,9 +238,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8],
&hash4[8], &hash5[8], &hash6[8], &hash7[8], vhash );
@@ -273,9 +273,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
intrlv_8x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16],
&hash4[16], &hash5[16], &hash6[16], &hash7[16] );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24],
&hash4[24], &hash5[24], &hash6[24], &hash7[24], vhash );
@@ -294,9 +294,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
memset( vhash, 0, 64*8 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -400,9 +400,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhash, 64 );
sha256_8way_close( &ctx.sha256, output );
sha256_8x32_init( &ctx.sha256 );
sha256_8x32_update( &ctx.sha256, vhash, 64 );
sha256_8x32_close( &ctx.sha256, output );
#endif
@@ -427,8 +427,6 @@ int scanhash_x22i_8way_sha( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
@@ -472,8 +470,6 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
@@ -506,8 +502,8 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
union _x22i_4way_ctx_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo_2way_context echo;
@@ -516,22 +512,22 @@ union _x22i_4way_ctx_overlay
hashState_echo echo;
#endif
shavite512_2way_context shavite;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if !defined(X22I_4WAY_SHA)
sha256_4way_context sha256;
sha256_4x32_context sha256;
#endif
};
typedef union _x22i_4way_ctx_overlay x22i_ctx_overlay;
@@ -551,11 +547,11 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
unsigned char hashA3[64] __attribute__((aligned(32))) = {0};
x22i_ctx_overlay ctx;
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4x64_full( &ctx.blake, vhash, input, 80 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -579,15 +575,15 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
if ( work_restart[thrid].restart ) return false;
@@ -632,9 +628,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return false;
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
@@ -644,9 +640,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8], vhash );
sph_whirlpool_init( &ctx.whirlpool );
@@ -664,9 +660,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
intrlv_4x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash );
if ( work_restart[thrid].restart ) return false;
@@ -680,9 +676,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
memset( vhash, 0, 64*4 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashA, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashA, 64 );
haval256_4x32_close( &ctx.haval, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
memset( hashA0, 0, 64 );
@@ -743,9 +739,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
sha256_4way_init( &ctx.sha256 );
sha256_4way_update( &ctx.sha256, vhash, 64 );
sha256_4way_close( &ctx.sha256, output );
sha256_4x32_init( &ctx.sha256 );
sha256_4x32_update( &ctx.sha256, vhash, 64 );
sha256_4x32_close( &ctx.sha256, output );
#endif
@@ -770,8 +766,6 @@ int scanhash_x22i_4way_sha( struct work* work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -814,8 +808,6 @@ int scanhash_x22i_4way( struct work* work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
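The renames in the hunks above follow a lane-count x word-width convention: a _4x64 or _8x64 suffix means four or eight interleaved 64-bit lanes, while _4x32/_8x32 means 32-bit lanes (shabal, haval, sha256, panama, blake2s). A minimal usage sketch in C, using only helpers that already appear in this diff; the declarations (sha512_4x64_context, intrlv_4x64_512, and friends) are assumed to come from the project's own headers, so this is an illustrative fragment rather than a standalone program:

/* Hash four independent 512-bit states with one 4x64 SHA-512 pass. */
static void sha512_4lane_sketch( uint64_t h0[8], uint64_t h1[8],
                                 uint64_t h2[8], uint64_t h3[8] )
{
   uint64_t vhash[8*4] __attribute__((aligned(64)));   /* interleaved buffer */
   sha512_4x64_context ctx;

   intrlv_4x64_512( vhash, h0, h1, h2, h3 );           /* pack 4 scalar lanes */
   sha512_4x64_init( &ctx );
   sha512_4x64_update( &ctx, vhash, 64 );
   sha512_4x64_close( &ctx, vhash );
   dintrlv_4x64_512( h0, h1, h2, h3, vhash );          /* unpack back */
}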

View File

@@ -33,6 +33,7 @@ bool register_x22i_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT
| AVX512_OPT | VAES_OPT | NEON_OPT;
InitializeSWIFFTX();
return true;
};

View File

@@ -63,29 +63,29 @@ void x25x_shuffle( void *hash )
union _x25x_8way_ctx_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(X25X_8WAY_SHA)
sha256_context sha256;
#else
sha256_8way_context sha256;
sha256_8x32_context sha256;
#endif
panama_8way_context panama;
blake2s_8way_state blake2s;
panama_8x32_context panama;
blake2s_8x32_state blake2s;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -99,7 +99,7 @@ union _x25x_8way_ctx_overlay
typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay;
static __thread __m512i x25x_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64)));
int x25x_8way_hash( void *output, const void *input, int thrid )
{
@@ -117,15 +117,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
blake512_8x64_final_le( &blake512_8x64_ctx, vhash, casti_m512i( input, 9 ),
x25x_8way_midstate );
dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0],
hash4[0], hash5[0], hash6[0], hash7[0], vhash );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
bmw512_8x64_init( &ctx.bmw );
bmw512_8x64_update( &ctx.bmw, vhash, 64 );
bmw512_8x64_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0[1], hash1[1], hash2[1], hash3[1],
hash4[1], hash5[1], hash6[1], hash7[1], vhash );
@@ -175,21 +175,19 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
#endif
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
dintrlv_8x64_512( hash0[3], hash1[3], hash2[3], hash3[3],
hash4[3], hash5[3], hash6[3], hash7[3], vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0[4], hash1[4], hash2[4], hash3[4],
hash4[4], hash5[4], hash6[4], hash7[4], vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0[5], hash1[5], hash2[5], hash3[5],
hash4[5], hash5[5], hash6[5], hash7[5], vhash );
@@ -303,9 +301,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0[11], hash1[11], hash2[11], hash3[11],
hash4[11], hash5[11], hash6[11], hash7[11], vhash );
@@ -321,9 +319,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12],
hash4[12], hash5[12], hash6[12], hash7[12] );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0[13], hash1[13], hash2[13], hash3[13],
hash4[13], hash5[13], hash6[13], hash7[13], vhash );
@@ -354,9 +352,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14],
hash4[14], hash5[14], hash6[14], hash7[14] );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0[15], hash1[15], hash2[15], hash3[15],
hash4[15], hash5[15], hash6[15], hash7[15], vhash );
@@ -372,9 +370,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
hash4[16], hash5[16], hash6[16], hash7[16] );
memset( vhash, 0, 64*8 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0[17], hash1[17], hash2[17], hash3[17],
hash4[17], hash5[17], hash6[17], hash7[17], vhash );
@@ -462,17 +460,17 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhashA, hash0[20], hash1[20], hash2[20], hash3[20],
hash4[20], hash5[20], hash6[20], hash7[20] );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhashA, 64 );
sha256_8way_close( &ctx.sha256, vhash );
sha256_8x32_init( &ctx.sha256 );
sha256_8x32_update( &ctx.sha256, vhashA, 64 );
sha256_8x32_close( &ctx.sha256, vhash );
dintrlv_8x32_512( hash0[21], hash1[21], hash2[21], hash3[21],
hash4[21], hash5[21], hash6[21], hash7[21], vhash );
#endif
panama_8way_init( &ctx.panama );
panama_8way_update( &ctx.panama, vhash, 64 );
panama_8way_close( &ctx.panama, vhash );
panama_8x32_init( &ctx.panama );
panama_8x32_update( &ctx.panama, vhash, 64 );
panama_8x32_close( &ctx.panama, vhash );
dintrlv_8x32_512( hash0[22], hash1[22], hash2[22], hash3[22],
hash4[22], hash5[22], hash6[22], hash7[22], vhash );
@@ -545,8 +543,8 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23],
hash4[23], hash5[23], hash6[23], hash7[23] );
blake2s_8way_init( &ctx.blake2s, 32 );
blake2s_8way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
blake2s_8x32_init( &ctx.blake2s, 32 );
blake2s_8x32_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
return 1;
}
@@ -578,14 +576,13 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
edata[4] = v128_swap64_32( casti_v128( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi64(
7, 6, 5, 4, 3, 2, 1, 0 ) );
blake512_8x64_prehash_le( &blake512_8x64_ctx, x25x_8way_midstate, vdata );
do
{
if ( x25x_8way_hash( hash, vdata, thr_id ) );
if ( x25x_8way_hash( hash, vdata, thr_id ) )
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( ( hashd7[ lane ] <= targ32 ) && !bench ) )
{
@@ -608,8 +605,8 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
union _x25x_4way_ctx_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo_2way_context echo;
@@ -617,34 +614,34 @@ union _x25x_4way_ctx_overlay
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(X25X_4WAY_SHA)
sha256_context sha256;
#else
sha256_4way_context sha256;
sha256_4x32_context sha256;
#endif
panama_4way_context panama;
blake2s_4way_state blake2s;
panama_4x32_context panama;
blake2s_4x32_state blake2s;
};
typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;
static __thread __m256i x25x_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
static __thread blake512_4x64_context blake512_4x64_ctx __attribute__((aligned(64)));
int x25x_4way_hash( void *output, const void *input, int thrid )
{
@@ -658,14 +655,14 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
blake512_4x64_final_le( &blake512_4x64_ctx, vhash, casti_m256i( input, 9 ),
x25x_4way_midstate );
dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );
#if defined(__VAES__)
@@ -688,19 +685,19 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
#endif
intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
dintrlv_4x64_512( hash0[4], hash1[4], hash2[4], hash3[4], vhash );
if ( work_restart[thrid].restart ) return 0;
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -751,9 +748,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0[11], hash1[11], hash2[11], hash3[11], vhash );
fugue512_full( &ctx.fugue, hash0[12], hash0[11], 64 );
@@ -763,9 +760,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12] );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0[13], hash1[13], hash2[13], hash3[13], vhash );
sph_whirlpool_init(&ctx.whirlpool);
@@ -783,9 +780,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
intrlv_4x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14] );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64_512( hash0[15], hash1[15], hash2[15], hash3[15], vhash );
ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]);
@@ -797,9 +794,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
memset( vhash, 0, 64*4 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashX[0], 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashX[0], 64 );
haval256_4x32_close( &ctx.haval, vhash );
dintrlv_4x32_512( hash0[17], hash1[17], hash2[17], hash3[17], vhash );
sph_tiger_init(&ctx.tiger);
@@ -853,16 +850,16 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhashX[0], hash0[20], hash1[20], hash2[20], hash3[20] );
memset( vhash, 0, 64*4 );
sha256_4way_init( &ctx.sha256 );
sha256_4way_update( &ctx.sha256, vhashX[0], 64 );
sha256_4way_close( &ctx.sha256, vhash );
sha256_4x32_init( &ctx.sha256 );
sha256_4x32_update( &ctx.sha256, vhashX[0], 64 );
sha256_4x32_close( &ctx.sha256, vhash );
dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash );
#endif
panama_4way_init( &ctx.panama );
panama_4way_update( &ctx.panama, vhash, 64 );
panama_4way_close( &ctx.panama, vhash );
panama_4x32_init( &ctx.panama );
panama_4x32_update( &ctx.panama, vhash, 64 );
panama_4x32_close( &ctx.panama, vhash );
dintrlv_4x32_512( hash0[22], hash1[22], hash2[22], hash3[22], vhash );
laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]);
@@ -902,8 +899,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhashX[22], hash0[22], hash1[22], hash2[22], hash3[22] );
intrlv_4x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23] );
blake2s_4way_init( &ctx.blake2s, 32 );
blake2s_4way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
blake2s_4x32_init( &ctx.blake2s, 32 );
blake2s_4x32_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
return 1;
}
@@ -936,9 +933,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
edata[4] = v128_swap64_32( casti_v128( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
0, 3, 0, 2, 0, 1, 0, 0 ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi64x( 3, 2, 1, 0 ) );
blake512_4x64_prehash_le( &blake512_4x64_ctx, x25x_4way_midstate, vdata );
do
{
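The nonce-vector initialisation above changes from eight 32-bit elements to four 64-bit elements. The two forms are bitwise identical, since each 64-bit lane only carries a small offset in its low half; the 512-bit change to _mm512_set_epi64 earlier in this diff is analogous. A self-contained check (compile with -mavx or higher):

#include <immintrin.h>
#include <string.h>
#include <assert.h>

int main( void )
{
   /* old form: hi/lo 32-bit halves spelled out; new form: one 64-bit value per lane */
   __m256i a = _mm256_set_epi32( 0, 3, 0, 2, 0, 1, 0, 0 );
   __m256i b = _mm256_set_epi64x( 3, 2, 1, 0 );
   assert( memcmp( &a, &b, sizeof(a) ) == 0 );   /* same 256-bit pattern */
   return 0;
}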

View File

@@ -231,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
do
{
edata[19] = n;
if ( x25x_hash( hash64, edata, thr_id ) );
if ( x25x_hash( hash64, edata, thr_id ) )
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
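The one-character fix above removes a stray semicolon after the if condition; with the semicolon present the if has an empty body, so the validity check below it ran unconditionally even when hashing was aborted. A minimal self-contained illustration (do_hash and report are placeholders, not functions from this codebase):

#include <stdbool.h>
#include <stdio.h>

static bool do_hash( void ) { return false; }   /* pretend hashing was aborted */
static void report( void )  { puts( "submitting share" ); }

int main( void )
{
   if ( do_hash() );    /* bug: ';' is an empty statement, the test result is dropped */
       report();        /* runs anyway */

   if ( do_hash() )     /* fixed: report() is conditional again */
       report();
   return 0;
}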

View File

@@ -1,692 +0,0 @@
/*-
* Copyright 2009 Colin Percival
* Copyright 2013-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*
* This is a proof-of-work focused fork of yescrypt, including reference and
* cut-down implementation of the obsolete yescrypt 0.5 (based off its first
* submission to PHC back in 2014) and a new proof-of-work specific variation
* known as yespower 1.0. The former is intended as an upgrade for
* cryptocurrencies that already use yescrypt 0.5 and the latter may be used
* as a further upgrade (hard fork) by those and other cryptocurrencies. The
* version of algorithm to use is requested through parameters, allowing for
* both algorithms to co-exist in client and miner implementations (such as in
* preparation for a hard-fork).
*
* This is the reference implementation. Its purpose is to provide a simple
* human- and machine-readable specification that implementations intended
* for actual use should be tested against. It is deliberately mostly not
* optimized, and it is not meant to be used in production. Instead, use
* yespower-opt.c.
*/
/*
#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose."
*/
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "algo/sha/hmac-sha256-hash-4way.h"
//#include "sysendian.h"
#include "yespower.h"
#if defined(__AVX2__)
static void blkcpy_8way( __m256i *dst, const __m256i *src, size_t count )
{
do {
*dst++ = *src++;
} while (--count);
}
static void blkxor_8way( __m256i *dst, const __m256i *src, size_t count )
{
do {
*dst++ ^= *src++;
} while (--count);
}
/**
* salsa20(B):
* Apply the Salsa20 core to the provided block.
*/
static void salsa20_8way( __m256i B[16], uint32_t rounds )
{
__m256i x[16];
size_t i;
/* SIMD unshuffle */
for ( i = 0; i < 16; i++ )
x[i * 5 % 16] = B[i];
for ( i = 0; i < rounds; i += 2 )
{
#define R( a, b, c ) mm256_rol_32( _mm256_add_epi32( a, b ), c )
/* Operate on columns */
x[ 4] = _mm256_xor_si256( x[ 4], R( x[ 0], x[12], 7 ) );
x[ 8] = _mm256_xor_si256( x[ 8], R( x[ 4], x[ 0], 9 ) );
x[12] = _mm256_xor_si256( x[12], R( x[ 8], x[ 4], 13 ) );
x[ 0] = _mm256_xor_si256( x[ 0], R( x[12], x[ 8], 18 ) );
x[ 9] = _mm256_xor_si256( x[ 9], R( x[ 5], x[ 1], 7 ) );
x[13] = _mm256_xor_si256( x[13], R( x[ 9], x[ 5], 9 ) );
x[ 1] = _mm256_xor_si256( x[ 1], R( x[13], x[ 9], 13 ) );
x[ 5] = _mm256_xor_si256( x[ 5], R( x[ 1], x[13], 18 ) );
x[14] = _mm256_xor_si256( x[14], R( x[10], x[ 6], 7 ) );
x[ 2] = _mm256_xor_si256( x[ 2], R( x[14], x[10], 9 ) );
x[ 6] = _mm256_xor_si256( x[ 6], R( x[ 2], x[14], 13 ) );
x[10] = _mm256_xor_si256( x[10], R( x[ 6], x[ 2], 18 ) );
x[ 3] = _mm256_xor_si256( x[ 3], R( x[15], x[11], 7 ) );
x[ 7] = _mm256_xor_si256( x[ 7], R( x[ 3], x[15], 9 ) );
x[11] = _mm256_xor_si256( x[11], R( x[ 7], x[ 3], 13 ) );
x[15] = _mm256_xor_si256( x[15], R( x[11], x[ 7], 18 ) );
/* Operate on rows */
x[ 1] = _mm256_xor_si256( x[ 1], R( x[ 0], x[ 3], 7 ) );
x[ 2] = _mm256_xor_si256( x[ 2], R( x[ 1], x[ 0], 9 ) );
x[ 3] = _mm256_xor_si256( x[ 3], R( x[ 2], x[ 1], 13 ) );
x[ 0] = _mm256_xor_si256( x[ 0], R( x[ 3], x[ 2], 18 ) );
x[ 6] = _mm256_xor_si256( x[ 6], R( x[ 5], x[ 4], 7 ) );
x[ 7] = _mm256_xor_si256( x[ 7], R( x[ 6], x[ 5], 9 ) );
x[ 4] = _mm256_xor_si256( x[ 4], R( x[ 7], x[ 6], 13 ) );
x[ 5] = _mm256_xor_si256( x[ 5], R( x[ 4], x[ 7], 18 ) );
x[11] = _mm256_xor_si256( x[11], R( x[10], x[ 9], 7 ) );
x[ 8] = _mm256_xor_si256( x[ 8], R( x[11], x[10], 9 ) );
x[ 9] = _mm256_xor_si256( x[ 9], R( x[ 8], x[11], 13 ) );
x[10] = _mm256_xor_si256( x[10], R( x[ 9], x[ 8], 18 ) );
x[12] = _mm256_xor_si256( x[12], R( x[15], x[14], 7 ) );
x[13] = _mm256_xor_si256( x[13], R( x[12], x[15], 9 ) );
x[14] = _mm256_xor_si256( x[14], R( x[13], x[12], 13 ) );
x[15] = _mm256_xor_si256( x[15], R( x[14], x[13], 18 ) );
#undef R
}
/* SIMD shuffle */
for (i = 0; i < 16; i++)
B[i] = _mm256_add_epi32( B[i], x[i * 5 % 16] );
}
/**
* blockmix_salsa(B):
* Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in
* length.
*/
static void blockmix_salsa_8way( __m256i *B, uint32_t rounds )
{
__m256i X[16];
size_t i;
/* 1: X <-- B_{2r - 1} */
blkcpy_8way( X, &B[16], 16 );
/* 2: for i = 0 to 2r - 1 do */
for ( i = 0; i < 2; i++ )
{
/* 3: X <-- H(X xor B_i) */
blkxor_8way( X, &B[i * 16], 16 );
salsa20_8way( X, rounds );
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
blkcpy_8way( &B[i * 16], X, 16 );
}
}
/*
* These are tunable, but they must meet certain constraints and are part of
* what defines a yespower version.
*/
#define PWXsimple 2
#define PWXgather 4
/* Version 0.5 */
#define PWXrounds_0_5 6
#define Swidth_0_5 8
/* Version 1.0 */
#define PWXrounds_1_0 3
#define Swidth_1_0 11
/* Derived values. Not tunable on their own. */
#define PWXbytes (PWXgather * PWXsimple * 8)
#define PWXwords (PWXbytes / sizeof(uint32_t))
#define rmin ((PWXbytes + 127) / 128)
/* Runtime derived values. Not tunable on their own. */
#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8)
#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8)
typedef struct {
__m256i (*S0)[2], (*S1)[2], (*S2)[2];
__m256i *S;
yespower_version_t version;
uint32_t salsa20_rounds;
uint32_t PWXrounds, Swidth, Sbytes, Smask;
size_t w;
} pwxform_8way_ctx_t __attribute__ ((aligned (128)));
/**
* pwxform(B):
* Transform the provided block using the provided S-boxes.
*/
static void pwxform_8way( __m256i *B, pwxform_8way_ctx_t *ctx )
{
__m256i (*X)[PWXsimple][2] = (__m256i (*)[PWXsimple][2])B;
__m256i (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2;
__m256i Smask = _mm256_set1_epi32( ctx->Smask );
size_t w = ctx->w;
size_t i, j, k;
/* 1: for i = 0 to PWXrounds - 1 do */
for ( i = 0; i < ctx->PWXrounds; i++ )
{
/* 2: for j = 0 to PWXgather - 1 do */
for ( j = 0; j < PWXgather; j++ )
{
// Are these pointers or data?
__m256i xl = X[j][0][0];
__m256i xh = X[j][0][1];
__m256i (*p0)[2], (*p1)[2];
// 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8)
// playing with pointers
/*
p0 = S0 + (xl & Smask) / sizeof(*S0);
// 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8)
p1 = S1 + (xh & Smask) / sizeof(*S1);
*/
/* 5: for k = 0 to PWXsimple - 1 do */
for ( k = 0; k < PWXsimple; k++ )
{
// shift from 32 bit data to 64 bit data
__m256i x0, x1, s00, s01, s10, s11;
__m128i *p0k = (__m128i*)p0[k];
__m128i *p1k = (__m128i*)p1[k];
s00 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p0k[0] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p0k[2] ), 32 ) );
s01 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p0k[1] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p0k[3] ), 32 ) );
s10 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p1k[0] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p1k[2] ), 32 ) );
s11 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p1k[1] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p1k[3] ), 32 ) );
__m128i *xx = (__m128i*)X[j][k];
x0 = _mm256_mul_epu32( _mm256_cvtepu32_epi64( xx[0] ),
_mm256_cvtepu32_epi64( xx[2] ) );
x1 = _mm256_mul_epu32( _mm256_cvtepu32_epi64( xx[1] ),
_mm256_cvtepu32_epi64( xx[3] ) );
x0 = _mm256_add_epi64( x0, s00 );
x1 = _mm256_add_epi64( x1, s01 );
x0 = _mm256_xor_si256( x0, s10 );
x1 = _mm256_xor_si256( x1, s11 );
X[j][k][0] = x0;
X[j][k][1] = x1;
}
if ( ctx->version != YESPOWER_0_5 &&
( i == 0 || j < PWXgather / 2 ) )
{
if ( j & 1 )
{
for ( k = 0; k < PWXsimple; k++ )
{
S1[w][0] = X[j][k][0];
S1[w][1] = X[j][k][1];
w++;
}
}
else
{
for ( k = 0; k < PWXsimple; k++ )
{
S0[w + k][0] = X[j][k][0];
S0[w + k][1] = X[j][k][1];
}
}
}
}
}
if ( ctx->version != YESPOWER_0_5 )
{
/* 14: (S0, S1, S2) <-- (S2, S0, S1) */
ctx->S0 = S2;
ctx->S1 = S0;
ctx->S2 = S1;
/* 15: w <-- w mod 2^Swidth */
ctx->w = w & ( ( 1 << ctx->Swidth ) * PWXsimple - 1 );
}
}
/**
* blockmix_pwxform(B, ctx, r):
* Compute B = BlockMix_pwxform{salsa20, ctx, r}(B). The input B must be
* 128r bytes in length.
*/
static void blockmix_pwxform_8way( uint32_t *B, pwxform_8way_ctx_t *ctx,
size_t r )
{
__m256i X[PWXwords];
size_t r1, i;
/* Convert 128-byte blocks to PWXbytes blocks */
/* 1: r_1 <-- 128r / PWXbytes */
r1 = 128 * r / PWXbytes;
/* 2: X <-- B'_{r_1 - 1} */
blkcpy_8way( X, &B[ (r1 - 1) * PWXwords ], PWXwords );
/* 3: for i = 0 to r_1 - 1 do */
for ( i = 0; i < r1; i++ )
{
/* 4: if r_1 > 1 */
if ( r1 > 1 )
{
/* 5: X <-- X xor B'_i */
blkxor_8way( X, &B[ i * PWXwords ], PWXwords );
}
/* 7: X <-- pwxform(X) */
pwxform_8way( X, ctx );
/* 8: B'_i <-- X */
blkcpy_8way( &B[ i * PWXwords ], X, PWXwords );
}
/* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */
i = ( r1 - 1 ) * PWXbytes / 64;
/* 11: B_i <-- H(B_i) */
salsa20_8way( &B[i * 16], ctx->salsa20_rounds );
#if 1 /* No-op with our current pwxform settings, but do it to make sure */
/* 12: for i = i + 1 to 2r - 1 do */
for ( i++; i < 2 * r; i++ )
{
/* 13: B_i <-- H(B_i xor B_{i-1}) */
blkxor_8way( &B[i * 16], &B[ (i - 1) * 16 ], 16 );
salsa20_8way( &B[i * 16], ctx->salsa20_rounds );
}
#endif
}
// This looks a lot like data dependent addressing
/**
* integerify(B, r):
* Return the result of parsing B_{2r-1} as a little-endian integer.
*/
static __m256i integerify8( const __m256i *B, size_t r )
{
/*
* Our 32-bit words are in host byte order. Also, they are SIMD-shuffled, but
* we only care about the least significant 32 bits anyway.
*/
const __m256i *X = &B[ (2 * r - 1) * 16 ];
return X[0];
}
/**
* p2floor(x):
* Largest power of 2 not greater than argument.
*/
static uint32_t p2floor8( uint32_t x )
{
uint32_t y;
while ( ( y = x & (x - 1) ) )
x = y;
return x;
}
/**
* wrap(x, i):
* Wrap x to the range 0 to i-1.
*/
static uint32_t wrap8( uint32_t x, uint32_t i )
{
uint32_t n = p2floor( i );
return ( x & (n - 1) ) + (i - n);
}
/**
* smix1(B, r, N, V, X, ctx):
* Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage X must be 128r bytes in length.
*/
static void smix1_8way( __m256i *B, size_t r, uint32_t N,
__m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx )
{
size_t s = 32 * r;
uint32_t i, j;
size_t k;
/* 1: X <-- B */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
X[ k * 16 + i ] = B[ k * 16 + ( i * 5 % 16 ) ];
if ( ctx->version != YESPOWER_0_5 )
{
for ( k = 1; k < r; k++ )
{
blkcpy_8way( &X[k * 32], &X[ (k - 1) * 32 ], 32 );
blockmix_pwxform_8way( &X[k * 32], ctx, 1 );
}
}
/* 2: for i = 0 to N - 1 do */
for ( i = 0; i < N; i++ )
{
/* 3: V_i <-- X */
blkcpy_8way( &V[i * s], X, s );
if ( i > 1 )
{
// is j int or vector? Integrify has data dependent addressing?
/* j <-- Wrap(Integerify(X), i) */
// j = wrap8( integerify8( X, r ), i );
/* X <-- X xor V_j */
blkxor_8way( X, &V[j * s], s );
}
/* 4: X <-- H(X) */
if ( V != ctx->S )
blockmix_pwxform_8way( X, ctx, r );
else
blockmix_salsa_8way( X, ctx->salsa20_rounds );
}
/* B' <-- X */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
B[ k * 16 + ( i * 5 % 16 ) ] = X[ k * 16 + i ];
}
/**
* smix2(B, r, N, Nloop, V, X, ctx):
* Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage X must be 128r bytes in length. The value N must be a power of 2
* greater than 1.
*/
static void smix2_8way( __m256i *B, size_t r, uint32_t N, uint32_t Nloop,
__m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx )
{
size_t s = 32 * r;
uint32_t i, j;
size_t k;
/* X <-- B */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
X[ k * 16 + i ] = B[ k * 16 + ( i * 5 % 16 ) ];
/* 6: for i = 0 to N - 1 do */
for ( i = 0; i < Nloop; i++ )
{
/* 7: j <-- Integerify(X) mod N */
// j = integerify8(X, r) & (N - 1);
/* 8.1: X <-- X xor V_j */
blkxor_8way( X, &V[j * s], s );
/* V_j <-- X */
if ( Nloop != 2 )
blkcpy_8way( &V[j * s], X, s );
/* 8.2: X <-- H(X) */
blockmix_pwxform_8way( X, ctx, r );
}
/* 10: B' <-- X */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
B[ k * 16 + ( i * 5 % 16 ) ] = X[ k * 16 + i ];
}
/**
* smix(B, r, N, p, t, V, X, ctx):
* Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the
* temporary storage V must be 128rN bytes in length; the temporary storage
* X must be 128r bytes in length. The value N must be a power of 2 and at
* least 16.
*/
static void smix_8way( __m256i *B, size_t r, uint32_t N,
__m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx)
{
uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */
uint32_t Nloop_rw = Nloop_all;
Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */
if ( ctx->version == YESPOWER_0_5 )
Nloop_rw &= ~(uint32_t)1; /* round down to even */
else
Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */
smix1_8way( B, 1, ctx->Sbytes / 128, ctx->S, X, ctx );
smix1_8way( B, r, N, V, X, ctx );
smix2_8way( B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx );
smix2_8way( B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx );
}
/**
* yespower(local, src, srclen, params, dst):
* Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target".
*
* Return 0 on success; or -1 on error.
*/
int yespower_8way( yespower_local_t *local, const __m256i *src, size_t srclen,
const yespower_params_t *params, yespower_8way_binary_t *dst,
int thrid )
{
yespower_version_t version = params->version;
uint32_t N = params->N;
uint32_t r = params->r;
const uint8_t *pers = params->pers;
size_t perslen = params->perslen;
int retval = -1;
size_t B_size, V_size;
uint32_t *B, *V, *X, *S;
pwxform_8way_ctx_t ctx;
__m256i sha256[8];
/* Sanity-check parameters */
if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0 ) ||
N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
(N & (N - 1)) != 0 || r < rmin ||
(!pers && perslen) )
{
errno = EINVAL;
return -1;
}
/* Allocate memory */
B_size = (size_t)128 * r;
V_size = B_size * N;
if ((V = malloc(V_size)) == NULL)
return -1;
if ((B = malloc(B_size)) == NULL)
goto free_V;
if ((X = malloc(B_size)) == NULL)
goto free_B;
ctx.version = version;
if (version == YESPOWER_0_5) {
ctx.salsa20_rounds = 8;
ctx.PWXrounds = PWXrounds_0_5;
ctx.Swidth = Swidth_0_5;
ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth);
} else {
ctx.salsa20_rounds = 2;
ctx.PWXrounds = PWXrounds_1_0;
ctx.Swidth = Swidth_1_0;
ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth);
}
if ((S = malloc(ctx.Sbytes)) == NULL)
goto free_X;
ctx.S = S;
ctx.S0 = (__m256i (*)[2])S;
ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple;
ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple;
ctx.Smask = Swidth_to_Smask(ctx.Swidth);
ctx.w = 0;
// do prehash
sha256_8way_full( sha256, src, srclen );
// need flexible size, use malloc;
__m256i vpers[128];
if ( version != YESPOWER_0_5 && perslen )
for ( int i = 0; i < perslen/4 + 1; i++ )
vpers[i] = _mm256_set1_epi32( pers[i] );
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
pbkdf2_sha256_8way( B, B_size, sha256, sizeof(sha256), vpers, perslen, 1 );
blkcpy_8way( sha256, B, sizeof(sha256) / sizeof(sha256[0] ) );
/* 3: B_i <-- MF(B_i, N) */
smix_8way( B, r, N, V, X, &ctx );
if ( version == YESPOWER_0_5 )
{
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
pbkdf2_sha256_8way( dst, sizeof(*dst), sha256, sizeof(sha256),
B, B_size, 1 );
if ( pers )
{
hmac_sha256_8way_full( dst, sizeof(*dst), vpers, perslen, sha256 );
sha256_8way_full( dst, sha256, sizeof(sha256) );
}
}
else
hmac_sha256_8way_full( dst, B + B_size - 64, 64, sha256, sizeof(sha256) );
/* Success! */
retval = 1;
/* Free memory */
free(S);
free_X:
free(X);
free_B:
free(B);
free_V:
free(V);
return retval;
}
int yespower_8way_tls( const __m256i *src, size_t srclen,
const yespower_params_t *params, yespower_8way_binary_t *dst, int trhid )
{
/* The reference implementation doesn't use thread-local storage */
return yespower_8way( NULL, src, srclen, params, dst, trhid );
}
int yespower_init_local8( yespower_local_t *local )
{
/* The reference implementation doesn't use the local structure */
local->base = local->aligned = NULL;
local->base_size = local->aligned_size = 0;
return 0;
}
int yespower_free_local8( yespower_local_t *local )
{
/* The reference implementation frees its memory in yespower() */
(void)local; /* unused */
return 0;
}
int yespower_8way_hash( const char *input, char *output, uint32_t len,
int thrid )
{
return yespower_8way_tls( input, len, &yespower_params,
(yespower_binary_t*)output, thrid );
}
int scanhash_yespower_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8*8];
uint32_t _ALIGN(128) vdata[20*8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
endiandata[19] = n;
// do sha256 prehash
SHA256_Init( &sha256_prehash_ctx );
SHA256_Update( &sha256_prehash_ctx, endiandata, 64 );
do {
if ( yespower_hash( vdata, hash, 80, thr_id ) )
if unlikely( valid_hash( hash, ptarget ) && !opt_benchmark )
{
be32enc( pdata+19, n );
submit_solution( work, hash, mythr );
}
endiandata[19] = ++n;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}
#endif // AVX2
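The file deleted above appears to have been an unfinished AVX2 8-way experiment on the yespower reference code (note the commented-out integerify calls and the open questions left in its comments). Two of its small helpers are easier to read in scalar form; the bodies below match the deleted p2floor8/wrap8 exactly, just without the vector wrappers:

#include <stdint.h>

/* Largest power of two not greater than x (scalar form of p2floor8 above). */
static uint32_t p2floor( uint32_t x )
{
   uint32_t y;
   while ( ( y = x & ( x - 1 ) ) )   /* clear the lowest set bit until one remains */
      x = y;
   return x;
}

/* Wrap x into the range [0, i) as yespower does (scalar form of wrap8 above). */
static uint32_t wrap( uint32_t x, uint32_t i )
{
   uint32_t n = p2floor( i );
   return ( x & ( n - 1 ) ) + ( i - n );
}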

View File

@@ -1,7 +1,7 @@
#!/bin/bash
#
# This script is not intended for users, it is only used for compile testing
# during develpment. However the information contained may provide compilation
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null

View File

@@ -10,9 +10,9 @@ rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=icelake-client -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=icelake-client -Wall" ./configure --with-curl
# Rocketlake needs gcc-11
#CFLAGS="-O3 -march=rocketlake -Wall" ./configure --with-curl
CFLAGS="-O3 -march=rocketlake -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
@@ -21,7 +21,7 @@ mv cpuminer cpuminer-avx512-sha-vaes
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
#make -j 8
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
@@ -30,35 +30,43 @@ mv cpuminer cpuminer-avx512-sha-vaes
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
#make -j 8
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Apparently Graniterapids will not include AVX10, SHA512 or APX,
# Graniterapids does not build with AVX10, SHA512 or APX.
# wait for Diamondrapids & gcc-15.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
#make -j 8
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-graniterapids
# Force AVX10-256
# SHA512 AVX10.1
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -mavx10.1-256 -Wall" ./configure --with-curl
#make -j 8
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-256
#mv cpuminer cpuminer-avx10_1
# Force SHA512 AVX10-512
# SHA512 AVX10.2
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1-512 -Wall" ./configure --with-curl
#make -j 8
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.2 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-512
#mv cpuminer cpuminer-avx10_2
# Diamondrapids: AVX10.2, SHA512, APX; needs GCC-15 & CPU with APX to compile.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=diamondrapids -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-diamondrapids
# Zen5: AVX512 SHA VAES, requires gcc-14.
#make clean || echo clean
@@ -71,11 +79,10 @@ mv cpuminer cpuminer-avx512-sha-vaes
# Zen4: AVX512 SHA VAES
make clean || echo clean
rm -f config.status
# znver3 needs gcc-11, znver4 needs gcc-12.3.
# Zen4: AVX512, SHA, VAES, needs gcc-12.3.
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
# Incomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer.
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen4
@@ -83,7 +90,6 @@ mv cpuminer cpuminer-zen4
# Zen3 AVX2 SHA VAES
make clean || echo clean
rm -f config.status
#CFLAGS="-O3 -march=znver2 -mvaes" ./configure --with-curl
CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
@@ -159,7 +165,7 @@ mv cpuminer cpuminer-ssse3
# SSE2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl
CFLAGS="-O3 -march=x86-64 -msse2 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse2
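The SSE2 target above now pins the architecture with -march=x86-64 instead of relying on the compiler's default plus -msse2. A quick compile-time way to confirm what a flag set actually enables is to test the compiler's feature macros (gcc/clang style); this check is only an illustration and is not part of the build scripts:

/* -march=x86-64 guarantees SSE2 as the baseline but nothing newer. */
#if !defined(__SSE2__)
#error "x86-64 baseline build should always have SSE2"
#endif
#if defined(__AVX__)
#error "AVX unexpectedly enabled in the SSE2 baseline build"
#endif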

View File

@@ -1,27 +1,9 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
#!/bin/sh
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
make distclean || echo clean
rm -f config.status

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
#
# make clean and rm all the targeted executables.

View File

@@ -3,7 +3,7 @@
#ifdef WIN32
#if _WIN32_WINNT==0x0601 // Windows 7
#if _WIN32_WINNT>=0x0601 // Windows 7
#define WINDOWS_CPU_GROUPS_ENABLED 1
#endif
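The comparison above changes from == to >=, so the processor-group code path is enabled on Windows 7 and everything newer rather than only when targeting exactly Windows 7. A hedged sketch of a consumer of that guard; GetActiveProcessorGroupCount is a standard Win32 call available from Windows 7 on, and its use here is illustrative rather than taken from this codebase:

#include <windows.h>

#if defined(WINDOWS_CPU_GROUPS_ENABLED)
/* Processor groups exist from Windows 7 (_WIN32_WINNT 0x0601) onward. */
static WORD count_cpu_groups( void )
{
   return GetActiveProcessorGroupCount();
}
#endif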

configure (vendored)
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.1.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.4.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.1'
PACKAGE_STRING='cpuminer-opt 25.1'
PACKAGE_VERSION='25.4'
PACKAGE_STRING='cpuminer-opt 25.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -657,10 +657,6 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
X86_64_APPLE_FALSE
X86_64_APPLE_TRUE
ARM64_APPLE_FALSE
ARM64_APPLE_TRUE
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
@@ -669,8 +665,6 @@ ARCH_ARM64_FALSE
ARCH_ARM64_TRUE
ARCH_x86_64_FALSE
ARCH_x86_64_TRUE
ARCH_x86_FALSE
ARCH_x86_TRUE
USE_ASM_FALSE
USE_ASM_TRUE
HAVE_WINDOWS_FALSE
@@ -802,7 +796,6 @@ enable_maintainer_mode
enable_dependency_tracking
enable_assembly
with_curl
with_crypto
'
ac_precious_vars='build_alias
host_alias
@@ -1366,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1438,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.4:";;
esac
cat <<\_ACEOF
@@ -1461,7 +1454,6 @@ Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
--with-curl=PATH prefix where curl is installed default=/usr
--with-crypto=PATH prefix where openssl crypto is installed default=/usr
Some influential environment variables:
CC C compiler command
@@ -1544,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.1
cpuminer-opt configure 25.4
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1991,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.1, which was
It was created by cpuminer-opt $as_me 25.4, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3599,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.1'
VERSION='25.4'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6509,22 +6501,11 @@ fi
case $target in
i*86-*-*)
have_x86=true
;;
aarch64-apple-*|arm64-apple-*)
have_arm64=true
have_arm64_apple=true
;;
x86_64-apple-*|amd64-apple-*)
have_x86_64=true
have_x86_64_apple=true
;;
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm4=true
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
@@ -6557,126 +6538,7 @@ printf "%s\n" "#define USE_ASM 1" >>confdefs.h
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5
printf %s "checking whether we can compile AVX code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vmovdqa %ymm0, %ymm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5
printf %s "checking whether we can compile XOP code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vprotd \$7, %xmm0, %xmm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_XOP 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5
printf %s "checking whether we can compile AVX2 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %ymm0, %ymm1, %ymm2");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX2 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
printf %s "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX512 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5
printf %s "checking for json_loads in -ljansson... " >&6; }
if test ${ac_cv_lib_jansson_json_loads+y}
@@ -6885,9 +6747,6 @@ fi
fi
#LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5
printf %s "checking whether __uint128_t is supported... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@@ -6941,14 +6800,6 @@ else
USE_ASM_FALSE=
fi
if test x$have_x86 = xtrue; then
ARCH_x86_TRUE=
ARCH_x86_FALSE='#'
else
ARCH_x86_TRUE='#'
ARCH_x86_FALSE=
fi
if test x$have_x86_64 = xtrue; then
ARCH_x86_64_TRUE=
ARCH_x86_64_FALSE='#'
@@ -6981,28 +6832,6 @@ else
HAVE_APPLE_FALSE=
fi
if test x$have_arm64_apple = xtrue; then
ARM64_APPLE_TRUE=
ARM64_APPLE_FALSE='#'
else
ARM64_APPLE_TRUE='#'
ARM64_APPLE_FALSE=
fi
if test x$have_x86_64_apple = xtrue; then
X86_64_APPLE_TRUE=
X86_64_APPLE_FALSE='#'
else
X86_64_APPLE_TRUE='#'
X86_64_APPLE_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
# libcurl install path (for mingw : --with-curl=/usr/local)
@@ -7020,30 +6849,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
# Check whether --with-crypto was given.
if test ${with_crypto+y}
then :
withval=$with_crypto;
fi
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])
@@ -7220,10 +7029,6 @@ if test -z "${USE_ASM_TRUE}" && test -z "${USE_ASM_FALSE}"; then
as_fn_error $? "conditional \"USE_ASM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86_64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -7240,14 +7045,6 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7638,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.1, which was
This file was extended by cpuminer-opt $as_me 25.4, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7706,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.1
cpuminer-opt config.status 25.4
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [25.1])
AC_INIT([cpuminer-opt], [25.4])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -42,22 +42,11 @@ AC_FUNC_ALLOCA
AC_CHECK_FUNCS([getopt_long])
case $target in
i*86-*-*)
have_x86=true
;;
aarch64-apple-*|arm64-apple-*)
have_arm64=true
have_arm64_apple=true
;;
x86_64-apple-*|amd64-apple-*)
have_x86_64=true
have_x86_64_apple=true
;;
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm4=true
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
@@ -84,42 +73,7 @@ if test x$enable_assembly != xno; then
AC_DEFINE([USE_ASM], [1], [Define to 1 if assembly routines are wanted.])
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
AC_MSG_CHECKING(whether we can compile AVX code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile XOP code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])],
AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the XOP instruction set.])
)
AC_MSG_CHECKING(whether we can compile AVX2 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX instruction set.])
)
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
@@ -128,9 +82,6 @@ AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
))))
#LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
AC_MSG_CHECKING(whether __uint128_t is supported)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
AC_DEFINE(USE_INT128, 1, [Define if __uint128_t is available])
@@ -143,19 +94,10 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
AM_CONDITIONAL([ARCH_ARM64], [test x$have_arm64 = xtrue])
AM_CONDITIONAL([MINGW], [test "x$OS" = "xWindows_NT"])
AM_CONDITIONAL([HAVE_APPLE], [test x$have_apple = xtrue])
AM_CONDITIONAL([ARM64_APPLE], [test x$have_arm64_apple = xtrue])
AM_CONDITIONAL([X86_64_APPLE], [test x$have_x86_64_apple = xtrue])
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
# libcurl install path (for mingw : --with-curl=/usr/local)
AC_ARG_WITH([curl],
@@ -168,25 +110,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
AC_ARG_WITH([crypto],
[ --with-crypto=PATH prefix where openssl crypto is installed [default=/usr]])
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])


@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.1.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.4.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.1'
PACKAGE_STRING='cpuminer-opt 25.1'
PACKAGE_VERSION='25.4'
PACKAGE_STRING='cpuminer-opt 25.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -650,10 +650,6 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
X86_64_APPLE_FALSE
X86_64_APPLE_TRUE
ARM64_APPLE_FALSE
ARM64_APPLE_TRUE
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
@@ -662,8 +658,6 @@ ARCH_ARM64_FALSE
ARCH_ARM64_TRUE
ARCH_x86_64_FALSE
ARCH_x86_64_TRUE
ARCH_x86_FALSE
ARCH_x86_TRUE
USE_ASM_FALSE
USE_ASM_TRUE
HAVE_WINDOWS_FALSE
@@ -796,7 +790,6 @@ enable_maintainer_mode
enable_dependency_tracking
enable_assembly
with_curl
with_crypto
'
ac_precious_vars='build_alias
host_alias
@@ -1359,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
'configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
'configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1431,7 +1424,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.4:";;
esac
cat <<\_ACEOF
@@ -1454,7 +1447,6 @@ Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
--with-curl=PATH prefix where curl is installed default=/usr
--with-crypto=PATH prefix where openssl crypto is installed default=/usr
Some influential environment variables:
CC C compiler command
@@ -1536,7 +1528,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.1
cpuminer-opt configure 25.4
generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1957,7 +1949,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.1, which was
It was created by cpuminer-opt $as_me 25.4, which was
generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3772,7 +3764,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.1'
VERSION='25.4'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6464,22 +6456,11 @@ fi
case $target in
i*86-*-*)
have_x86=true
;;
aarch64-apple-*|arm64-apple-*)
have_arm64=true
have_arm64_apple=true
;;
x86_64-apple-*|amd64-apple-*)
have_x86_64=true
have_x86_64_apple=true
;;
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm4=true
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
@@ -6512,130 +6493,7 @@ printf "%s\n" "#define USE_ASM 1" >>confdefs.h
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5
printf %s "checking whether we can compile AVX code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vmovdqa %ymm0, %ymm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5
printf %s "checking whether we can compile XOP code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vprotd \$7, %xmm0, %xmm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_XOP 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else case e in #(
e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;}
;;
esac
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5
printf %s "checking whether we can compile AVX2 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %ymm0, %ymm1, %ymm2");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX2 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
printf %s "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX512 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else case e in #(
e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
;;
esac
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else case e in #(
e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;}
;;
esac
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else case e in #(
e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;}
;;
esac
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5
printf %s "checking for json_loads in -ljansson... " >&6; }
if test ${ac_cv_lib_jansson_json_loads+y}
@@ -6888,9 +6746,6 @@ esac
fi
#LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5
printf %s "checking whether __uint128_t is supported... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@@ -6945,14 +6800,6 @@ else
USE_ASM_FALSE=
fi
if test x$have_x86 = xtrue; then
ARCH_x86_TRUE=
ARCH_x86_FALSE='#'
else
ARCH_x86_TRUE='#'
ARCH_x86_FALSE=
fi
if test x$have_x86_64 = xtrue; then
ARCH_x86_64_TRUE=
ARCH_x86_64_FALSE='#'
@@ -6985,28 +6832,6 @@ else
HAVE_APPLE_FALSE=
fi
if test x$have_arm64_apple = xtrue; then
ARM64_APPLE_TRUE=
ARM64_APPLE_FALSE='#'
else
ARM64_APPLE_TRUE='#'
ARM64_APPLE_FALSE=
fi
if test x$have_x86_64_apple = xtrue; then
X86_64_APPLE_TRUE=
X86_64_APPLE_FALSE='#'
else
X86_64_APPLE_TRUE='#'
X86_64_APPLE_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
# libcurl install path (for mingw : --with-curl=/usr/local)
@@ -7024,30 +6849,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
# Check whether --with-crypto was given.
if test ${with_crypto+y}
then :
withval=$with_crypto;
fi
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])
@@ -7236,10 +7041,6 @@ if test -z "${USE_ASM_TRUE}" && test -z "${USE_ASM_FALSE}"; then
as_fn_error $? "conditional \"USE_ASM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86_64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -7256,14 +7057,6 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7657,7 +7450,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.1, which was
This file was extended by cpuminer-opt $as_me 25.4, which was
generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7725,7 +7518,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.1
cpuminer-opt config.status 25.4
configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\"


@@ -152,7 +152,7 @@ int stratum_thr_id = -1;
int api_thr_id = -1;
bool stratum_need_reset = false;
struct work_restart *work_restart = NULL;
struct stratum_ctx stratum;
struct stratum_ctx stratum = {0};
double opt_diff_factor = 1.0;
double opt_target_factor = 1.0;
uint32_t zr5_pok = 0;
@@ -187,7 +187,7 @@ static bool opt_api_enabled = false;
char *opt_api_allow = NULL;
int opt_api_listen = 0;
int opt_api_remote = 0;
char *default_api_allow = "127.0.0.1";
const char *default_api_allow = "127.0.0.1";
int default_api_listen = 4048;
pthread_mutex_t applog_lock;
@@ -286,15 +286,15 @@ static inline void drop_policy(void) { }
static void affine_to_cpu( struct thr_info *thr )
{
int thread = thr->id;
unsigned long last_error;
bool ok;
unsigned long last_error = 0;
bool ok = true;
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
unsigned long group_size = GetActiveProcessorCount( 0 );
unsigned long group = thread / group_size;
unsigned long cpu = thread_affinity_map[ thread % group_size ];
GROUP_AFFINITY affinity;
GROUP_AFFINITY affinity = {0};
affinity.Group = group;
affinity.Mask = 1ULL << cpu;
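/* Worked illustration (hypothetical machine with 64 logical CPUs per group):
   thread 70 maps to group = 70 / 64 = 1 and cpu = thread_affinity_map[ 70 % 64 ],
   so the mask selects that single CPU within group 1. */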
@@ -320,8 +320,7 @@ static void affine_to_cpu( struct thr_info *thr )
{
last_error = GetLastError();
if ( !thread )
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
last_error, thread );
applog( LOG_WARNING, "Set affinity returned error 0x%x", last_error );
}
}
@@ -871,9 +870,9 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
work->tx_count = tx_count;
/* assemble block header */
algo_gate.build_block_header( work, swab32( version ),
algo_gate.build_block_header( work, bswap_32( version ),
(uint32_t*) prevhash, (uint32_t*) merkle_tree,
swab32( curtime ), le32dec( &bits ),
bswap_32( curtime ), le32dec( &bits ),
final_sapling_hash );
if ( unlikely( !jobj_binary( val, "target", target, sizeof(target) ) ) )
@@ -1774,7 +1773,7 @@ static bool get_work(struct thr_info *thr, struct work *work)
// why 74? std cmp_size is 76, std data is 128
for ( int n = 0; n < 74; n++ ) ( (char*)work->data )[n] = n;
work->data[algo_gate.ntime_index] = swab32(ts); // ntime
work->data[algo_gate.ntime_index] = bswap_32(ts); // ntime
// this overwrites much of the for loop init
memset( work->data + algo_gate.nonce_index, 0x00, 52); // nonce..nonce+52
@@ -2010,36 +2009,37 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
{
bool new_job;
pthread_rwlock_wrlock( &g_work_lock );
pthread_mutex_lock( &sctx->work_lock );
new_job = sctx->new_job; // otherwise just increment extranonce2
sctx->new_job = false;
pthread_rwlock_wrlock( &g_work_lock );
free( g_work->job_id );
g_work->job_id = strdup( sctx->job.job_id );
g_work->xnonce2_len = sctx->xnonce2_size;
g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size );
g_work->height = sctx->block_height;
g_work->targetdiff = sctx->job.diff
/ ( opt_target_factor * opt_diff_factor );
memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size );
algo_gate.build_extraheader( g_work, sctx );
net_diff = nbits_to_diff( g_work->data[ algo_gate.nbits_index ] );
algo_gate.set_work_data_endian( g_work );
g_work->height = sctx->block_height;
g_work->targetdiff = sctx->job.diff
/ ( opt_target_factor * opt_diff_factor );
diff_to_hash( g_work->target, g_work->targetdiff );
g_work_time = time(NULL);
restart_threads();
pthread_rwlock_unlock( &g_work_lock );
// Pre increment extranonce2 in case of being called again before receiving
// a new job
for ( int t = 0;
t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] );
t++ );
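/* Illustration with a hypothetical 2-byte extranonce2 of { 0xff, 0x00 }:
   ++xnonce2[0] wraps to 0x00, so the loop carries into xnonce2[1], leaving
   { 0x00, 0x01 }; any non-zero increment result stops the carry. */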
g_work_time = time(NULL);
restart_threads();
pthread_mutex_unlock( &sctx->work_lock );
pthread_rwlock_unlock( &g_work_lock );
pthread_mutex_lock( &stats_lock );
@@ -2073,7 +2073,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
// Update data and calculate new estimates.
if ( ( stratum_diff != sctx->job.diff )
|| ( last_block_height != sctx->block_height ) )
|| ( last_block_height != sctx->block_height ) )
{
if ( unlikely( !session_first_block ) )
session_first_block = stratum.block_height;
@@ -2190,7 +2190,7 @@ static void *miner_thread( void *userdata )
}
// wait for stratum to send first job
if ( have_stratum ) while ( unlikely( stratum_down ) )
if ( have_stratum ) while ( unlikely( !stratum.job.job_id ) )
{
if ( opt_debug )
applog( LOG_INFO, "Thread %d waiting for first job", thr_id );
@@ -2204,7 +2204,6 @@ static void *miner_thread( void *userdata )
{
uint64_t hashes_done;
struct timeval tv_start, tv_end, diff;
// int64_t max64 = 1000;
int nonce_found = 0;
if ( have_stratum )
@@ -2230,13 +2229,6 @@ static void *miner_thread( void *userdata )
}
else if ( !opt_benchmark ) // GBT or getwork
{
// max64 is used to set end_nonce to match the scantime.
// It also factors the nonce range to end the scan when nonces are
// exhausted. In either case needing new work can be assumed.
// Only problem is every thread will call get_work.
// First thread resets scantime blocking all subsequent threads
// from fetching new work.
pthread_rwlock_wrlock( &g_work_lock );
const time_t now = time(NULL);
if ( ( ( now - g_work_time ) >= opt_scantime )
@@ -2873,12 +2865,12 @@ static bool cpu_capability( bool display_only )
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_avx10_256 = false;
bool sw_has_avx10_512 = false;
bool sw_has_avx10 = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
@@ -2901,7 +2893,7 @@ static bool cpu_capability( bool display_only )
bool use_sha512;
bool use_neon;
bool use_none;
*/
#if defined(__x86_64__)
sw_has_x86_64 = true;
#elif defined(__aarch64__)
@@ -2913,6 +2905,7 @@ static bool cpu_capability( bool display_only )
sw_arm_arch = __ARM_ARCH;
#endif
#endif
// x86_64 only
#if defined(__SSE2__)
sw_has_sse2 = true;
@@ -2935,15 +2928,18 @@ static bool cpu_capability( bool display_only )
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true;
#endif
#if defined(__AVX10_1_256__)
sw_has_avx10_256 = true;
#endif
#if defined(__AVX10_1_512__)
sw_has_avx10_512 = true;
// AVX10 version is not significant as of AVX10.2. If that changes, use a better
// way to test the version than sequentially.
// #if defined(__AVX10_2__)
//
// #elif defined(__AVX10_1__)
#if defined(__AVX10_1__)
sw_has_avx10 = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
sw_has_aes = true;
sw_has_aes = true;
#endif
#ifdef __VAES__
sw_has_vaes = true;
@@ -2954,6 +2950,7 @@ static bool cpu_capability( bool display_only )
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA512)
sw_has_sha512 = true;
#endif
// AArch64 only
#if defined(__ARM_NEON)
sw_has_neon = true;
@@ -2971,16 +2968,20 @@ static bool cpu_capability( bool display_only )
sw_has_sme2 = true;
#endif
// CPU
cpu_brand_string( cpu_brand );
printf( "CPU: %s\n", cpu_brand );
printf("SW built on " __DATE__
// Build
printf( "SW built on " __DATE__
#if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__);
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
__clang_patchlevel__ );
#elif defined(__GNUC__)
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif
// OS
#if defined(__linux)
printf(" Linux\n");
#elif defined(WIN32)
@@ -3001,8 +3002,7 @@ static bool cpu_capability( bool display_only )
printf("CPU features: ");
if ( cpu_arch_x86_64() )
{
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(),
avx10_vector_length() );
if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
@@ -3027,8 +3027,7 @@ static bool cpu_capability( bool display_only )
printf("\nSW features: ");
if ( sw_has_x86_64 )
{
if ( sw_has_avx10_512 ) printf( " AVX10-512" );
else if ( sw_has_avx10_256 ) printf( " AVX10-256" );
if ( sw_has_avx10 ) printf( " AVX10 " );
else if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
@@ -3051,123 +3050,15 @@ static bool cpu_capability( bool display_only )
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );
if ( !display_only )
{
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_neon ) printf( " NEON" );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha512 ) printf( " SHA512" );
else if ( algo_has_sha256 ) printf( " SHA256" );
}
}
printf("\n");
if ( display_only ) return true;
// Determine mining options
use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes;
use_sha256 = cpu_has_sha256 && sw_has_sha256 && algo_has_sha256;
use_sha512 = cpu_has_sha512 && sw_has_sha512 && algo_has_sha512;
use_neon = sw_has_aarch64 && sw_has_neon && algo_has_neon;
use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512
|| use_avx2 || use_sha256 || use_vaes || use_sha512 || use_neon );
// Display best options
if ( !use_none )
{
applog_nl( "Enabled optimizations:" );
if ( use_neon ) printf( " NEON" );
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
else if ( use_sse42 ) printf( " SSE42" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );
if ( use_sha512 ) printf( " SHA512" );
else if ( use_sha256 ) printf( " SHA256" );
printf( "\n" );
}
return true;
}
void show_version_and_exit(void)
{
printf("\n built on " __DATE__
#ifdef _MSC_VER
" with VC++ 2013\n");
#elif defined(__GNUC__)
" with GCC");
printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#endif
printf(" features:"
#if defined(USE_ASM) && defined(__i386__)
" i386"
#endif
#if defined(USE_ASM) && defined(__x86_64__)
" x86_64"
#endif
#if defined(USE_ASM) && (defined(__i386__) || defined(__x86_64__))
" SSE2"
#endif
#if defined(__x86_64__) && defined(USE_AVX)
" AVX"
#endif
#if defined(__x86_64__) && defined(USE_AVX2)
" AVX2"
#endif
#if defined(__x86_64__) && defined(USE_XOP)
" XOP"
#endif
#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
" ARM"
#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \
defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \
defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \
defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
defined(__ARM_ARCH_7__) || \
defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \
defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
" ARMv5E"
#endif
#if defined(__ARM_NEON__)
" NEON"
#endif
#endif
"\n\n");
printf("%s\n", curl_version());
#ifdef JANSSON_VERSION
printf("jansson/%s ", JANSSON_VERSION);
#endif
#ifdef PTW32_VERSION
printf("pthreads/%d.%d.%d.%d ", PTW32_VERSION);
#endif
printf("\n");
exit(0);
}
void show_usage_and_exit(int status)
{
if (status)
fprintf(stderr, "Try `--help' for more information.\n");
// fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n");
else
printf(usage);
exit(status);
@@ -3183,7 +3074,6 @@ void parse_arg(int key, char *arg )
{
char *p;
int v, i;
// uint64_t ul;
double d;
switch( key )
@@ -3242,7 +3132,7 @@ void parse_arg(int key, char *arg )
else if ( arg )
{
/* port or 0 to disable */
opt_api_allow = default_api_allow;
opt_api_allow = (char*)default_api_allow;
opt_api_listen = atoi(arg);
}
break;
@@ -3278,7 +3168,7 @@ void parse_arg(int key, char *arg )
// debug overrides quiet
case 'q': // quiet
if ( !( opt_debug || opt_protocol ) ) opt_quiet = true;
opt_quiet = !( opt_debug || opt_protocol );
break;
case 'D': // debug
opt_debug = true;
@@ -3327,7 +3217,8 @@ void parse_arg(int key, char *arg )
free(rpc_user);
rpc_user = strdup(arg);
break;
case 'o': // url
case 'o': // url
{
char *ap, *hp;
ap = strstr( arg, "://" );
@@ -3392,7 +3283,8 @@ void parse_arg(int key, char *arg )
have_stratum = !opt_benchmark && !strncasecmp( rpc_url, "stratum", 7 );
break;
}
case 'O': // userpass
case 'O': // userpass
p = strchr(arg, ':');
if (!p)
{
@@ -3552,10 +3444,10 @@ void parse_arg(int key, char *arg )
case 1029: // stratum-keepalive
opt_stratum_keepalive = true;
break;
case 'V':
case 'V': // version
display_cpu_capability();
exit(0);
case 'h':
case 'h': // help
show_usage_and_exit(0);
default:
@@ -3864,12 +3756,23 @@ int main(int argc, char *argv[])
}
#endif
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#if defined(WIN32)
#if defined(_WIN32_WINNT)
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT = 0x%04x", _WIN32_WINNT );
#else
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT undefined." );
#endif
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif
#endif
conditional_state = malloc( opt_n_threads * ((sizeof(bool)) ) );
memset( conditional_state, 0, opt_n_threads * ((sizeof(bool)) ) );
@@ -3890,7 +3793,7 @@ int main(int argc, char *argv[])
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
applog( LOG_WARNING, "More miner threads (%d) than active CPUs in affinity mask (%d)", opt_n_threads, active_cpus );
if ( !opt_quiet )
{
char affinity_mask[64];

miner.h

@@ -8,8 +8,8 @@
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__)
#define USER_AGENT_ARCH "arm" // AArch64
//#elif
// #define USER_AGENT_ARCH "r5" // RISC-V
#elif defined(__riscv)
#define USER_AGENT_ARCH "rv" // RISC-V
#else
#define USER_AGENT_ARCH
#endif
@@ -65,7 +65,7 @@
# endif
#endif
// no mm_maloc for Neon
// no mm_malloc for Neon
#if !defined(__ARM_NEON)
#include <mm_malloc.h>
@@ -173,6 +173,7 @@ static inline bool is_windows(void)
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif
#if 0
// deprecated, see simd-int.h
#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#define WANT_BUILTIN_BSWAP
@@ -183,6 +184,7 @@ static inline bool is_windows(void)
*/
#endif
/*
static inline uint32_t swab32(uint32_t x)
{
#ifdef WANT_BUILTIN_BSWAP
@@ -195,6 +197,8 @@ static inline uint32_t swab32(uint32_t x)
// return bswap_32(v);
#endif
}
*/
#endif
// Swap any two variables of the same type without using a temp
#define swap_vars(a,b) a^=b; b^=a; a^=b;
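A tiny usage sketch of the XOR swap the macro relies on (values are hypothetical; note the macro expands to three separate statements, so give it its own braces when used inside an unbraced if/else):

   uint32_t a = 0x11111111, b = 0x22222222;
   swap_vars( a, b );   // now a == 0x22222222 and b == 0x11111111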
@@ -289,26 +293,6 @@ static inline void le16enc(void *pp, uint16_t x)
json_t* json_load_url(char* cfg_url, json_error_t *err);
//void sha256_init(uint32_t *state);
//void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
//void sha256d(unsigned char *hash, const unsigned char *data, int len);
#ifdef USE_ASM
#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__)
#define HAVE_SHA256_4WAY 1
int sha256_use_4way();
void sha256_init_4way(uint32_t *state);
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
#endif
//#if defined(__x86_64__) && defined(USE_AVX2)
#if defined(__x86_64__) && defined(__AVX2__)
#define HAVE_SHA256_8WAY 1
int sha256_use_8way();
void sha256_init_8way(uint32_t *state);
void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
#endif
#endif
struct work;
void work_free(struct work *w);
@@ -851,10 +835,9 @@ Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
allium Garlicoin (GRLC)\n\
anime Animecoin (ANI)\n\
argon2 Argon2 Coin (AR2)\n\
argon2d250\n\
argon2d500 argon2d-dyn, Dynamic (DYN)\n\
argon2d4096 argon2d-uis, Unitus (UIS)\n\
argon2d500\n\
argon2d4096\n\
axiom Shabal-256 MemoHash\n\
blake blake256r14 (SFR)\n\
blake2b Blake2b 256\n\


@@ -140,28 +140,28 @@
#include <stdint.h>
#include <stddef.h>
// AVX512 macros are not a reliable indicator of 512 bit vector capability
// because they get defined with AVX10_1_256 which doesn't support 512 bit.
// EVEX512 is also unreliable as it can also be defined when 512b is not
// available.
// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present.
// Use AVX512 macros only without AVX10.
/*
// Test for macros
#ifdef __AVX10_1__
// Test for AVX10 macros
// AVX10-256 was abandoned by Intel before any CPUs were built.
#ifdef __AVX10__ // does not exist
#warning "__AVX10__"
#endif
#ifdef __AVX10_1__ // GCC-14
#warning "__AVX10_1__"
#endif
#ifdef __AVX10_1_256__
#ifdef __AVX10_2__ // GCC-15
#warning "__AVX10_2__"
#endif
#ifdef __AVX10_1_256__ // obsolete
#warning "__AVX10_1_256__"
#endif
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__"
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__" // does not exist
#endif
#ifdef __EVEX256__
#warning "__EVEX256__"
#ifdef __EVEX256__ // likely obsolete
#warning "__EVEX256__"
#endif
#ifdef __EVEX512__
#ifdef __EVEX512__ // likely obsolete
#warning "__EVEX512__"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -169,27 +169,14 @@
#endif
*/
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and
// must be tested separately.
// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
// VBMI: Include AVX512VBMI instructions for supported vector lengths.
#if defined(__AVX10_1__)
#define VL256 1
#define VBMI 1
#if defined(__AVX10_1_512__)
#define SIMD512 1
#endif
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define VL256 1
// With Intel abandoning AVX10-256 the SIMD512 & VL256 macros are almost
// identical with the only difference being VBMI is included in VL256.
#if defined(__AVX10_1__) || ( defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) )
#define SIMD512 1
#if defined(__AVX512VBMI__)
#define VL256 1
#if defined(__AVX10_1__) || defined(__AVX512VBMI__)
#define VBMI 1
#endif
#endif
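A rough sketch of how these internal feature macros are meant to be consumed by the vectorized code; the consuming macro name below is hypothetical:

   // Pick the widest lane count the build supports; illustrative only.
   #if defined(SIMD512)
     #define HASH_LANES 16   // 512-bit vectors
   #elif defined(__AVX2__)
     #define HASH_LANES 8    // 256-bit vectors
   #else
     #define HASH_LANES 4    // 128-bit vectors
   #endif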
/*
@@ -204,13 +191,75 @@
#endif
*/
// targeted intrinsics
#if defined(__x86_64__)
#include <x86intrin.h>
#elif defined(__aarch64__) && defined(__ARM_NEON)
#include <arm_neon.h>
#elif defined(__riscv) && defined(__riscv_vector)
#include <riscv_vector.h>
#endif
#include <x86intrin.h>
// Single global definition for frequently used vector constants.
// The GCC optimizer can merge constants but merging different vector lengths
// might be beyond its scope.
#elif defined(__aarch64__)
// Frequently used SSE/AVX shuffle constants.
#if defined(SIMD512)
#include <arm_neon.h>
// When used with shuffle_epi8 it performs a standard bswap of all elements.
// When used with permutexvar_epi8 (requires AVX512VBMI or AVX10) it performs a
// bswap of the elements in the lower 128 bits of the source and broadcasts
// the result to all 128 bit lanes of the destination.
extern const __m512i V512_BSWAP64;
#define V256_BSWAP64 _mm512_castsi512_si256( V512_BSWAP64 )
#define V128_BSWAP64 _mm512_castsi512_si128( V512_BSWAP64 )
extern const __m512i V512_BSWAP32;
#define V256_BSWAP32 _mm512_castsi512_si256( V512_BSWAP32 )
#define V128_BSWAP32 _mm512_castsi512_si128( V512_BSWAP32 )
#elif defined(__AVX2__)
extern const __m256i V256_BSWAP64;
#define V128_BSWAP64 _mm256_castsi256_si128( V256_BSWAP64 )
extern const __m256i V256_BSWAP32;
#define V128_BSWAP32 _mm256_castsi256_si128( V256_BSWAP32 )
// These shuffles aren't needed with AVX512, which uses ror/rol instead.
extern const __m256i V256_SHUFLR64_8;
#define V128_SHUFLR64_8 _mm256_castsi256_si128( V256_SHUFLR64_8 )
extern const __m256i V256_SHUFLR64_24;
#define V128_SHUFLR64_24 _mm256_castsi256_si128( V256_SHUFLR64_24 )
extern const __m256i V256_SHUFLL64_8;
#define V128_SHUFLL64_8 _mm256_castsi256_si128( V256_SHUFLL64_8 )
extern const __m256i V256_SHUFLL64_24;
#define V128_SHUFLL64_24 _mm256_castsi256_si128( V256_SHUFLL64_24 )
extern const __m256i V256_SHUFLR32_8;
#define V128_SHUFLR32_8 _mm256_castsi256_si128( V256_SHUFLR32_8 )
extern const __m256i V256_SHUFLL32_8;
#define V128_SHUFLL32_8 _mm256_castsi256_si128( V256_SHUFLL32_8 )
#elif defined(__SSSE3__)
extern const __m128i V128_BSWAP64;
extern const __m128i V128_BSWAP32;
extern const __m128i V128_SHUFLR64_8;
extern const __m128i V128_SHUFLR64_24;
extern const __m128i V128_SHUFLL64_8;
extern const __m128i V128_SHUFLL64_24;
extern const __m128i V128_SHUFLR32_8;
extern const __m128i V128_SHUFLL32_8;
#endif
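A hedged sketch of the pattern these extern declarations imply: one non-inline definition compiled in a single translation unit (presumably the simd-utils/simd-constants.c source added in this release) that every including file references, instead of re-materializing the same _mm_set_epi64x literal at each call site. The file placement and GCC/Clang brace initialization are assumptions; the byte pattern matches the old inline literal for a 32-bit byte swap.

   #include <immintrin.h>

   // Defined once; element 0 is the low 64 bits, equivalent to
   // _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ).
   const __m128i V128_BSWAP32 = { 0x0405060700010203LL, 0x0c0d0e0f08090a0bLL };

   // Users then shuffle with the shared constant:
   //    x = _mm_shuffle_epi8( x, V128_BSWAP32 );   // bswap each 32-bit element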
@@ -225,7 +274,7 @@
// x86_64 AVX512 512 bit vectors
#include "simd-utils/simd-512.h"
// aarch64 neon 128 bit vectors
// aarch64 NEON 128 bit vectors
#include "simd-utils/simd-neon.h"
#include "simd-utils/intrlv.h"


@@ -589,20 +589,7 @@ static inline void extr_lane_4x32( void *d, const void *s,
((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+60 ];
}
#if defined(__SSSE3__)
static inline void v128_bswap32_80( void *d, void *s )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf );
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf );
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf );
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf );
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
#if defined(__SSSE3__) || defined(__ARM_NEON)
static inline void v128_bswap32_80( void *d, void *s )
{
@@ -641,6 +628,8 @@ static inline void v128_bswap32_80( void *d, void *s )
#endif
#if defined(__SSE2__) || defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128u32_t s0 = casti_v128u32( src,0 );
@@ -649,27 +638,12 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
v128u32_t s3 = casti_v128u32( src,3 );
v128u32_t s4 = casti_v128u32( src,4 );
#if defined(__SSSE3__)
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
#else
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
#endif
casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
@@ -696,6 +670,8 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
casti_v128u32( d,19 ) = v128_duplane32( s2, 3 );
}
#endif // SSE2 || NEON
// 8x32
#if defined(__AVX2__)
@@ -1112,8 +1088,6 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i c1 = v256_32( 1 );
const __m256i c2 = _mm256_add_epi32( c1, c1 );
const __m256i c3 = _mm256_add_epi32( c2, c1 );
@@ -1124,11 +1098,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m256i( d, 0 ) = _mm256_broadcastd_epi32( s0 );
casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32(
@@ -1617,8 +1591,6 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = v512_32( 1 );
const __m512i c2 = _mm512_add_epi32( c1, c1 );
const __m512i c3 = _mm512_add_epi32( c2, c1 );
@@ -1628,11 +1600,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m512i( d, 0 ) = _mm512_broadcastd_epi32( s0 );
casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( c1,
@@ -1878,6 +1850,8 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
#endif
#if defined(__SSE2__) || defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
{
v128u64_t s0 = casti_v128u64( src,0 );
@@ -1886,27 +1860,12 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
v128u64_t s3 = casti_v128u64( src,3 );
v128u64_t s4 = casti_v128u64( src,4 );
#if defined(__SSSE3__)
const v128u64_t bswap_shuf = v128_set64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
#else
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
#endif
casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );
@@ -1923,6 +1882,8 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
}
#endif // SSE2 || NEON
static inline void extr_lane_2x64( void *dst, const void *src,
const int lane, const int bit_len )
{
@@ -2233,25 +2194,23 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i bswap_shuf = mm256_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
__m256i s0 = casti_m256i( src,0 ); // s0, s1
__m256i s2 = casti_m256i( src,1 ); // s2, s3
v128_t s4 = casti_v128( src,4 );
s0 = _mm256_shuffle_epi8( s0, bswap_shuf );
s1 = _mm256_shuffle_epi8( s1, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, _mm256_castsi256_si128( bswap_shuf ) );
s0 = mm256_bswap_32( s0 );
s2 = mm256_bswap_32( s2 );
s4 = v128_bswap32( s4 );
casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
casti_m256i( d, 2 ) = _mm256_permute4x64_epi64( s0, 0xaa );
casti_m256i( d, 3 ) = _mm256_permute4x64_epi64( s0, 0xff );
casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s1, 0x00 );
casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s1, 0x55 );
casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s1, 0xaa );
casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s1, 0xff );
casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s2, 0x00 );
casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s2, 0x55 );
casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s2, 0xaa );
casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s2, 0xff );
casti_m256i( d, 8 ) = _mm256_permute4x64_epi64(
_mm256_castsi128_si256( s4 ), 0x00 );
@@ -2648,8 +2607,6 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
@@ -2657,11 +2614,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m512i( d,0 ) = _mm512_broadcastq_epi64( s0 );
casti_m512i( d,1 ) = _mm512_permutexvar_epi64( c1,
@@ -2842,49 +2799,45 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m512i bswap_shuf = mm512_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ),
bswap_shuf );
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s1 ),
bswap_shuf );
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s2 ),
bswap_shuf );
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s3 ),
bswap_shuf );
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s4 ),
bswap_shuf );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s0 ) );
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s1 ) );
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s2 ) );
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s3 ) );
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s4 ) );
}
#else
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m512i( d,0 ) = mm512_bcast_m128( s0 );
casti_m512i( d,1 ) = mm512_bcast_m128( s1 );
casti_m512i( d,2 ) = mm512_bcast_m128( s2 );
casti_m512i( d,3 ) = mm512_bcast_m128( s3 );
casti_m512i( d,4 ) = mm512_bcast_m128( s4 );
casti_m512i( d,0 ) = mm512_bcast128( s0 );
casti_m512i( d,1 ) = mm512_bcast128( s1 );
casti_m512i( d,2 ) = mm512_bcast128( s2 );
casti_m512i( d,3 ) = mm512_bcast128( s3 );
casti_m512i( d,4 ) = mm512_bcast128( s4 );
}
#endif // AVX512VBMI ELSE


@@ -521,29 +521,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#elif defined(__SSSE3__)
// SSSE3: fastest 32 bit, very fast 16, fast 8
#define v128_shuflr64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
#define v128_shufll64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
#define v128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#define v128_shufll64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
#define v128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#define v128_shufll32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
#define v128_shuflr64_8( v ) _mm_shuffle_epi8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) _mm_shuffle_epi8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_24(v ) _mm_shuffle_epi8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) _mm_shuffle_epi8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) _mm_shuffle_epi8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) _mm_shuffle_epi8( v, V128_SHUFLL32_8 )
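A small worked check (illustrative) of why a fixed byte shuffle can replace an 8-bit rotate: rotating a 64-bit lane right by 8 just moves its low byte to the top, which is exactly the byte order V128_SHUFLR64_8 encodes.

   // 0x0807060504030201 rotated right by 8  ->  0x0108070605040302
   // so v128_ror64( v, 8 ) can be implemented as v128_shuflr64_8( v ).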
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
@@ -612,74 +595,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// (v1 ^ v0) >>> n, ARM NEON has optimized version
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
/* not used
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
#if defined(VL256)
#define v128_2ror64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
_mm_ror_epi64( v1, c )
#define v128_2rol64( v1, v0, c ) \
_mm_rol_epi64( v0, c ); \
_mm_rol_epi64( v1, c )
#define v128_2ror32( v1, v0, c ) \
_mm_ror_epi32( v0, c ); \
_mm_ror_epi32( v1, c )
#define v128_2rol32( v1, v0, c ) \
_mm_rol_epi32( v0, c ); \
_mm_rol_epi32( v1, c )
#else // SSE2
#define v128_2ror64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
__m128i t1 = _mm_srli_epi64( v1, c ); \
v0 = _mm_slli_epi64( v0, 64-(c) ); \
v1 = _mm_slli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define v128_2rol64( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi64( v0, c ); \
__m128i t1 = _mm_slli_epi64( v1, c ); \
v0 = _mm_srli_epi64( v0, 64-(c) ); \
v1 = _mm_srli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define v128_2ror32( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi32( v0, c ); \
__m128i t1 = _mm_srli_epi32( v1, c ); \
v0 = _mm_slli_epi32( v0, 32-(c) ); \
v1 = _mm_slli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define v128_2rol32( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi32( v0, c ); \
__m128i t1 = _mm_slli_epi32( v1, c ); \
v0 = _mm_srli_epi32( v0, 32-(c) ); \
v1 = _mm_srli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#endif // AVX512 else SSE2
*/
// Cross lane shuffles
// No NEON version
@@ -721,13 +636,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define v128_bswap64( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
#define v128_bswap64( v ) _mm_shuffle_epi8( v, V128_BSWAP64 )
#define v128_bswap32( v ) _mm_shuffle_epi8( v, V128_BSWAP32 )
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 )
@@ -735,85 +647,30 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define v128_block_bswap64( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define v128_block_bswap64_512 v128_block_bswap64
#define v128_block_bswap64_1024( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
casti_v128( d,0 ) = v128_bswap64( casti_v128( s,0 ) ); \
casti_v128( d,1 ) = v128_bswap64( casti_v128( s,1 ) ); \
casti_v128( d,2 ) = v128_bswap64( casti_v128( s,2 ) ); \
casti_v128( d,3 ) = v128_bswap64( casti_v128( s,3 ) ); \
casti_v128( d,4 ) = v128_bswap64( casti_v128( s,4 ) ); \
casti_v128( d,5 ) = v128_bswap64( casti_v128( s,5 ) ); \
casti_v128( d,6 ) = v128_bswap64( casti_v128( s,6 ) ); \
casti_v128( d,7 ) = v128_bswap64( casti_v128( s,7 ) ); \
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define v128_block_bswap32( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
casti_v128( d,0 ) = v128_bswap32( casti_v128( s,0 ) ); \
casti_v128( d,1 ) = v128_bswap32( casti_v128( s,1 ) ); \
casti_v128( d,2 ) = v128_bswap32( casti_v128( s,2 ) ); \
casti_v128( d,3 ) = v128_bswap32( casti_v128( s,3 ) ); \
casti_v128( d,4 ) = v128_bswap32( casti_v128( s,4 ) ); \
casti_v128( d,5 ) = v128_bswap32( casti_v128( s,5 ) ); \
casti_v128( d,6 ) = v128_bswap32( casti_v128( s,6 ) ); \
casti_v128( d,7 ) = v128_bswap32( casti_v128( s,7 ) ); \
}
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_128( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
}
#define v128_block_bswap32_512( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}
#else // SSE2
static inline v128_t v128_bswap64( __m128i v )
@@ -835,7 +692,7 @@ static inline v128_t v128_bswap16( __m128i v )
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
#define v128_bswap128( v ) v128_rev64( v128_bswap64( v ) )
static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
{
@@ -849,26 +706,6 @@ static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
d[7] = v128_bswap64( s[7] );
}
static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
d[ 2] = v128_bswap64( s[ 2] );
d[ 3] = v128_bswap64( s[ 3] );
d[ 4] = v128_bswap64( s[ 4] );
d[ 5] = v128_bswap64( s[ 5] );
d[ 6] = v128_bswap64( s[ 6] );
d[ 7] = v128_bswap64( s[ 7] );
d[ 8] = v128_bswap64( s[ 8] );
d[ 9] = v128_bswap64( s[ 9] );
d[10] = v128_bswap64( s[10] );
d[11] = v128_bswap64( s[11] );
d[14] = v128_bswap64( s[12] );
d[13] = v128_bswap64( s[13] );
d[14] = v128_bswap64( s[14] );
d[15] = v128_bswap64( s[15] );
}
static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap32( s[0] );
@@ -882,26 +719,6 @@ static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
}
#define v128_block_bswap32_256 v128_block_bswap32
static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
d[ 2] = v128_bswap32( s[ 2] );
d[ 3] = v128_bswap32( s[ 3] );
d[ 4] = v128_bswap32( s[ 4] );
d[ 5] = v128_bswap32( s[ 5] );
d[ 6] = v128_bswap32( s[ 6] );
d[ 7] = v128_bswap32( s[ 7] );
d[ 8] = v128_bswap32( s[ 8] );
d[ 9] = v128_bswap32( s[ 9] );
d[10] = v128_bswap32( s[10] );
d[11] = v128_bswap32( s[11] );
d[12] = v128_bswap32( s[12] );
d[13] = v128_bswap32( s[13] );
d[14] = v128_bswap32( s[14] );
d[15] = v128_bswap32( s[15] );
}
#endif // SSSE3 else SSE2
// alignr instruction for 32 & 64 bit elements is only available with AVX512


@@ -61,8 +61,10 @@ typedef union
#if defined(__AVX2__)
// Broadcast, ie set1, from 128 bit vector input.
#define mm256_bcast_m128( v ) \
#define mm256_bcast128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
// deprecated
#define mm256_bcast_m128 mm256_bcast128
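A minimal usage sketch of the renamed macro (values hypothetical); the permute index 0x44 selects 64-bit lanes 0,1,0,1, so the low 128 bits are duplicated into both halves:

   __m128i lane = _mm_set_epi64x( 0x2222222222222222LL, 0x1111111111111111LL );
   __m256i both = mm256_bcast128( lane );   // 256-bit result = { lane, lane }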
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
@@ -73,23 +75,23 @@ typedef union
#else
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) )
#define mm256_bcast128lo_64( i64 ) mm256_bcast128( v128_mov64( i64 ) )
#define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \
_mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
#endif
#define mm256_set2_64( i1, i0 ) mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) )
#define mm256_set2_64( i1, i0 ) mm256_bcast128( _mm_set_epi64x( i1, i0 ) )
#define mm256_set4_32( i3, i2, i1, i0 ) \
mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) )
mm256_bcast128( _mm_set_epi32( i3, i2, i1, i0 ) )
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_128 mm256_bcast_m128( v128_one )
#define m256_one_128 mm256_bcast128( v128_one )
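// A hedged illustration (names hypothetical, not part of this header) of the
// distinction: a brace initializer is a constant expression, while the macros
// above expand to intrinsic calls and may only appear where code executes.
static const __m256i k_bswap64_sketch = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
                                          0x0001020304050607, 0x08090a0b0c0d0e0f };
// static const __m256i k_zero_sketch = m256_zero;   // would not compile: not a constant
static inline __m256i zero_rt_sketch( void ) { return m256_zero; }   // fine at run time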
static inline __m256i mm256_neg1_fn()
{
@@ -231,21 +233,8 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
#define mm256_qrev16(v) mm256_shuffle16( v, 0x1b )
#define mm256_qrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm256_lrev16(v) mm256_shuffle16( v, 0xb1 )
#define mm256_lrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm256_wrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
// Bit rotations.
@@ -268,50 +257,33 @@ static inline __m256i mm256_not( const __m256i v )
#if defined(VL256)
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
// Redundant but naming may be a better fit in some applications.
#define mm126_shuflr64_8( v) _mm256_ror_epi64( v, 8 )
#define mm156_shufll64_8( v) _mm256_rol_epi64( v, 8 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 )
#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 )
#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 )
#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 )
#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 )
#define mm256_shuflr64_8( v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8( v) _mm256_rol_epi64( v, 8 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 )
#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 )
#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 )
#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 )
#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 )
#else
// ROR & ROL will always find the fastest but these names may be a better fit
// in some applications.
#define mm256_shuflr64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) )
#define mm256_shufll64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) )
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) )
#define mm256_shufll64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) )
#define mm256_shuflr32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) )
#define mm256_shufll32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) )
#define mm256_shuflr64_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLR64_8 )
#define mm256_shufll64_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLL64_8 )
#define mm256_shuflr64_24(v ) _mm256_shuffle_epi8( v, V256_SHUFLR64_24 )
#define mm256_shufll64_24(v ) _mm256_shuffle_epi8( v, V256_SHUFLL64_24 )
#define mm256_shuflr32_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLR32_8 )
#define mm256_shufll32_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLL32_8 )
#define mm256_ror_64( v, c ) \
( (c) == 8 ) ? mm256_shuflr64_8( v ) \
@@ -347,96 +319,6 @@ static inline __m256i mm256_not( const __m256i v )
#endif
//
// x2 macros rotate elements in 2 individual vectors as a double buffered
// optimization for AVX2; with AVX512 a single ror/rol instruction per vector
// makes the pairing unnecessary, they are kept here for transparency
// (see the sketch after this block).
#if defined(VL256)
/*
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
*/
#define mm256_rorx2_64( v1, v0, c ) \
_mm256_ror_epi64( v0, c ); \
_mm256_ror_epi64( v1, c )
#define mm256_rolx2_64( v1, v0, c ) \
_mm256_rol_epi64( v0, c ); \
_mm256_rol_epi64( v1, c )
#define mm256_rorx2_32( v1, v0, c ) \
_mm256_ror_epi32( v0, c ); \
_mm256_ror_epi32( v1, c )
#define mm256_rolx2_32( v1, v0, c ) \
_mm256_rol_epi32( v0, c ); \
_mm256_rol_epi32( v1, c )
#else // AVX2
/*
// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
*/
#define mm256_rorx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi64( v0, c ); \
__m256i t1 = _mm256_srli_epi64( v1, c ); \
v0 = _mm256_slli_epi64( v0, 64-(c) ); \
v1 = _mm256_slli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi64( v0, c ); \
__m256i t1 = _mm256_slli_epi64( v1, c ); \
v0 = _mm256_srli_epi64( v0, 64-(c) ); \
v1 = _mm256_srli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rorx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi32( v0, c ); \
__m256i t1 = _mm256_srli_epi32( v1, c ); \
v0 = _mm256_slli_epi32( v0, 32-(c) ); \
v1 = _mm256_slli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi32( v0, c ); \
__m256i t1 = _mm256_slli_epi32( v1, c ); \
v0 = _mm256_srli_epi32( v0, 32-(c) ); \
v1 = _mm256_srli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#endif // AVX512 else AVX2
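// A hedged sketch (function name hypothetical, assumes <immintrin.h> as already
// included by this header) of the double buffered idea above: issuing the
// shift/or sequences of two independent rotations back to back lets them
// execute in parallel on AVX2; with AVX512VL a single ror instruction per
// vector makes the pairing unnecessary.
static inline void ror64_pair_sketch( __m256i *pa, __m256i *pb )
{
   __m256i ta = _mm256_srli_epi64( *pa, 17 );                   // a >> 17
   __m256i tb = _mm256_srli_epi64( *pb, 17 );                   // b >> 17
   *pa = _mm256_or_si256( _mm256_slli_epi64( *pa, 47 ), ta );   // a = a >>> 17
   *pb = _mm256_or_si256( _mm256_slli_epi64( *pb, 47 ), tb );   // b = b >>> 17
}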
#if defined(__AVX2__)
// 128 bit version of unpack
@@ -453,20 +335,14 @@ static inline __m256i mm256_not( const __m256i v )
//
// Cross lane shuffles
//
// Rotate elements accross all lanes.
#define mm256_shuffle_16( v, c ) \
_mm256_or_si256( _mm256_shufflehi_epi16( v, c ), \
_mm256_shufflelo_epi16( v, c ) )
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_rev_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_swap_128 mm256_rev_128 // grandfathered
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Reverse 64 bit elements
/* not used
// Reverse elements
#define mm256_rev_64( v ) _mm256_permute4x64_epi64( v, 0x1b )
#define mm256_rev_32( v ) \
@@ -474,7 +350,12 @@ static inline __m256i mm256_not( const __m256i v )
0x0000000400000005, 0x0000000600000007 )
#define mm256_rev_16( v ) \
_mm256_permute4x64_epi64( mm256_shuffle_16( v, 0x1b ), 0x4e )
_mm256_permute4x64_epi64( mm256_shuffle16( v, 0x1b ), 0x4e )
*/
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
@@ -486,7 +367,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
#else
#define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \
_mm256_set_spi64x( 0x0000000000000007, 0x0000000600000005, \
_mm256_set_epi64x( 0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ) )
#define mm256_shufll_32( v ) \
_mm256_permutevar8x32_epi32( v, \
@@ -507,113 +388,64 @@ static inline __m256i mm256_shufll_32( const __m256i v )
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
_mm256_castsi256_ps( v2 ), c ) );
#define mm256_swap128_64(v) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_rev128_64(v) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_swap128_64 mm256_rev128_64 // grandfathered
/*not used
#define mm256_rev128_32(v) _mm256_shuffle_epi32( v, 0x1b )
#define mm256_rev128_16(v) mm256_shuffle_16( v, 0x1b )
#define mm256_rev128_16(v) mm256_shuffle16( v, 0x1b )
*/
#define mm256_shuflr128_32(v) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_shufll128_32(v) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_shuflr128_16(v) mm256_shuffle_16( v, 0x39 )
#define mm256_shufll128_16(v) mm256_shuffle_16( v, 0x93 )
/* not used
#define mm256_shuflr128_16(v) mm256_shuffle16( v, 0x39 )
#define mm256_shufll128_16(v) mm256_shuffle16( v, 0x93 )
/* Not used
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
*/
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm256_bswap_64( v ) _mm256_shuffle_epi8( v, V256_BSWAP64 )
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm256_bswap_32( v ) _mm256_shuffle_epi8( v, V256_BSWAP32 )
/* not used
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
_mm256_shuffle_epi8( v, mm256_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
*/
// Source and destination are pointers, may point to same memory.
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
}
#define mm256_block_bswap64_512 mm256_block_bswap_64
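// Illustration only (function name hypothetical): a typical call byte swaps a
// 256 byte block of 64 bit words in place; per the comment above, source and
// destination may be the same pointer.
static inline void block_bswap64_inplace_sketch( void *block )
{   mm256_block_bswap_64( block, block );   }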
#define mm256_block_bswap64_1024( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
casti_m256i( d,0 ) = mm256_bswap_64( casti_m256i( s,0 ) ); \
casti_m256i( d,1 ) = mm256_bswap_64( casti_m256i( s,1 ) ); \
casti_m256i( d,2 ) = mm256_bswap_64( casti_m256i( s,2 ) ); \
casti_m256i( d,3 ) = mm256_bswap_64( casti_m256i( s,3 ) ); \
casti_m256i( d,4 ) = mm256_bswap_64( casti_m256i( s,4 ) ); \
casti_m256i( d,5 ) = mm256_bswap_64( casti_m256i( s,5 ) ); \
casti_m256i( d,6 ) = mm256_bswap_64( casti_m256i( s,6 ) ); \
casti_m256i( d,7 ) = mm256_bswap_64( casti_m256i( s,7 ) ); \
}
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
casti_m256i( d, 0 ) = mm256_bswap_32( casti_m256i( s, 0 ) ); \
casti_m256i( d, 1 ) = mm256_bswap_32( casti_m256i( s, 1 ) ); \
casti_m256i( d, 2 ) = mm256_bswap_32( casti_m256i( s, 2 ) ); \
casti_m256i( d, 3 ) = mm256_bswap_32( casti_m256i( s, 3 ) ); \
casti_m256i( d, 4 ) = mm256_bswap_32( casti_m256i( s, 4 ) ); \
casti_m256i( d, 5 ) = mm256_bswap_32( casti_m256i( s, 5 ) ); \
casti_m256i( d, 6 ) = mm256_bswap_32( casti_m256i( s, 6 ) ); \
casti_m256i( d, 7 ) = mm256_bswap_32( casti_m256i( s, 7 ) ); \
}
#define mm256_block_bswap32_256 mm256_block_bswap_32
#define mm256_block_bswap32_512( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
}
#if defined(VL256)
#define mm256_alignr64 _mm256_alignr_epi64

View File

@@ -108,11 +108,13 @@ typedef union
// A simple 128 bit permute, using function instead of macro avoids
// problems if the v arg passed as an expression.
static inline __m512i mm512_perm_128( const __m512i v, const int c )
static inline __m512i mm512_perm128( const __m512i v, const int c )
{ return _mm512_shuffle_i64x2( v, v, c ); }
// Broadcast 128 bit vector to all lanes of 512 bit vector.
#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
#define mm512_bcast128( v ) mm512_perm128( _mm512_castsi128_si512( v ), 0 )
// deprecated
#define mm512_bcast_m128 mm512_bcast128
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
@@ -120,7 +122,7 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_bcast128hi_64( i64 ) _mm512_maskz_set1_epi64( 0xaa, i64 )
#define mm512_set2_64( i1, i0 ) \
mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )
mm512_bcast128( _mm_set_epi64x( i1, i0 ) )
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
@@ -248,105 +250,57 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Reverse byte order of packed elements, vectorized endian conversion.
#define mm512_bswap_64( v ) \
_mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm512_bswap_64( v ) _mm512_shuffle_epi8( v, V512_BSWAP64 )
#define mm512_bswap_32( v ) \
_mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
/* not used
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
*/
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
// Source and destination are pointers, may point to same memory.
// 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
}
#define mm512_block_bswap64_512 mm512_block_bswap_64
#define mm512_block_bswap64_1024( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
casti_m512i( d, 0 ) = mm512_bswap_64( casti_m512i( s, 0 ) ); \
casti_m512i( d, 1 ) = mm512_bswap_64( casti_m512i( s, 1 ) ); \
casti_m512i( d, 2 ) = mm512_bswap_64( casti_m512i( s, 2 ) ); \
casti_m512i( d, 3 ) = mm512_bswap_64( casti_m512i( s, 3 ) ); \
casti_m512i( d, 4 ) = mm512_bswap_64( casti_m512i( s, 4 ) ); \
casti_m512i( d, 5 ) = mm512_bswap_64( casti_m512i( s, 5 ) ); \
casti_m512i( d, 6 ) = mm512_bswap_64( casti_m512i( s, 6 ) ); \
casti_m512i( d, 7 ) = mm512_bswap_64( casti_m512i( s, 7 ) ); \
}
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
casti_m512i( d, 0 ) = mm512_bswap_32( casti_m512i( s, 0 ) ); \
casti_m512i( d, 1 ) = mm512_bswap_32( casti_m512i( s, 1 ) ); \
casti_m512i( d, 2 ) = mm512_bswap_32( casti_m512i( s, 2 ) ); \
casti_m512i( d, 3 ) = mm512_bswap_32( casti_m512i( s, 3 ) ); \
casti_m512i( d, 4 ) = mm512_bswap_32( casti_m512i( s, 4 ) ); \
casti_m512i( d, 5 ) = mm512_bswap_32( casti_m512i( s, 5 ) ); \
casti_m512i( d, 6 ) = mm512_bswap_32( casti_m512i( s, 6 ) ); \
casti_m512i( d, 7 ) = mm512_bswap_32( casti_m512i( s, 7 ) ); \
}
#define mm512_block_bswap32_256 mm512_block_bswap_32
#define mm512_block_bswap32_512( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
}
// Cross-lane shuffles implementing rotation of packed elements.
//
// shuffle 16 bit elements within 64 bit lanes.
#define mm512_shuffle16( v, c ) \
_mm512_shufflehi_epi16( _mm512_shufflelo_epi16( v, c ), c )
// Rotate elements across entire vector.
static inline __m512i mm512_swap_256( const __m512i v )
static inline __m512i mm512_rev_256( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 4 ); }
#define mm512_shuflr_256 mm512_swap_256
#define mm512_shufll_256 mm512_swap_256
#define mm512_swap_256 mm512_rev_256 // grandfathered
static inline __m512i mm512_shuflr_128( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 2 ); }
@@ -394,9 +348,8 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
// Rotate elements within 256 bit lanes of 512 bit vector.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
#define mm512_rev256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_swap256_128 mm512_rev256_128 // grandfathered
// Rotate 256 bit lanes by one 64 bit element
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
@@ -450,15 +403,23 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
#define mm512_rev128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_swap128_64 mm512_rev128_64 // grandfathered
/*not used
#define mm512_rev128_32(v) _mm512_shuffle_epi32( v, 0x1b )
#define mm512_rev128_16(v) mm512_shuffle16( v, 0x1b )
*/
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
/* Not used
#define mm512_shuflr128_16(v) mm512_shuffle16( v, 0x39 )
#define mm512_shufll128_16(v) mm512_shuffle16( v, 0x93 )
// Rotate 128 bit lanes right by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
@@ -476,11 +437,10 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
_mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes
// Not really necessary with AVX512, included for consistency with AVX2/SSE.
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32 mm512_qrev32 // grandfathered
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
@@ -494,9 +454,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
/* Not used
// 32 bit lanes
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_lrev16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )

View File

@@ -0,0 +1,55 @@
#include "simd-utils.h"
#if defined(SIMD512)
const __m512i V512_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f };
const __m512i V512_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b };
#elif defined(__AVX2__)
const __m256i V256_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f };
const __m256i V256_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b };
const __m256i V256_SHUFLR64_8 = { 0x0007060504030201, 0x080f0e0d0c0b0a09,
0x0007060504030201, 0x080f0e0d0c0b0a09 };
const __m256i V256_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b,
0x0201000706050403, 0x0a09080f0e0d0c0b };
const __m256i V256_SHUFLL64_8 = { 0x0605040302010007, 0x0e0d0c0b0a09080f,
0x0605040302010007, 0x0e0d0c0b0a09080f };
const __m256i V256_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d,
0x0403020100070605, 0x0c0b0a09080f0e0d };
const __m256i V256_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09,
0x0407060500030201, 0x0c0f0e0d080b0a09 };
const __m256i V256_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b,
0x0605040702010003, 0x0e0d0c0f0a09080b };
#elif defined(__SSSE3__)
const v128_t V128_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f };
const v128_t V128_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b };
const v128_t V128_SHUFLR64_8 = { 0x0007060504030201, 0x080f0e0d0c0b0a09 };
const v128_t V128_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b };
const v128_t V128_SHUFLL64_8 = { 0x0605040302010007, 0x0e0d0c0b0a09080f };
const v128_t V128_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d };
const v128_t V128_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09 };
const v128_t V128_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b };
#endif
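/* Illustration only: the extern declarations for these constants are assumed to
   live in the corresponding simd-utils headers (not shown in this hunk). With
   AVX2, for example, a 64 bit byte swap then reduces to a single shuffle:
      static inline __m256i bswap64_sketch( __m256i v )
      {  return _mm256_shuffle_epi8( v, V256_BSWAP64 );  }
*/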

View File

@@ -14,10 +14,10 @@
// veor3q( v2, v1, v0 ) xor3 v2 ^ v1 ^ v0
// vxarq_u64( v1, v0, n ) ror64xor ( ( v1 ^ v0 ) >>> n )
// vbcaxq_u{64,32,16,8}( v2, v1, v0 ) xorandnot v2 ^ ( v1 & ~v0 )
// vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n )
//
// not used anywhere yet
// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 )
// vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n )
// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 )
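// A hedged sketch (function names hypothetical) of the ror64xor identity above:
#if defined(__ARM_FEATURE_SHA3)
static inline uint64x2_t ror64xor_sha3_sketch( uint64x2_t v1, uint64x2_t v0 )
{  return vxarq_u64( v1, v0, 21 );  }                                // ( v1 ^ v0 ) >>> 21
#endif
static inline uint64x2_t ror64xor_generic_sketch( uint64x2_t v1, uint64x2_t v0 )
{  uint64x2_t x = veorq_u64( v1, v0 );                               // v1 ^ v0
   return vorrq_u64( vshrq_n_u64( x, 21 ), vshlq_n_u64( x, 43 ) );   // >>> 21
}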
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
@@ -124,7 +124,7 @@
// ~v1 & v0
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
// ~( a ^ b ), same as (~a) ^ b
// ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, args reversed for consistency with x86_64
@@ -136,8 +136,11 @@
// known way to test arm minor version.
#if defined(__ARM_FEATURE_SHA3)
#define v128_xor3 veor3q_u32
#define v128_xor4( v3, v2, v1, v0 ) veorq_u32( v3, veor3q_u32( v2, v1, v0 ) )
#else
#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
#define v128_xor4( v3, v2, v1, v0 ) veorq_u32 ( veorq_u32( v3, v2 ), \
veorq_u32( v1, v0 ) )
#endif
// v2 & v1 & v0
@@ -153,13 +156,13 @@
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#endif
// a ^ ( b & c )
// v2 ^ ( v1 & v0 )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
// a & ( b ^ c )
// v2 & ( v1 ^ v0 )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
// a ^ ( b | c )
// v2 ^ ( v1 | v0 )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
// v2 | ( v1 & v0 )
@@ -240,7 +243,7 @@ typedef union
#define cast_v128u32( p ) (*((uint32x4_t*)(p)))
#define castp_v128u32( p ) ((uint32x4_t*)(p))
// set1
// set1, integer argument
#define v128_64 vmovq_n_u64
#define v128_32 vmovq_n_u32
#define v128_16 vmovq_n_u16
@@ -326,10 +329,59 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
}
// How to build a bitmask from vector elements efficiently? (see sketch below)
#define v128_movmask32
#define v128_movmask64
//#define v128_movmask32
//#define v128_movmask64
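// A hedged sketch (name hypothetical): one known AArch64 approach packs the
// four 32 bit sign bits into a scalar with a narrowing shift.
static inline unsigned v128_movmask32_sketch( uint32x4_t v )
{
   uint16x4_t hi = vshrn_n_u32( v, 16 );        // keep bits [31:16] of each element
   uint64_t packed = vget_lane_u64( vreinterpret_u64_u16( hi ), 0 );
   return ( (packed >> 15) & 1 ) | ( (packed >> 30) & 2 )
        | ( (packed >> 45) & 4 ) | ( (packed >> 60) & 8 );
}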
#define v128_shuffle8( v, vmask ) \
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Bit rotation
/*
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
*/
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
@@ -351,6 +403,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
/* not used
#define v128_ror16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
: vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
@@ -368,6 +421,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_rol8( v, c ) \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )
*/
// ( v1 ^ v0 ) >>> c
#if defined(__ARM_FEATURE_SHA3)
@@ -376,57 +430,13 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
#endif
#define v128_2ror64( v1, v0, c ) \
{ \
uint64x2_t t0 = vshrq_n_u64( v0, c ); \
uint64x2_t t1 = vshrq_n_u64( v1, c ); \
v0 = vsliq_n_u64( v0, 64-(c) ); \
v1 = vsliq_n_u64( v1, 64-(c) ); \
v0 = vorrq_u64( v0, t0 ); \
v1 = vorrq_u64( v1, t1 ); \
}
#define v128_2rol64_( v1, v0, c ) \
{ \
uint64x2_t t0 = vshlq_n_u64( v0, c ); \
uint64x2_t t1 = vshlq_n_u64( v1, c ); \
v0 = vsriq_n_u64( v0, 64-(c) ); \
v1 = vsriq_n_u64( v1, 64-(c) ); \
v0 = vorrq_u64( v0, t0 ); \
v1 = vorrq_u64( v1, t1 ); \
}
#define v128_2rorl32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshrq_n_u32( v0, c ); \
uint32x4_t t1 = vshrq_n_u32( v1, c ); \
v0 = vsliq_n_u32( v0, 32-(c) ); \
v1 = vsliq_n_u32( v1, 32-(c) ); \
v0 = vorrq_u32( v0, t0 ); \
v1 = vorrq_u32( v1, t1 ); \
}
#define v128_2ror32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshlq_n_u32( v0, c ); \
uint32x4_t t1 = vshlq_n_u32( v1, c ); \
v0 = vsriq_n_u32( v0, 32-(c) ); \
v1 = vsriq_n_u32( v1, 32-(c) ); \
v0 = vorrq_u32( v0, t0 ); \
v1 = vorrq_u32( v1, t1 ); \
}
/* not used anywhere and hopefully never will
// vector mask, use as last resort. prefer tbl, rev, alignr, etc
#define v128_shufflev32( v, vmask ) \
v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] )
*/
/* not used
// v1 + ( v0 >> c )
#define v128_addsr64( v1, v0, c ) vsraq_n_u64( v1, v0, c )
#define v128_addsr32( v1, v0, c ) vsraq_n_u32( v1, v0, c )
*/
#define v128_shuffle8( v, vmask ) \
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Cross lane shuffle
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
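// A hedged sketch (names hypothetical) of that equivalence: rotating each
// 64 bit lane by 32 bits and reversing the 32 bit elements within each lane
// are the same permutation, and the rev form is a single instruction.
static inline uint64x2_t ror64_32_shift_sketch( uint64x2_t v )
{  return vorrq_u64( vshrq_n_u64( v, 32 ), vshlq_n_u64( v, 32 ) );  }
static inline uint64x2_t ror64_32_rev_sketch( uint64x2_t v )
{  return (uint64x2_t)vrev64q_u32( (uint32x4_t)v );  }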
@@ -438,19 +448,14 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_qrev16 vrev64q_u16
#define v128_lrev16 vrev32q_u16
// aka bswap
// #define v128_qrev8 vrev64q_u8
// #define v128_lrev8 vrev32q_u8
// #define v128_wrev8 vrev16q_u8
// full vector rotation
// reverse elements in vector
static inline uint64x2_t v128_rev64( uint64x2_t v )
{ return vextq_u64( v, v, 1 ); }
#define v128_swap64 v128_rev64 // grandfathered
#define v128_swap64 v128_rev64 // grandfathered
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -468,7 +473,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
#define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )
// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -482,26 +486,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
}
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_512( dst, src ) \
{ \
casti_v128u32( dst, 0 ) = v128_bswap32( casti_v128u32( src, 0 ) ); \
casti_v128u32( dst, 1 ) = v128_bswap32( casti_v128u32( src, 1 ) ); \
casti_v128u32( dst, 2 ) = v128_bswap32( casti_v128u32( src, 2 ) ); \
casti_v128u32( dst, 3 ) = v128_bswap32( casti_v128u32( src, 3 ) ); \
casti_v128u32( dst, 4 ) = v128_bswap32( casti_v128u32( src, 4 ) ); \
casti_v128u32( dst, 5 ) = v128_bswap32( casti_v128u32( src, 5 ) ); \
casti_v128u32( dst, 6 ) = v128_bswap32( casti_v128u32( src, 6 ) ); \
casti_v128u32( dst, 7 ) = v128_bswap32( casti_v128u32( src, 7 ) ); \
casti_v128u32( dst, 8 ) = v128_bswap32( casti_v128u32( src, 8 ) ); \
casti_v128u32( dst, 9 ) = v128_bswap32( casti_v128u32( src, 9 ) ); \
casti_v128u32( dst,10 ) = v128_bswap32( casti_v128u32( src,10 ) ); \
casti_v128u32( dst,11 ) = v128_bswap32( casti_v128u32( src,11 ) ); \
casti_v128u32( dst,12 ) = v128_bswap32( casti_v128u32( src,12 ) ); \
casti_v128u32( dst,13 ) = v128_bswap32( casti_v128u32( src,13 ) ); \
casti_v128u32( dst,14 ) = v128_bswap32( casti_v128u32( src,14 ) ); \
casti_v128u32( dst,15 ) = v128_bswap32( casti_v128u32( src,15 ) ); \
}
#define v128_block_bswap64( dst, src ) \
{ \
casti_v128u64( dst,0 ) = v128_bswap64( casti_v128u64( src,0 ) ); \
@@ -513,27 +497,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
casti_v128u64( dst,6 ) = v128_bswap64( casti_v128u64( src,6 ) ); \
casti_v128u64( dst,7 ) = v128_bswap64( casti_v128u64( src,7 ) ); \
}
#define v128_block_bswap64_512 v128_block_bswap64
#define v128_block_bswap64_1024( dst, src ) \
{ \
casti_v128u64( dst, 0 ) = v128_bswap64( casti_v128u64( src, 0 ) ); \
casti_v128u64( dst, 1 ) = v128_bswap64( casti_v128u64( src, 1 ) ); \
casti_v128u64( dst, 2 ) = v128_bswap64( casti_v128u64( src, 2 ) ); \
casti_v128u64( dst, 3 ) = v128_bswap64( casti_v128u64( src, 3 ) ); \
casti_v128u64( dst, 4 ) = v128_bswap64( casti_v128u64( src, 4 ) ); \
casti_v128u64( dst, 5 ) = v128_bswap64( casti_v128u64( src, 5 ) ); \
casti_v128u64( dst, 6 ) = v128_bswap64( casti_v128u64( src, 6 ) ); \
casti_v128u64( dst, 7 ) = v128_bswap64( casti_v128u64( src, 7 ) ); \
casti_v128u64( dst, 8 ) = v128_bswap64( casti_v128u64( src, 8 ) ); \
casti_v128u64( dst, 9 ) = v128_bswap64( casti_v128u64( src, 9 ) ); \
casti_v128u64( dst,10 ) = v128_bswap64( casti_v128u64( src,10 ) ); \
casti_v128u64( dst,11 ) = v128_bswap64( casti_v128u64( src,11 ) ); \
casti_v128u64( dst,12 ) = v128_bswap64( casti_v128u64( src,12 ) ); \
casti_v128u64( dst,13 ) = v128_bswap64( casti_v128u64( src,13 ) ); \
casti_v128u64( dst,14 ) = v128_bswap64( casti_v128u64( src,14 ) ); \
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
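// A hedged sketch (name hypothetical) of such a blend on NEON: with each mask
// byte set to 0x00 or 0xff the bitwise select behaves like x86_64 pblendvb.
static inline uint8x16_t v128_blendv_sketch( uint8x16_t mask, uint8x16_t v1, uint8x16_t v0 )
{  return vbslq_u8( mask, v1, v0 );  }   // mask byte 0xff selects v1, 0x00 selects v0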

View File

@@ -173,21 +173,13 @@ static inline int cpu_fanpercent()
return 0;
}
#if defined(__x86_64__)
// x86_64 CPUID
// This list is incomplete, it only contains features of interest to cpuminer.
// refer to http://en.wikipedia.org/wiki/CPUID for details.
// AVX10 compatibility notes
//
// Display format: AVX10.[version]-[vectorwidth]
// AVX10.1-512 is a rebranding of AVX512 and is effectively the AVX* superset
// with full 512 bit vector support.
// AVX10.2-256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
// features applied only to 256 bit and 128 bit vectors.
// Future AVX10 versions will add new instructions and features.
// Register array indexes
#define EAX_Reg (0)
#define EBX_Reg (1)
@@ -209,6 +201,7 @@ static inline int cpu_fanpercent()
// CPU_INFO: EAX=1, ECX=0
// ECX
#define SSE3_Flag 1
#define PCLMULQDQ_Flag (1<< 1)
#define SSSE3_Flag (1<< 9)
#define XOP_Flag (1<<11) // obsolete
#define FMA3_Flag (1<<12)
@@ -239,6 +232,7 @@ static inline int cpu_fanpercent()
#define AVX512_VBMI_Flag (1<< 1)
#define AVX512_VBMI2_Flag (1<< 6)
#define VAES_Flag (1<< 9)
#define VPCLMULQDQ_Flag (1<<10)
#define AVX512_VNNI_Flag (1<<11)
#define AVX512_BITALG_Flag (1<<12)
#define AVX512_VPOPCNTDQ_Flag (1<<14)
@@ -260,6 +254,8 @@ static inline int cpu_fanpercent()
#define AVX512_BF16_Flag (1<< 5)
#define AMX_FP16_Flag (1<<21)
#define AVX_IFMA_Flag (1<<23)
#define MOVRS_Flag (1<<31) // Both names are referenced in docs
#define AVX10_MOVRS_Flag (1<<31)
// EDX
#define AVX_VNNI_INT8_Flag (1<< 4)
#define AVX_NE_CONVERT_Flag (1<< 5)
@@ -271,17 +267,15 @@ static inline int cpu_fanpercent()
// AVX10_FEATURES: EAX=0x24, ECX=0
// EBX
#define AVX10_VERSION_mask 0xff // bits [7:0]
#define AVX10_128_Flag (1<<16)
#define AVX10_256_Flag (1<<17)
#define AVX10_512_Flag (1<<18)
//#define AVX10_128_Flag (1<<16)
//#define AVX10_256_Flag (1<<17)
//#define AVX10_512_Flag (1<<18)
// Use this to detect presence of feature
#define AVX_mask (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag)
#define FMA3_mask (FMA3_Flag|AVX_mask)
#define AVX512_mask (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag)
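// A hedged sketch (not part of this header, assumes GCC/Clang <cpuid.h> is
// acceptable here): testing one of the flags above, e.g. VAES in leaf 7,
// subleaf 0, ECX.
#include <cpuid.h>
static inline int has_vaes_sketch( void )
{
   unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
   if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) ) return 0;
   return ( ecx & VAES_Flag ) ? 1 : 0;
}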
#if defined(__x86_64__)
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
@@ -317,7 +311,7 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
#elif defined(ARM_AUXV)
// Always test if HWCAP variable is defined in the kernel before attempting
// to compile it. If not defined the feature can't be tested and won't be
// to compile this. If not defined the feature can't be tested and won't be
// included in the compile.
// This can occur if compiling with an old kernel and a new CPU and could
// result in a suboptimal build.
@@ -543,6 +537,15 @@ static inline bool cpu_arch_aarch64()
#endif
}
static inline bool cpu_arch_riscv64()
{
#if defined(__riscv) && ( __riscv_xlen == 64 )
return true;
#else
return false;
#endif
}
static inline bool has_sse()
{
#if defined(__x86_64__)
@@ -608,6 +611,16 @@ static inline bool has_neon()
#endif
}
// No apparent CPUID equivalent on riscv, returns SW build info.
static inline bool has_rvv()
{
#if defined(__riscv) && defined(__riscv_vector)
return true;
#else
return false;
#endif
}
static inline bool has_avx()
{
#if defined(__x86_64__)
@@ -897,7 +910,6 @@ static inline bool has_apx_f()
#endif
}
// Not much use on its own
static inline bool has_avx10()
{
#if defined(__x86_64__)
@@ -922,49 +934,6 @@ static inline unsigned int avx10_version()
return 0;
}
// also includes 256 & 128
static inline bool has_avx10_512()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
}
#endif
return false;
}
// Includes 128 but might not include 512
static inline bool has_avx10_256()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
}
#endif
return false;
}
// AVX10 vector register length
static inline unsigned int avx10_vector_length()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_512_Flag ? 512
: ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
}
#endif
return 0;
}
// ARM SVE vector register length, converted from bytes to bits.
static inline int sve_vector_length()
{
@@ -975,6 +944,33 @@ static inline int sve_vector_length()
return 0;
}
// Assume min_vlen refers to the register size
static inline int rvv_vector_length()
{
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}
// generic
static inline int vector_length()
{
#if defined(__x86_64__)
return has_avx10() || has_avx512() ? 512
: has_avx2() ? 256
: has_sse2() ? 128
: 0;
#elif defined(__aarch64__)
return has_sve() ? sve_vector_length()
: has_neon() ? 128
: 0;
#elif defined(__riscv) && defined(__riscv_vector)
return rvv_vector_length();
#endif
return 0;
}
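// Illustration only (name hypothetical): a caller can use vector_length() to
// choose how many parallel hash lanes to run.
static inline int default_lanes_sketch( void )
{
   const int vlen = vector_length();
   if ( vlen >= 512 ) return 8;    // 8x 64 bit lanes per vector
   if ( vlen >= 256 ) return 4;
   if ( vlen >= 128 ) return 2;
   return 1;
}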
static inline uint32_t cpuid_get_highest_function_number()
{
#if defined(__x86_64__)
@@ -1061,13 +1057,17 @@ static inline void cpu_brand_string( char* s )
memcpy( s + 32, cpu_info, sizeof(cpu_info) );
}
#elif defined(__arm__) || defined(__aarch64__)
#elif defined(__aarch64__)
sprintf( s, "ARM 64 bit CPU" );
#elif defined(__riscv) && (__riscv_xlen == 64)
sprintf( s, "RISC-V 64 bit CPU" );
#else
sprintf( s, "unknown CPU architecture" );
sprintf( s, "unknown/unsupported CPU architecture" );
#endif
}

View File

@@ -16,13 +16,8 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
# CPU groups are enabled by default in Makefile, use -U to disable.
export DEFAULT_CFLAGS="-maes -O3 -Wall -U_WIN32_WINNT"
export DEFAULT_CFLAGS_OLD="-O3 -Wall -U_WIN32_WINNT"
#export DEFAULT_CFLAGS="-maes -O3 -Wall"
#export DEFAULT_CFLAGS_OLD="-O3 -Wall"
export DEFAULT_CFLAGS="-maes -O3 -Wall"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
# make link to local gmp header file.
ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
@@ -125,12 +120,12 @@ CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean
#make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
#make -j 8
#strip -s cpuminer.exe