mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2026-02-23 17:03:08 +00:00

Compare commits (10 commits)

| Author | SHA1 | Date |
|---|---|---|
| | b34565bfac | |
| | 8f2f9ec3e9 | |
| | 12480a3ea5 | |
| | aa47e880d5 | |
| | 66191db93c | |
| | dd99580a4c | |
| | 1ed18bf22e | |
| | 1d9341ee92 | |
| | a45a333b40 | |
| | 2b1037a7c7 | |
Makefile.am (52 changed lines)
@@ -1,15 +1,35 @@
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# different path for different CPU arch.

if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif

else

if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
JANSSON_INCLUDES=
EXTRA_INCLUDES =
EXTRA_LIBS =
endif

endif

EXTRA_DIST = example-cfg.json nomacro.pl

SUBDIRS = compat

ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.

bin_PROGRAMS = cpuminer

@@ -23,6 +43,7 @@ cpuminer_SOURCES = \
  sysinfos.c \
  algo-gate-api.c\
  malloc-huge.c \
  simd-utils/simd-constants.c \
  algo/argon2d/argon2d-gate.c \
  algo/argon2d/blake2/blake2b.c \
  algo/argon2d/argon2d/argon2.c \
@@ -166,8 +187,6 @@ cpuminer_SOURCES = \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite-hash-2way.c \
  algo/shavite/shavite-hash-4way.c \
  algo/simd/nist.c \
  algo/simd/vector.c \
  algo/simd/sph_simd.c \
  algo/simd/simd-hash-2way.c \
  algo/skein/sph_skein.c \
@@ -275,28 +294,28 @@ cpuminer_SOURCES = \
  algo/yespower/yespower-ref.c \
  algo/yespower/yespower-blake2b-ref.c

disable_flags =

if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif

if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif

if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags = -DNOASM
endif

cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)

if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
if ARCH_ARM64
cpuminer_CFLAGS += -flax-vector-conversions
endif

if HAVE_WINDOWS

# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg
@@ -310,5 +329,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
	@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
	$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<

endif
README.md (34 changed lines)
@@ -36,44 +36,28 @@ for compile instructions.
Requirements
------------

1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algoritms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.

32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.

Mobile CPUs like laptop computers are not recommended because they aren't
designed for extreme heat of operating at full load for extended periods of
time.

Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.

2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.

Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. WindowsXP 64 bit is YMMV.

FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, IOS and alt OSs like Haiku & ReactOS are not supported.

3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblocktemplate using http:// or https://.

Supported Algorithms
--------------------

allium        Garlicoin
anime         Animecoin
argon2        Argon2 coin (AR2)
argon2d250    argon2d-crds, Credits (CRDS)
argon2d500    argon2d-dyn, Dynamic (DYN)
argon2d4096   argon2d-uis, Unitus, (UIS)
argon2d250
argon2d500
argon2d1000
argon2d4096
blake         Blake-256
blake2b       Blake2-512
blake2s       Blake2-256

@@ -32,8 +32,6 @@ Requirements

32 bit CPUs are not supported.

Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.

Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand, it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.
@@ -75,6 +73,71 @@ If not what makes it happen or not happen?
Change Log
----------

v26.1

Fixed segfault in scrypt algo on some older CPUs.

v25.7

Fixed a bug calculating TTF longer than 1 year.
Faster argon2d.
Faster hamsi AVX512.
Faster switfftx AVX2.
Other small fixes and improvements.

v25.6

Added argon2d1000, argon2d16000 algos.
Target specific AES optimizations improve shavite for ARM64 & x86_64.

v25.5

x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
x86_64: Faster sha256d, sha256dt for AVX2 & AVX512.
Other small bug fixes.

v25.4

x86_64: improved handling of vector constants used for byte permutations.
x86_64: removed hooks for cancelled AVX10-256.
Minor bug fixes & improvements.
More code cleanup.

v25.3

#442, #443: Fixed a regression in Makefile.am.
Removed algo features log display.
Some code cleanup.

v25.2

ARM: Fixed regression from v25.1 that could cause build fail.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with installed jansson library instead of compiling the included source code.
Windows: remove "_WIN32_WINNT=0x0601" which was a downgrade on Win11.
Changed build.sh shell from bash to sh.

v25.1

MacOS ARM64: m7m algo is now working.
MacOS ARM64: can now be compiled with GCC.
MacOS x86_64: is now working compiled with GCC.
Fixed some minor bugs & removed some obsolete code.

v24.8

ARM: Apple MacOS on M series CPU is now supported compiled from source
code, see Wiki for details.
ARM: Fix incorrect compiler version display when using clang.
build.sh can now be used to compile all targets, arm_build.sh & build_msys2.sh
have been removed.
Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
continue to be compiled with CPU groups disabled.

v24.7

ARM: compile works for Windows using MSys2 & MingW, see wiki for details.

v24.6

ARM: Fixed scryptn2, x16*, broken in v24.2.
@@ -295,8 +295,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
{
   case ALGO_ALLIUM:       rc = register_allium_algo       ( gate ); break;
   case ALGO_ANIME:        rc = register_anime_algo        ( gate ); break;
   case ALGO_ARGON2D250:   rc = register_argon2d_crds_algo ( gate ); break;
   case ALGO_ARGON2D500:   rc = register_argon2d_dyn_algo  ( gate ); break;
   case ALGO_ARGON2D250:   rc = register_argon2d250_algo   ( gate ); break;
   case ALGO_ARGON2D500:   rc = register_argon2d500_algo   ( gate ); break;
   case ALGO_ARGON2D1000:  rc = register_argon2d1000_algo  ( gate ); break;
   case ALGO_ARGON2D16000: rc = register_argon2d16000_algo ( gate ); break;
   case ALGO_ARGON2D4096:  rc = register_argon2d4096_algo  ( gate ); break;
   case ALGO_AXIOM:        rc = register_axiom_algo        ( gate ); break;
   case ALGO_BLAKE:        rc = register_blake_algo        ( gate ); break;
@@ -416,8 +418,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
//   alias              proper
   { "argon2d-dyn",    "argon2d500"   },
   { "argon2d-uis",    "argon2d4096"  },
   { "bcd",            "x13bcd"       },
   { "bitcore",        "timetravel10" },
   { "bitzeny",        "yescryptr8"   },

@@ -172,8 +172,11 @@ void ( *set_work_data_endian ) ( struct work* );

json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );

// Deprecated
set_t optimizations;

int ( *get_work_data_size ) ();

int ntime_index;
int nbits_index;
int nonce_index;   // use with caution, see warning below
@@ -274,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,

void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
// OpenSSL sha256 deprecated
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );

bool std_le_work_decode( struct work *work );
bool std_be_work_decode( struct work *work );
@@ -6,9 +6,39 @@ static const size_t INPUT_BYTES = 80; // Lenth of a block header in bytes. Inpu
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS

// Credits
// generic, works with most variations of argon2d
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(64) edata[20];
   uint32_t _ALIGN(64) hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const int thr_id = mythr->id;
   const uint32_t first_nonce = (const uint32_t)pdata[19];
   const uint32_t last_nonce = (const uint32_t)max_nonce;
   uint32_t nonce = first_nonce;
   const bool bench = opt_benchmark;

void argon2d_crds_hash( void *output, const void *input )
   v128_bswap32_80( edata, pdata );
   do
   {
      edata[19] = nonce;
      algo_gate.hash( hash, edata, thr_id );
      if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( nonce );
         submit_solution( work, hash, mythr );
      }
      nonce++;
   } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );

   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce;
   return 0;
}
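The scanner above is generic because it reaches the per-algo hash through algo_gate, so adding another argon2d variant needs only a hash wrapper and a registration function. A minimal sketch of that wiring, assuming a hypothetical variant whose name and cost parameters are invented here for illustration (the gate fields, constants, and helpers are the ones used in this diff):

// Sketch only: "argon2d_example" and its m_cost/t_cost values are invented.
void argon2d_example_hash( void *output, const void *input )
{
   argon2_context context;
   context.out = (uint8_t*)output;
   context.outlen = (uint32_t)OUTPUT_BYTES;
   context.pwd = (uint8_t*)input;
   context.pwdlen = (uint32_t)INPUT_BYTES;
   context.salt = (uint8_t*)input;        // salt = input, as in the variants below
   context.saltlen = (uint32_t)INPUT_BYTES;
   context.secret = NULL;  context.secretlen = 0;
   context.ad = NULL;      context.adlen = 0;
   context.allocate_cbk = NULL;
   context.free_cbk = NULL;
   context.flags = DEFAULT_ARGON2_FLAG;
   context.m_cost = 2048;  // hypothetical: 2 MiB
   context.lanes = 1;
   context.threads = 1;
   context.t_cost = 1;
   context.version = ARGON2_VERSION_10;
   argon2_ctx( &context, Argon2_d );
}

bool register_argon2d_example_algo( algo_gate_t* gate )
{
   gate->scanhash = (void*)&scanhash_argon2d;   // reuse the generic scanner
   gate->hash     = (void*)&argon2d_example_hash;
   opt_target_factor = 65536.0;
   return true;
}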

void argon2d250_hash( void *output, const void *input )
{
   argon2_context context;
   context.out = (uint8_t *)output;
@@ -34,48 +64,15 @@ void argon2d_crds_hash( void *output, const void *input )
   argon2_ctx( &context, Argon2_d );
}

int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
bool register_argon2d250_algo( algo_gate_t* gate )
{
   uint32_t _ALIGN(64) edata[20];
   uint32_t _ALIGN(64) hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   int thr_id = mythr->id;  // thr_id arg is deprecated
   const uint32_t first_nonce = pdata[19];
   const uint32_t Htarg = ptarget[7];
   uint32_t nonce = first_nonce;

   swab32_array( edata, pdata, 20 );

   do {
      be32enc(&edata[19], nonce);
      argon2d_crds_hash( hash, edata );
      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
      {
         pdata[19] = nonce;
         submit_solution( work, hash, mythr );
      }
      nonce++;
   } while (nonce < max_nonce && !work_restart[thr_id].restart);

   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}

bool register_argon2d_crds_algo( algo_gate_t* gate )
{
   gate->scanhash = (void*)&scanhash_argon2d_crds;
   gate->hash = (void*)&argon2d_crds_hash;
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   gate->scanhash = (void*)&scanhash_argon2d;
   gate->hash = (void*)&argon2d250_hash;
   opt_target_factor = 65536.0;
   return true;
}

// Dynamic

void argon2d_dyn_hash( void *output, const void *input )
void argon2d500_hash( void *output, const void *input )
{
   argon2_context context;
   context.out = (uint8_t *)output;
@@ -101,48 +98,81 @@ void argon2d_dyn_hash( void *output, const void *input )
   argon2_ctx( &context, Argon2_d );
}

int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
bool register_argon2d500_algo( algo_gate_t* gate )
{
   uint32_t _ALIGN(64) edata[20];
   uint32_t _ALIGN(64) hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const int thr_id = mythr->id;
   const uint32_t first_nonce = (const uint32_t)pdata[19];
   const uint32_t last_nonce = (const uint32_t)max_nonce;
   uint32_t nonce = first_nonce;
   const bool bench = opt_benchmark;

   v128_bswap32_80( edata, pdata );
   do
   {
      edata[19] = nonce;
      argon2d_dyn_hash( hash, edata );
      if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
         && !bench ) )
      {
         pdata[19] = bswap_32( nonce );;
         submit_solution( work, hash, mythr );
      }
      nonce++;
   } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );

   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce;
   return 0;
}

bool register_argon2d_dyn_algo( algo_gate_t* gate )
{
   gate->scanhash = (void*)&scanhash_argon2d_dyn;
   gate->hash = (void*)&argon2d_dyn_hash;
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   gate->scanhash = (void*)&scanhash_argon2d;
   gate->hash = (void*)&argon2d500_hash;
   opt_target_factor = 65536.0;
   return true;
}

// Unitus
void argon2d1000_hash( void *output, const void *input )
{
   argon2_context context;
   context.out = (uint8_t *)output;
   context.outlen = (uint32_t)OUTPUT_BYTES;
   context.pwd = (uint8_t *)input;
   context.pwdlen = (uint32_t)INPUT_BYTES;
   context.salt = (uint8_t *)input; //salt = input
   context.saltlen = (uint32_t)INPUT_BYTES;
   context.secret = NULL;
   context.secretlen = 0;
   context.ad = NULL;
   context.adlen = 0;
   context.allocate_cbk = NULL;
   context.free_cbk = NULL;
   context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
   // main configurable Argon2 hash parameters
   context.m_cost = 1000;  // Memory in KiB (1MB)
   context.lanes = 8;      // Degree of Parallelism
   context.threads = 1;    // Threads
   context.t_cost = 2;     // Iterations
   context.version = ARGON2_VERSION_10;

   argon2_ctx( &context, Argon2_d );
}

bool register_argon2d1000_algo( algo_gate_t* gate )
{
   gate->scanhash = (void*)&scanhash_argon2d;
   gate->hash = (void*)&argon2d1000_hash;
   opt_target_factor = 65536.0;
   return true;
}

void argon2d16000_hash( void *output, const void *input )
{
   argon2_context context;
   context.out = (uint8_t *)output;
   context.outlen = (uint32_t)OUTPUT_BYTES;
   context.pwd = (uint8_t *)input;
   context.pwdlen = (uint32_t)INPUT_BYTES;
   context.salt = (uint8_t *)input; //salt = input
   context.saltlen = (uint32_t)INPUT_BYTES;
   context.secret = NULL;
   context.secretlen = 0;
   context.ad = NULL;
   context.adlen = 0;
   context.allocate_cbk = NULL;
   context.free_cbk = NULL;
   context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
   // main configurable Argon2 hash parameters
   context.m_cost = 16000; // Memory in KiB (~16384KB)
   context.lanes = 1;      // Degree of Parallelism
   context.threads = 1;    // Threads
   context.t_cost = 1;     // Iterations
   context.version = ARGON2_VERSION_10;

   argon2_ctx( &context, Argon2_d );
}

bool register_argon2d16000_algo( algo_gate_t* gate )
{
   gate->scanhash = (void*)&scanhash_argon2d;
   gate->hash = (void*)&argon2d16000_hash;
   opt_target_factor = 65536.0;
   return true;
}

int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
@@ -154,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = (const uint32_t)max_nonce;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;  // thr_id arg is deprecated
   const int thr_id = mythr->id;
   uint32_t t_cost = 1;       // 1 iteration
   uint32_t m_cost = 4096;    // use 4MB
   uint32_t parallelism = 1;  // 1 thread, 2 lanes
@@ -182,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
bool register_argon2d4096_algo( algo_gate_t* gate )
{
   gate->scanhash = (void*)&scanhash_argon2d4096;
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT |NEON_OPT;
   opt_target_factor = 65536.0;
   return true;
}
@@ -4,22 +4,27 @@
#include "algo-gate-api.h"
#include <stdint.h>

// Credits: version = 0x10, m_cost = 250.
bool register_argon2d_crds_algo( algo_gate_t* gate );

void argon2d_crds_hash( void *state, const void *input );

int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );

// Credits: version = 0x10, m_cost = 250.
bool register_argon2d250_algo( algo_gate_t* gate );

void argon2d250_hash( void *state, const void *input );

// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d_dyn_algo( algo_gate_t* gate );
bool register_argon2d500_algo( algo_gate_t* gate );

void argon2d_dyn_hash( void *state, const void *input );
void argon2d500_hash( void *state, const void *input );

int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
// Zero Dynamics Cash: version = 0x10, m_cost = 1000.
bool register_argon2d1000_algo( algo_gate_t* gate );

void argon2d1000_hash( void *state, const void *input );

bool register_argon2d16000_algo( algo_gate_t* gate );

void argon2d16000_hash( void *state, const void *input );

// Unitus: version = 0x13, m_cost = 4096.
bool register_argon2d4096_algo( algo_gate_t* gate );
@@ -66,82 +66,60 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)

#if defined(__SSSE3__) || defined(__ARM_NEON)

#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
   v128_t t0 = v128_alignr8(B1, B0, 8); \
   v128_t t1 = v128_alignr8(B0, B1, 8); \
   B0 = t0; \
   B1 = t1; \
   \
   t0 = C0; \
   C0 = C1; \
   C1 = t0; \
   \
   t0 = v128_alignr8(D1, D0, 8); \
   t1 = v128_alignr8(D0, D1, 8); \
   D0 = t1; \
   D1 = t0; \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
   v128_t t = v128_alignr8( B1, B0, 8 ); \
   B1 = v128_alignr8( B0, B1, 8 ); \
   B0 = t; \
   t = v128_alignr8( D1, D0, 8 ); \
   D0 = v128_alignr8( D0, D1, 8 ); \
   D1 = t; \
}

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
   v128_t t0 = v128_alignr8(B0, B1, 8); \
   v128_t t1 = v128_alignr8(B1, B0, 8); \
   B0 = t0; \
   B1 = t1; \
   \
   t0 = C0; \
   C0 = C1; \
   C1 = t0; \
   \
   t0 = v128_alignr8(D0, D1, 8); \
   t1 = v128_alignr8(D1, D0, 8); \
   D0 = t1; \
   D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
   v128_t t = v128_alignr8( B0, B1, 8 ); \
   B1 = v128_alignr8( B1, B0, 8 ); \
   B0 = t; \
   t = v128_alignr8( D0, D1, 8 ); \
   D0 = v128_alignr8( D1, D0, 8 ); \
   D1 = t; \
}

#else /* SSE2 */

#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
   v128_t t0 = D0; \
   v128_t t1 = B0; \
   D0 = C0; \
   C0 = C1; \
   C1 = D0; \
   D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
   D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
   B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \
   B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
   v128_t t = D0; \
   D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \
   D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \
   t = B0; \
   B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
   B1 = v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \
}

#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
   v128_t t = B0; \
   B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
   B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \
   t = D0; \
   D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
   D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \
}

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
   v128_t t0, t1; \
   t0 = C0; \
   C0 = C1; \
   C1 = t0; \
   t0 = B0; \
   t1 = D0; \
   B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \
   B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
   D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \
   D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#endif
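Both versions of these macros implement the same permutation: with the 4x4 matrix of 64-bit words held as row vectors, rows 1 through 3 rotate left by 1 through 3 lanes so the column-wise G function mixes the diagonals, and UNDIAGONALIZE rotates them back. The rewritten macros also drop the old C-half swap; BLAKE2_ROUND below compensates by passing C1 and C0 swapped into the second G pass. A scalar sketch of the rotation, assuming the standard row-major v[16] state layout (not code from this diff):

/* Scalar view of DIAGONALIZE on the 4x4 BLAKE2b state v[16], row-major.
   After this, the column-wise G function mixes the diagonals. */
static void diagonalize( uint64_t v[16] )
{
   uint64_t t;
   // row 1: rotate left by 1
   t = v[4];  v[4] = v[5];  v[5] = v[6];  v[6] = v[7];  v[7] = t;
   // row 2: rotate left by 2 (two disjoint swaps)
   t = v[8];  v[8] = v[10]; v[10] = t;
   t = v[9];  v[9] = v[11]; v[11] = t;
   // row 3: rotate left by 3 (same as rotate right by 1)
   t = v[15]; v[15] = v[14]; v[14] = v[13]; v[13] = v[12]; v[12] = t;
}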

#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
   G1(A0, B0, C0, D0, A1, B1, C1, D1); \
   G2(A0, B0, C0, D0, A1, B1, C1, D1); \
   \
   DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
   \
   G1(A0, B0, C0, D0, A1, B1, C1, D1); \
   G2(A0, B0, C0, D0, A1, B1, C1, D1); \
   \
   UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
{ \
   G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
   G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
   DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
   G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \
   G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \
   UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
}

#else /* __AVX2__ */

#include <immintrin.h>

@@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
   B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
   C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
   D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
   \
   B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
   C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
   D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
@@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)

#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
   __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
   __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
   B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
   tmp1 = C0; \
   B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
   C0 = C1; \
   tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
   C1 = tmp1; \
   __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \
   __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \
   B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
   B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
   tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
   D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
   D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
   tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
   D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
   D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
} while(0);

#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
   B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
   C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
   D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
   \
   B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
   C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
   D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
@@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do { \
   __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
   __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
   B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
   tmp1 = C0; \
   B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
   C0 = C1; \
   B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
   B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
   tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
   C1 = tmp1; \
   tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
   D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
   D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
   D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
   D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
} while((void)0, 0);

#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
   G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
   G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
   \
   DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
   \
   G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
   G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
   \
   UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0);

@@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do{ \
   G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
   G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
   \
   DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
   \
   G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
   G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
   \
   G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
   G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
   UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0);

@@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)

#include <immintrin.h>

/*
static inline __m512i muladd(__m512i x, __m512i y)
{
   __m512i z = _mm512_mul_epu32(x, y);
   return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
}
*/

#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
   __m512i z0, z1; \
   z0 = _mm512_mul_epu32( A0, B0 ); \
   z1 = _mm512_mul_epu32( A1, B1 ); \
   A0 = _mm512_add_epi64( A0, B0 ); \
   A1 = _mm512_add_epi64( A1, B1 ); \
   z0 = _mm512_add_epi64( z0, z0 ); \
   z1 = _mm512_add_epi64( z1, z1 ); \
   A0 = _mm512_add_epi64( A0, z0 ); \
   A1 = _mm512_add_epi64( A1, z1 ); \
   D0 = _mm512_xor_si512(D0, A0); \
   D1 = _mm512_xor_si512(D1, A1); \
   D0 = _mm512_ror_epi64(D0, 32); \
   D1 = _mm512_ror_epi64(D1, 32); \
   z0 = _mm512_mul_epu32( C0, D0 ); \
   z1 = _mm512_mul_epu32( C1, D1 ); \
   C0 = _mm512_add_epi64( C0, D0 ); \
   C1 = _mm512_add_epi64( C1, D1 ); \
   z0 = _mm512_add_epi64( z0, z0 ); \
   z1 = _mm512_add_epi64( z1, z1 ); \
   C0 = _mm512_add_epi64( C0, z0 ); \
   C1 = _mm512_add_epi64( C1, z1 ); \
   B0 = _mm512_xor_si512(B0, C0); \
   B1 = _mm512_xor_si512(B1, C1); \
   B0 = _mm512_ror_epi64(B0, 24); \
   B1 = _mm512_ror_epi64(B1, 24); \
}

#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
   __m512i z0, z1; \
   z0 = _mm512_mul_epu32( A0, B0 ); \
   z1 = _mm512_mul_epu32( A1, B1 ); \
   A0 = _mm512_add_epi64( A0, B0 ); \
   A1 = _mm512_add_epi64( A1, B1 ); \
   z0 = _mm512_add_epi64( z0, z0 ); \
   z1 = _mm512_add_epi64( z1, z1 ); \
   A0 = _mm512_add_epi64( A0, z0 ); \
   A1 = _mm512_add_epi64( A1, z1 ); \
   D0 = _mm512_xor_si512(D0, A0); \
   D1 = _mm512_xor_si512(D1, A1); \
   D0 = _mm512_ror_epi64(D0, 16); \
   D1 = _mm512_ror_epi64(D1, 16); \
   z0 = _mm512_mul_epu32( C0, D0 ); \
   z1 = _mm512_mul_epu32( C1, D1 ); \
   C0 = _mm512_add_epi64( C0, D0 ); \
   C1 = _mm512_add_epi64( C1, D1 ); \
   z0 = _mm512_add_epi64( z0, z0 ); \
   z1 = _mm512_add_epi64( z1, z1 ); \
   C0 = _mm512_add_epi64( C0, z0 ); \
   C1 = _mm512_add_epi64( C1, z1 ); \
   B0 = _mm512_xor_si512(B0, C0); \
   B1 = _mm512_xor_si512(B1, C1); \
   B0 = _mm512_ror_epi64(B0, 63); \
   B1 = _mm512_ror_epi64(B1, 63); \
}
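These G1/G2 macros inline the BlaMka primitive that the commented-out muladd expressed directly: Blake2b's additions hardened with a 32x32 bit multiply. A scalar reference for a single 64-bit lane, taken from the Argon2 specification rather than this diff:

#include <stdint.h>

/* BlaMka: a + b becomes a + b + 2 * lo32(a) * lo32(b).
   _mm512_mul_epu32 computes the lo32 products for eight lanes at once. */
static inline uint64_t blamka( uint64_t x, uint64_t y )
{
   uint64_t z = (uint64_t)(uint32_t)x * (uint64_t)(uint32_t)y;
   return x + y + 2 * z;
}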

/*
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
   A0 = muladd(A0, B0); \
@@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
   B0 = _mm512_ror_epi64(B0, 24); \
   B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0)
*/
/*
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
   A0 = muladd(A0, B0); \
@@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y)
   B0 = _mm512_ror_epi64(B0, 63); \
   B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0)
*/

#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
   B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
   B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
   \
   C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
   C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
   \
   D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
   D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while ((void)0, 0)
@@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
   B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
   B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
   \
   C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
   C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
   \
   D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
   D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while ((void)0, 0)
@@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
   G1(A0, B0, C0, D0, A1, B1, C1, D1); \
   G2(A0, B0, C0, D0, A1, B1, C1, D1); \
   \
   DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
   \
   G1(A0, B0, C0, D0, A1, B1, C1, D1); \
   G2(A0, B0, C0, D0, A1, B1, C1, D1); \
   \
   UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)

static const __m512i swap_q0  = { 0,1, 8,9, 2,3, 10,11 };
static const __m512i swap_q1  = { 4,5, 12,13, 6,7, 14,15 };
static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 };
static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 };

#define SWAP_HALVES(A0, A1) \
do { \
   __m512i t; \
@@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y)
   A0 = t; \
} while((void)0, 0)

#define SWAP_QUARTERS(A0, A1) \
{ \
   __m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \
   A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \
   A0 = t; \
}

#define UNSWAP_QUARTERS(A0, A1) \
{ \
   __m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \
   A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \
   A0 = t; \
}
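The new SWAP_QUARTERS and UNSWAP_QUARTERS replace the old SWAP_HALVES plus _mm512_shuffle_i64x2 sequences with single two-source permutes. In the index vectors above, values 0 through 7 select 64-bit lanes from the first source and 8 through 15 from the second, so swap_q0 interleaves the 128-bit quarters from the low halves of A0 and A1 while swap_q1 does the same for the high halves. A scalar model of the indexing (a sketch of the intrinsic's documented behavior, not code from this diff):

#include <stdint.h>

/* Scalar model of _mm512_permutex2var_epi64( a, idx, b ):
   index bits 0..2 pick the lane, bit 3 picks the source vector. */
static void permutex2var_epi64( uint64_t r[8], const uint64_t a[8],
                                const uint64_t idx[8], const uint64_t b[8] )
{
   for ( int i = 0; i < 8; i++ )
      r[i] = ( idx[i] & 8 ) ? b[ idx[i] & 7 ] : a[ idx[i] & 7 ];
}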

/*
#define SWAP_QUARTERS(A0, A1) \
do { \
   SWAP_HALVES(A0, A1); \
   A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
   A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
} while((void)0, 0)
*/
/*
#define UNSWAP_QUARTERS(A0, A1) \
do { \
   A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
   A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
   SWAP_HALVES(A0, A1); \
} while((void)0, 0)
*/

#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
do { \

@@ -6,15 +6,15 @@

#if defined (BLAKE_4WAY)

blake256r14_4way_context blake_4w_ctx;
blake256r14_4x32_context blake_4w_ctx;

void blakehash_4way(void *state, const void *input)
{
   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
   blake256r14_4way_context ctx;
   blake256r14_4x32_context ctx;
   memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
   blake256r14_4way_update( &ctx, input + (64<<2), 16 );
   blake256r14_4way_close( &ctx, vhash );
   blake256r14_4x32_update( &ctx, input + (64<<2), 16 );
   blake256r14_4x32_close( &ctx, vhash );
   dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}

@@ -35,8 +35,8 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
   HTarget = 0x7f;

   v128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256r14_4way_init( &blake_4w_ctx );
   blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
   blake256r14_4x32_init( &blake_4w_ctx );
   blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 );

   do {
      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,

#if defined(BLAKE_8WAY)

blake256r14_8way_context blake_8w_ctx;
blake256r14_8x32_context blake_8w_ctx;

void blakehash_8way( void *state, const void *input )
{
   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
   blake256r14_8way_context ctx;
   blake256r14_8x32_context ctx;
   memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
   blake256r14_8way( &ctx, input + (64<<3), 16 );
   blake256r14_8way_close( &ctx, vhash );
   blake256r14_8x32( &ctx, input + (64<<3), 16 );
   blake256r14_8x32_close( &ctx, vhash );
   _dintrlv_8x32( state, state+ 32, state+ 64, state+ 96,
                  state+128, state+160, state+192, state+224,
                  vhash, 256 );
@@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,

   mm256_bswap32_intrlv80_8x32( vdata, pdata );

   blake256r14_8way_init( &blake_8w_ctx );
   blake256r14_8way( &blake_8w_ctx, vdata, 64 );
   blake256r14_8x32_init( &blake_8w_ctx );
   blake256r14_8x32( &blake_8w_ctx, vdata, 64 );

   do {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,

@@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
   (state)->T1 = T1; \
} while (0)

#if defined(__SSSE3__)

#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
   v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
                                     0x0405060700010203 ); \
   M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
   M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
   M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
   M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
   M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
   M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
   M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
   M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
   M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
   M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
   MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
   MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
   MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
   MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
   ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
   MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
}

#else // SSE2

#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
   M0 = v128_bswap32( buf[0] ); \
@@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
   MF = v128_bswap32( buf[15] ); \
}

#endif // SSSE3 else SSE2

#define COMPRESS32_4X32( rounds ) \
{ \
   v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
      ROUND_S_4X32_3;
   }

#if defined(__SSSE3__)

   const v128_t shuf_bswap32 =
                v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );

   H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
   H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
   H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
   H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
   H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
   H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
   H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
   H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );

#else

   H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
   H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
   H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
@@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
   H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
   H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
   H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );

#endif
}

#if defined (__AVX2__)
@@ -1291,24 +1244,22 @@ do { \
   VD = v256_32( T0 ^ 0x299F31D0 ); \
   VE = v256_32( T1 ^ 0x082EFA98 ); \
   VF = v256_32( T1 ^ 0xEC4E6C89 ); \
   const __m256i shuf_bswap32 = mm256_set2_64( \
                  0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
   M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
   M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
   M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
   M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
   M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
   M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
   M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
   M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
   MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
   MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
   MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
   MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
   ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
   MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
   M0 = mm256_bswap_32( * buf     ); \
   M1 = mm256_bswap_32( *(buf+ 1) ); \
   M2 = mm256_bswap_32( *(buf+ 2) ); \
   M3 = mm256_bswap_32( *(buf+ 3) ); \
   M4 = mm256_bswap_32( *(buf+ 4) ); \
   M5 = mm256_bswap_32( *(buf+ 5) ); \
   M6 = mm256_bswap_32( *(buf+ 6) ); \
   M7 = mm256_bswap_32( *(buf+ 7) ); \
   M8 = mm256_bswap_32( *(buf+ 8) ); \
   M9 = mm256_bswap_32( *(buf+ 9) ); \
   MA = mm256_bswap_32( *(buf+10) ); \
   MB = mm256_bswap_32( *(buf+11) ); \
   MC = mm256_bswap_32( *(buf+12) ); \
   MD = mm256_bswap_32( *(buf+13) ); \
   ME = mm256_bswap_32( *(buf+14) ); \
   MF = mm256_bswap_32( *(buf+15) ); \
   ROUND_S_8WAY(0); \
   ROUND_S_8WAY(1); \
   ROUND_S_8WAY(2); \
@@ -1401,7 +1352,7 @@ do { \
   H7 = mm256_xor3( VF, V7, H7 ); \
}

void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
                                      void *data )
{
   __m256i *M = (__m256i*)data;
@@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
                     _mm256_xor_si256( v256_32( CSE ), M[15] ) );
}

void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
                    const void *midhash, const void *data, const int rounds )
{
   __m256i *H = (__m256i*)final_hash;
@@ -1596,17 +1547,14 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
      ROUND256_8WAY_3;
   }

   const __m256i shuf_bswap32 =
                 mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );

   H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
   H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
   H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
   H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
   H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
   H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
   H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
   H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
   H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) );
   H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) );
   H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) );
   H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) );
   H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) );
   H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) );
   H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) );
   H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) );
}

#endif

@@ -1933,8 +1881,6 @@ do { \
   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
   __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
   const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
                       0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
   V0 = H0; \
   V1 = H1; \
   V2 = H2; \
@@ -1951,22 +1897,22 @@ do { \
   VD = v512_32( T0 ^ 0x299F31D0 ); \
   VE = v512_32( T1 ^ 0x082EFA98 ); \
   VF = v512_32( T1 ^ 0xEC4E6C89 ); \
   M0 = _mm512_shuffle_epi8( * buf    , shuf_bswap32 ); \
   M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
   M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
   M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
   M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
   M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
   M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
   M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
   M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
   M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
   MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
   MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
   MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
   MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
   ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
   MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
   M0 = mm512_bswap_32( * buf     ); \
   M1 = mm512_bswap_32( *(buf+ 1) ); \
   M2 = mm512_bswap_32( *(buf+ 2) ); \
   M3 = mm512_bswap_32( *(buf+ 3) ); \
   M4 = mm512_bswap_32( *(buf+ 4) ); \
   M5 = mm512_bswap_32( *(buf+ 5) ); \
   M6 = mm512_bswap_32( *(buf+ 6) ); \
   M7 = mm512_bswap_32( *(buf+ 7) ); \
   M8 = mm512_bswap_32( *(buf+ 8) ); \
   M9 = mm512_bswap_32( *(buf+ 9) ); \
   MA = mm512_bswap_32( *(buf+10) ); \
   MB = mm512_bswap_32( *(buf+11) ); \
   MC = mm512_bswap_32( *(buf+12) ); \
   MD = mm512_bswap_32( *(buf+13) ); \
   ME = mm512_bswap_32( *(buf+14) ); \
   MF = mm512_bswap_32( *(buf+15) ); \
   ROUND_S_16WAY(0); \
   ROUND_S_16WAY(1); \
   ROUND_S_16WAY(2); \
@@ -2063,7 +2009,7 @@ do { \
// is constant for every nonce and only needs to be run once per job. The
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
                                       void *data )
{
   __m512i *M = (__m512i*)data;
@@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
}
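The comment above describes the split this rename carries over: round 0 of the first compression depends only on data that is constant for a job, so it is hoisted out of the nonce loop. A sketch of the intended call pattern, with the buffer setup, nonce stepping, and hash testing assumed from the scanhash code elsewhere in this codebase:

__m512i midstate[16], midhash[8], vdata[20], hash[8];

/* Once per job: precompute everything that doesn't depend on the nonce. */
blake256_16x32_round0_prehash_le( midstate, midhash, vdata );

for ( uint32_t n = first_nonce; n < last_nonce; n += 16 )
{
   /* Per group of 16 lane nonces: only the remaining rounds are redone. */
   blake256_16x32_final_rounds_le( hash, midstate, midhash, vdata, 14 );
   /* ... test the 16 hashes, then bump the nonces in vdata ... */
}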
|
||||
|
||||
// Dfault is 14 rounds, blakecoin & vanilla are 8.
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data, const int rounds )
|
||||
{
|
||||
__m512i *H = (__m512i*)final_hash;
|
||||
@@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
}
|
||||
|
||||
// Byte swap final hash
|
||||
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
||||
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
||||
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
||||
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
||||
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
||||
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
||||
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
||||
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
||||
H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) );
|
||||
H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) );
|
||||
H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) );
|
||||
H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) );
|
||||
H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) );
|
||||
H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) );
|
||||
H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) );
|
||||
H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
int rounds )
|
||||
{
|
||||
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
|
||||
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,
|
||||
|
||||
// Blake-256 8 way
|
||||
|
||||
static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv,
|
||||
int rounds )
|
||||
{
|
||||
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
|
||||
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
|
||||
blake32_8way( blake256_8x32_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buf;
|
||||
@@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
__m256i buf[16];
|
||||
@@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
|
||||
blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buf;
|
||||
@@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
blake32_8way_close_le( blake256_8x32_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
__m256i buf[16];
|
||||
@@ -2622,8 +2563,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
//Blake-256 16 way AVX512
|
||||
|
||||
static void
|
||||
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv,
|
||||
int rounds )
|
||||
{
|
||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
|
||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
|
||||
blake32_16way( blake256_16x32_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
@@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
static void
|
||||
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
__m512i buf[16];
|
||||
@@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
|
||||
blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
@@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
__m512i buf[16];
|
||||
@@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_init(void *cc)
|
||||
blake256_16x32_init(void *cc)
|
||||
{
|
||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
||||
blake32_16way_init( cc, IV256, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_update(void *cc, const void *data, size_t len)
|
||||
blake256_16x32_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_close(void *cc, void *dst)
|
||||
blake256_16x32_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_update_le(void *cc, const void *data, size_t len)
|
||||
blake256_16x32_update_le(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way_le(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_close_le(void *cc, void *dst)
|
||||
blake256_16x32_close_le(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close_le(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void blake256r14_16way_init(void *cc)
|
||||
{
|
||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
||||
blake32_16way_init( cc, IV256, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_16way_update(void *cc, const void *data, size_t len)
|
||||
blake256r14_16x32_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_16way_close(void *cc, void *dst)
|
||||
blake256r14_16x32_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void blake256r8_16way_init(void *cc)
|
||||
{
|
||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
|
||||
blake32_16way_init( cc, IV256, 8 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_16way_update(void *cc, const void *data, size_t len)
|
||||
blake256r8_16x32_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_16way_close(void *cc, void *dst)
|
||||
blake256r8_16x32_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||
}
@@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst)
void
blake256_4x32_init(void *ctx)
{
blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( ctx, IV256, 14 );
}

void
@@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst)
// Blake-256 8 way

void
blake256_8way_init(void *cc)
blake256_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}

void
blake256_8way_update(void *cc, const void *data, size_t len)
blake256_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}

void
blake256_8way_close(void *cc, void *dst)
blake256_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}

void
blake256_8way_update_le(void *cc, const void *data, size_t len)
blake256_8x32_update_le(void *cc, const void *data, size_t len)
{
blake32_8way_le(cc, data, len);
}

void
blake256_8way_close_le(void *cc, void *dst)
blake256_8x32_close_le(void *cc, void *dst)
{
blake32_8way_close_le(cc, 0, 0, dst, 8);
}
@@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst)
// 14 rounds Blake, Decred
void blake256r14_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( cc, IV256, 14 );
}

void
@@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst)

#if defined(__AVX2__)

void blake256r14_8way_init(void *cc)
void blake256r14_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}

void
blake256r14_8way_update(void *cc, const void *data, size_t len)
blake256r14_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}

void
blake256r14_8way_close(void *cc, void *dst)
blake256r14_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
@@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst)
// 8 rounds Blakecoin, Vanilla
void blake256r8_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
blake32_4x32_init( cc, IV256, 8 );
}

void
@@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst)

#if defined (__AVX2__)

void blake256r8_8way_init(void *cc)
void blake256r8_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_8way_init( cc, IV256, 8 );
}

void
blake256r8_8way_update(void *cc, const void *data, size_t len)
blake256r8_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}

void
blake256r8_8way_close(void *cc, void *dst)
blake256r8_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
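
The renames above follow the library's move from counting lanes alone (blake256_16way) to a lanes-by-word-width convention (blake256_16x32: sixteen 32-bit lanes), and drop the now-unused salt argument from blake32_16way_init. For out-of-tree callers still using the old spellings, a transition shim in the same style as the header's own alias macros might look like this (hypothetical sketch; only names visible in this diff are assumed):

/* Hypothetical compatibility aliases: map old *_16way names onto *_16x32. */
#define blake256_16way_context blake256_16x32_context
#define blake256_16way_init    blake256_16x32_init
#define blake256_16way_update  blake256_16x32_update
#define blake256_16way_close   blake256_16x32_close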

@@ -29,13 +29,6 @@ typedef struct

void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds );
/*
void blake256_init( blake256_context *sc );
void blake256_update( blake256_context *sc, const void *data, size_t len );
void blake256_close( blake256_context *sc, void *dst );
void blake256_full( blake256_context *sc, void *dst, const void *data,
size_t len );
*/

//////////////////////////////////
//
@@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context;
void blake256_4x32_init(void *ctx);
void blake256_4x32_update(void *ctx, const void *data, size_t len);
void blake256_4x32_close(void *ctx, void *dst);
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );

// 14 rounds
typedef blake_4x32_small_context blake256r14_4x32_context;
@@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc);
void blake256r8_4x32_update(void *cc, const void *data, size_t len);
void blake256r8_4x32_close(void *cc, void *dst);

void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );

#define blake_4way_small_context blake256_4x32_context
#define blake256_4way_context blake256_4x32_context
#define blake256_4way_init blake256_4x32_init
#define blake256_4way_update blake256_4x32_update
#define blake256_4way_close blake256_4x32_close
#define blake256_4way_update_le blake256_4x32_update_le
#define blake256_4way_close_le blake256_4x32_close_le
#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le
#define blake256_4way_final_rounds_le blake256_4x32_final_rounds_le
#define blake256r14_4way_context blake256r14_4x32_context
#define blake256r14_4way_init blake256r14_4x32_init
#define blake256r14_4way_update blake256r14_4x32_update
#define blake256r14_4way_close blake256r14_4x32_close
#define blake256r8_4way_context blake256r14_4x32_context
#define blake256r8_4way_init blake256r14_4x32_init
#define blake256r8_4way_update blake256r14_4x32_update
#define blake256r8_4way_close blake256r14_4x32_close

#ifdef __AVX2__

//////////////////////////////
@@ -107,45 +81,28 @@ typedef struct
} blake_8way_small_context;

// Default 14 rounds
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
typedef blake_8way_small_context blake256_8x32_context;
void blake256_8x32_init(void *cc);
void blake256_8x32_update(void *cc, const void *data, size_t len);
void blake256_8x32_close(void *cc, void *dst);
void blake256_8x32_update_le(void *cc, const void *data, size_t len);
void blake256_8x32_close_le(void *cc, void *dst);
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );

// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
void blake256r14_8way_init(void *cc);
void blake256r14_8way_update(void *cc, const void *data, size_t len);
void blake256r14_8way_close(void *cc, void *dst);
typedef blake_8way_small_context blake256r14_8x32_context;
void blake256r14_8x32_init(void *cc);
void blake256r14_8x32_update(void *cc, const void *data, size_t len);
void blake256r14_8x32_close(void *cc, void *dst);

// 8 rounds, blakecoin, vanilla
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);

#define blake_8x32_small_context blake256_8way_context
#define blake_8x32_init blake256_8way_init
#define blake_8x32_update blake256_8way_update
#define blake_8x32_close blake256_8way_close
#define blake_8x32_update_le blake256_8way_update_le
#define blake_8x32_close_le blake256_8way_close_le
#define blake_8x32_round0_prehash_le blake256_8way_round0_prehash
#define blake_8x32_final_rounds_le blake256_8way_final_rounds_le
#define blake256r14_8x32_context blake256r14_8way_context
#define blake256r14_8x32_init blake256r14_8way_init
#define blake256r14_8x32_update blake256r14_8way_update
#define blake256r14_8x32_close blake256r14_8way_close
#define blake256r8_8x32_context blake256r14_8way_context
#define blake256r8_8x32_init blake256r14_8way_init
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close
typedef blake_8way_small_context blake256r8_8x32_context;
void blake256r8_8x32_init(void *cc);
void blake256r8_8x32_update(void *cc, const void *data, size_t len);
void blake256r8_8x32_close(void *cc, void *dst);

#if defined(SIMD512)

@@ -163,46 +120,29 @@ typedef struct
} blake_16way_small_context __attribute__ ((aligned (128)));

// Default 14 rounds
typedef blake_16way_small_context blake256_16way_context;
void blake256_16way_init(void *cc);
void blake256_16way_update(void *cc, const void *data, size_t len);
void blake256_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256_16x32_context;
void blake256_16x32_init(void *cc);
void blake256_16x32_update(void *cc, const void *data, size_t len);
void blake256_16x32_close(void *cc, void *dst);
// Expects data in little endian order, no byte swap needed
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_update_le(void *cc, const void *data, size_t len);
void blake256_16x32_close_le(void *cc, void *dst);
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );

// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
void blake256r14_16way_init(void *cc);
void blake256r14_16way_update(void *cc, const void *data, size_t len);
void blake256r14_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256r14_16x32_context;
void blake256r14_16x32_init(void *cc);
void blake256r14_16x32_update(void *cc, const void *data, size_t len);
void blake256r14_16x32_close(void *cc, void *dst);

// 8 rounds, blakecoin, vanilla
typedef blake_16way_small_context blake256r8_16way_context;
void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);

#define blake_16x32_small_context blake256_16way_context
#define blake_16x32_init blake256_16way_init
#define blake_16x32_update blake256_16way_update
#define blake_16x32_close blake256_16way_close
#define blake_16x32_update_le blake256_16way_update_le
#define blake_16x32_close_le blake256_16way_close_le
#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash
#define blake_16x32_final_rounds_le blake256_16way_final_rounds_le
#define blake256r14_16x32_context blake256r14_16way_context
#define blake256r14_16x32_init blake256r14_16way_init
#define blake256r14_16x32_update blake256r14_16way_update
#define blake256r14_16x32_close blake256r14_16way_close
#define blake256r8_16x32_context blake256r8_16way_context
#define blake256r8_16x32_init blake256r8_16way_init
#define blake256r8_16x32_update blake256r8_16way_update
#define blake256r8_16x32_close blake256r8_16way_close
typedef blake_16way_small_context blake256r8_16x32_context;
void blake256r8_16x32_init(void *cc);
void blake256r8_16x32_update(void *cc, const void *data, size_t len);
void blake256r8_16x32_close(void *cc, void *dst);

#endif // AVX512
#endif // AVX2

@@ -14,7 +14,6 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif


#if defined(SIMD512)

typedef struct ALIGN( 64 ) {
@@ -30,11 +29,6 @@ void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );

#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final

#endif

#if defined(__AVX2__)
@@ -53,11 +47,6 @@ void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );

#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final

#endif

#endif

@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
blake2b_8x64_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );

blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
blake2b_8x64_init( &ctx );
blake2b_8x64_update( &ctx, vdata, 80 );
blake2b_8x64_final( &ctx, hash );

for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
@@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
blake2b_4x64_ctx ctx;
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, input, 80 );
blake2b_4x64_final( &ctx, output );
}

int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
@@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
blake2b_4x64_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, vdata, 80 );
blake2b_4x64_final( &ctx, hash );

for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
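
The hash7 pointer arithmetic in both scanners depends on the interleaved vector layout: with N lanes of 64-bit words, 64-bit hash word w of lane l occupies 32-bit slots (w*N + l)*2 (low half) and (w*N + l)*2 + 1 (high half). So &hash[49] (= 3*16+1) with hash7[lane<<1] selects the high 32 bits of hash word 3 per lane in the 8-way case, and &hash[25] (= 3*8+1) does the same for 4-way. A sketch of the general index (hypothetical helper, not part of the diff):

static inline uint32_t hash_word3_hi( const uint32_t *hash, int nlanes, int lane )
{
   // High 32 bits of 64-bit hash word 3 for one lane in an N-lane
   // 64-bit-interleaved layout: nlanes=8 gives 49+2*lane, nlanes=4 gives 25+2*lane.
   return hash[ ( 3 * nlanes + lane ) * 2 + 1 ];
}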

@@ -61,6 +61,11 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
const void *input, uint64_t inlen );

#define blake2s_4x32_state blake2s_4way_state
#define blake2s_4x32_init blake2s_4way_init
#define blake2s_4x32_update blake2s_4way_update
#define blake2s_4x32_final blake2s_4way_final
#define blake2s_4x32_full_blocks blake2s_4way_full_blocks

#if defined(__AVX2__)

@@ -81,6 +86,12 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen );

#define blake2s_8x32_state blake2s_8way_state
#define blake2s_8x32_init blake2s_8way_init
#define blake2s_8x32_update blake2s_8way_update
#define blake2s_8x32_final blake2s_8way_final
#define blake2s_8x32_full_blocks blake2s_8way_full_blocks

#endif

#if defined(SIMD512)
@@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
uint64_t inlen );
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );

#define blake2s_16x32_state blake2s_16way_state
#define blake2s_16x32_init blake2s_16way_init
#define blake2s_16x32_update blake2s_16way_update
#define blake2s_16x32_final blake2s_16way_final

#endif

#if 0

@@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
VD = v512_64( CB5 ^ T0 ); \
VE = v512_64( CB6 ^ T1 ); \
VF = v512_64( CB7 ^ T1 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm512_bswap_64( *(buf+ 0) ); \
M1 = mm512_bswap_64( *(buf+ 1) ); \
M2 = mm512_bswap_64( *(buf+ 2) ); \
M3 = mm512_bswap_64( *(buf+ 3) ); \
M4 = mm512_bswap_64( *(buf+ 4) ); \
M5 = mm512_bswap_64( *(buf+ 5) ); \
M6 = mm512_bswap_64( *(buf+ 6) ); \
M7 = mm512_bswap_64( *(buf+ 7) ); \
M8 = mm512_bswap_64( *(buf+ 8) ); \
M9 = mm512_bswap_64( *(buf+ 9) ); \
MA = mm512_bswap_64( *(buf+10) ); \
MB = mm512_bswap_64( *(buf+11) ); \
MC = mm512_bswap_64( *(buf+12) ); \
MD = mm512_bswap_64( *(buf+13) ); \
ME = mm512_bswap_64( *(buf+14) ); \
MF = mm512_bswap_64( *(buf+15) ); \
ROUND_B_8WAY(0); \
ROUND_B_8WAY(1); \
ROUND_B_8WAY(2); \
@@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
H7 = mm512_xor3( VF, V7, H7 ); \
}
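
The hunk above swaps the open-coded _mm512_shuffle_epi8 byte swap for the mm512_bswap_64 utility. Judging from the removed lines, the wrapper presumably hides the same shuffle mask rather than changing behaviour; a minimal sketch consistent with that assumption (the helper's real definition is not shown in this diff):

// Assumed shape of the 64-bit byte-swap helper used above.
static inline __m512i mm512_bswap_64_sketch( __m512i v )
{
   const __m512i mask = mm512_bcast_m128( v128_set64( 0x08090a0b0c0d0e0f,
                                                      0x0001020304050607 ) );
   return _mm512_shuffle_epi8( v, mask );
}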

void blake512_8way_compress( blake_8way_big_context *sc )
void blake512_8x64_compress( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc )
VE = v512_64( CB6 ^ sc->T1 );
VF = v512_64( CB7 ^ sc->T1 );

const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm512_bswap_64( sc->buf[ 0] );
M1 = mm512_bswap_64( sc->buf[ 1] );
M2 = mm512_bswap_64( sc->buf[ 2] );
M3 = mm512_bswap_64( sc->buf[ 3] );
M4 = mm512_bswap_64( sc->buf[ 4] );
M5 = mm512_bswap_64( sc->buf[ 5] );
M6 = mm512_bswap_64( sc->buf[ 6] );
M7 = mm512_bswap_64( sc->buf[ 7] );
M8 = mm512_bswap_64( sc->buf[ 8] );
M9 = mm512_bswap_64( sc->buf[ 9] );
MA = mm512_bswap_64( sc->buf[10] );
MB = mm512_bswap_64( sc->buf[11] );
MC = mm512_bswap_64( sc->buf[12] );
MD = mm512_bswap_64( sc->buf[13] );
ME = mm512_bswap_64( sc->buf[14] );
MF = mm512_bswap_64( sc->buf[15] );

ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
@@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
}

// won't be used after prehash implemented
void blake512_8way_compress_le( blake_8x64_big_context *sc )
void blake512_8x64_compress_le( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
blake512_8x64_compress( sc );
sc->ptr = 0;
}

@@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;

blake512_8way_compress( sc );
blake512_8x64_compress( sc );

mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );
sc->ptr = 0;
}

@@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;

blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );

mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst)
VD = v256_64( CB5 ^ T0 ); \
VE = v256_64( CB6 ^ T1 ); \
VF = v256_64( CB7 ^ T1 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm256_bswap_64( *(buf+ 0) ); \
M1 = mm256_bswap_64( *(buf+ 1) ); \
M2 = mm256_bswap_64( *(buf+ 2) ); \
M3 = mm256_bswap_64( *(buf+ 3) ); \
M4 = mm256_bswap_64( *(buf+ 4) ); \
M5 = mm256_bswap_64( *(buf+ 5) ); \
M6 = mm256_bswap_64( *(buf+ 6) ); \
M7 = mm256_bswap_64( *(buf+ 7) ); \
M8 = mm256_bswap_64( *(buf+ 8) ); \
M9 = mm256_bswap_64( *(buf+ 9) ); \
MA = mm256_bswap_64( *(buf+10) ); \
MB = mm256_bswap_64( *(buf+11) ); \
MC = mm256_bswap_64( *(buf+12) ); \
MD = mm256_bswap_64( *(buf+13) ); \
ME = mm256_bswap_64( *(buf+14) ); \
MF = mm256_bswap_64( *(buf+15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst)
}


void blake512_4way_compress( blake_4x64_big_context *sc )
void blake512_4x64_compress( blake_4x64_big_context *sc )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
VD = v256_64( CB5 ^ sc->T0 );
VE = v256_64( CB6 ^ sc->T1 );
VF = v256_64( CB7 ^ sc->T1 );
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm256_bswap_64( sc->buf[ 0] );
M1 = mm256_bswap_64( sc->buf[ 1] );
M2 = mm256_bswap_64( sc->buf[ 2] );
M3 = mm256_bswap_64( sc->buf[ 3] );
M4 = mm256_bswap_64( sc->buf[ 4] );
M5 = mm256_bswap_64( sc->buf[ 5] );
M6 = mm256_bswap_64( sc->buf[ 6] );
M7 = mm256_bswap_64( sc->buf[ 7] );
M8 = mm256_bswap_64( sc->buf[ 8] );
M9 = mm256_bswap_64( sc->buf[ 9] );
MA = mm256_bswap_64( sc->buf[10] );
MB = mm256_bswap_64( sc->buf[11] );
MC = mm256_bswap_64( sc->buf[12] );
MD = mm256_bswap_64( sc->buf[13] );
ME = mm256_bswap_64( sc->buf[14] );
MF = mm256_bswap_64( sc->buf[15] );

ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
@@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}

void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate,
const void *data )
{
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
@@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
midstate[15] = VF;
}

void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
@@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
}


void blake512_4x64_init( blake_4x64_big_context *sc )
void blake512_4x64_init( blake512_4x64_context *sc )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
@@ -1798,7 +1789,7 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
}

// init, update & close
void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
void blake512_4x64_full( blake512_4x64_context *sc, void * dst,
const void *data, size_t len )
{

@@ -1824,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
blake512_4x64_compress( sc );
sc->ptr = 0;
}

@@ -1859,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;

blake512_4way_compress( sc );
blake512_4x64_compress( sc );

mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
@@ -1934,29 +1925,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
VE = v128_64( CB6 ^ sc->T1 );
VF = v128_64( CB7 ^ sc->T1 );

#if defined(__SSSE3__)

const v128u64_t shuf_bswap64 = v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 );
M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 );
M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 );
M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 );
M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 );
M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 );
M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 );
M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 );
M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 );
M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 );
M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 );
MA = v128_shuffle8( sc->buf[10], shuf_bswap64 );
MB = v128_shuffle8( sc->buf[11], shuf_bswap64 );
MC = v128_shuffle8( sc->buf[12], shuf_bswap64 );
MD = v128_shuffle8( sc->buf[13], shuf_bswap64 );
ME = v128_shuffle8( sc->buf[14], shuf_bswap64 );
MF = v128_shuffle8( sc->buf[15], shuf_bswap64 );

#else // SSE2 & NEON

M0 = v128_bswap64( sc->buf[ 0] );
M1 = v128_bswap64( sc->buf[ 1] );
M2 = v128_bswap64( sc->buf[ 2] );
@@ -1974,8 +1942,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
ME = v128_bswap64( sc->buf[14] );
MF = v128_bswap64( sc->buf[15] );

#endif

ROUND_B_2X64(0);
ROUND_B_2X64(1);
ROUND_B_2X64(2);

@@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );

// Partialy prehash second block without touching nonces in block_buf[3].
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
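
Both scanners use the same midstate trick: round 0 is partially prehashed once per block (leaving the nonce words in block_buf[3] untouched), and only the remaining rounds run inside the nonce loop. Schematically, for the 8-lane variant (simplified sketch; result submission and loop bookkeeping are elided and the nonce step is an assumption based on the 8 lanes):

blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
   blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
                                  block_buf, rounds );
   // check 8 candidate hashes against the target, then bump the 8 nonces
   n += 8;
} while ( n < max_nonce );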
@@ -148,16 +148,16 @@

#elif defined (BLAKECOIN_4WAY)

blake256r8_4way_context blakecoin_4w_ctx;
blake256r8_4x32_context blakecoin_4w_ctx;

void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
blake256r8_4x32_context ctx;

memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
blake256r8_4x32_update( &ctx, input + (64<<2), 16 );
blake256r8_4x32_close( &ctx, vhash );

dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -178,8 +178,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;

v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
blake256r8_4x32_init( &blakecoin_4w_ctx );
blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 );

do {
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

@@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
blake512_4way_context ctx;
blake512_4x64_context ctx;

blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, input, 80 );
blake512_4x64_close( &ctx, vhash );

blake512_4way_init( &ctx );
blake512_4way_update( &ctx, input, 80 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );

blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );

blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );

blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );

blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );

memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
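
pentablakehash_4way is five chained BLAKE-512 passes: one over the 80-byte input, then four over the previous 64-byte digest. With the renamed API the repeated init/update/close triples could equally be rolled into a loop; a sketch assuming only the calls shown above:

blake512_4x64_context ctx;
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, input, 80 );
blake512_4x64_close( &ctx, vhash );
for ( int i = 0; i < 4; i++ )   // four more passes over the 512-bit result
{
   blake512_4x64_init( &ctx );
   blake512_4x64_update( &ctx, vhash, 64 );
   blake512_4x64_close( &ctx, vhash );
}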

@@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];

#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)

v128_t *V = (v128_t*)v;

@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
V[3] = v128_swap64( V[3] ); \
V[2] = v128_shufll32( V[2] )

BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);

#undef BLAKE2S_ROUND

#else

#define G(r,i,a,b,c,d) \
@@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
b = SPH_ROTR32(b ^ c, 7); \
} while(0)

#define ROUND(r) \
#define BLAKE2S_ROUND(r) \
do { \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)

ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );

#endif

BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);


for( size_t i = 0; i < 8; ++i )
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];

#undef G
#undef ROUND
#undef BLAKE2S_ROUND
return 0;
}
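
Renaming the scalar ROUND(r) macro to BLAKE2S_ROUND(r) gives the vector and scalar paths one shared name, so the ten round invocations can sit once after the #if/#else/#endif instead of being duplicated in each branch, and the trailing #undef keeps a generic-sounding name from leaking into other translation units. The resulting shape (schematic only; the macro bodies are the G-function schedules shown above):

#if defined(__SSE2__) || defined(__ARM_NEON)
#define BLAKE2S_ROUND(r) /* vectorized G-function schedule */
#else
#define BLAKE2S_ROUND(r) /* scalar G-function schedule */
#endif
BLAKE2S_ROUND(0);
/* ... */
BLAKE2S_ROUND(9);
#undef BLAKE2S_ROUND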


@@ -39,16 +39,14 @@
#include <stddef.h>
#include "simd-utils.h"

#define SPH_SIZE_bmw256 256

#define SPH_SIZE_bmw512 512

// BMW-256 4 way 32

#if defined(__SSE2__) || defined(__ARM_NEON)

typedef struct
{
v128_t buf[64];
v128_t H[16];
v128u32_t buf[64];
v128u32_t H[16];
size_t ptr;
uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
@@ -58,13 +56,19 @@ typedef bmw_4way_small_context bmw256_4way_context;
void bmw256_4way_init( bmw256_4way_context *ctx );

void bmw256_4way_update(void *cc, const void *data, size_t len);
#define bmw256_4way bmw256_4way_update

void bmw256_4way_close(void *cc, void *dst);

void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);

#define bmw256_4x32_context bmw256_4way_context
#define bmw256_4x32_init bmw256_4way_init
#define bmw256_4x32_update bmw256_4way_update
#define bmw256_4x32_close bmw256_4way_close

#endif

#if defined(__AVX2__)

// BMW-256 8 way 32
@@ -85,6 +89,11 @@ void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
#define bmw256_8way bmw256_8way_update
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );

#define bmw256_8x32_context bmw256_8way_context
#define bmw256_8x32_init bmw256_8way_init
#define bmw256_8x32_update bmw256_8way_update
#define bmw256_8x32_close bmw256_8way_close

#endif

#if defined(SIMD512)
@@ -106,6 +115,11 @@ void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
size_t len );
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );

#define bmw256_16x32_context bmw256_16way_context
#define bmw256_16x32_init bmw256_16way_init
#define bmw256_16x32_update bmw256_16way_update
#define bmw256_16x32_close bmw256_16way_close

#endif

// BMW-512 2 way 64

@@ -45,7 +45,7 @@ extern "C"{

#define LPAR (

#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)

// BMW-256 4 way 32
/*
@@ -284,9 +284,9 @@ static const uint32_t IV256[] = {
v128_xor( M[13], H[13] ) ) )


void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
void compress_small( const v128u32_t *M, const v128u32_t H[16], v128u32_t dH[16] )
{
v128u64_t qt[32], xl, xh; \
v128u32_t qt[32], xl, xh; \

qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
@@ -428,49 +428,25 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
static const v128u64_t final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
{ 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
{ 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
{ 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
{ 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
{ 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
{ 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
{ 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
{ 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
{ 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
{ 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
*/

void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = v128_64( 0x4041424340414243 );
ctx->H[ 1] = v128_64( 0x4445464744454647 );
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = v128_64( 0x5051525350515253 );
ctx->H[ 5] = v128_64( 0x5455565754555657 );
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = v128_64( 0x6061626360616263 );
ctx->H[ 9] = v128_64( 0x6465666764656667 );
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = v128_64( 0x7071727370717273 );
ctx->H[13] = v128_64( 0x7475767774757677 );
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );


// for ( int i = 0; i < 16; i++ )
// sc->H[i] = v128_32( iv[i] );
ctx->H[ 0] = v128_32( 0x40414243 );
ctx->H[ 1] = v128_32( 0x44454647 );
ctx->H[ 2] = v128_32( 0x48494A4B );
ctx->H[ 3] = v128_32( 0x4C4D4E4F );
ctx->H[ 4] = v128_32( 0x50515253 );
ctx->H[ 5] = v128_32( 0x54555657 );
ctx->H[ 6] = v128_32( 0x58595A5B );
ctx->H[ 7] = v128_32( 0x5C5D5E5F );
ctx->H[ 8] = v128_32( 0x60616263 );
ctx->H[ 9] = v128_32( 0x64656667 );
ctx->H[10] = v128_32( 0x68696A6B );
ctx->H[11] = v128_32( 0x6C6D6E6F );
ctx->H[12] = v128_32( 0x70717273 );
ctx->H[13] = v128_32( 0x74757677 );
ctx->H[14] = v128_32( 0x78797A7B );
ctx->H[15] = v128_32( 0x7C7D7E7F );
ctx->ptr = 0;
ctx->bit_count = 0;
}
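
The IV change is purely notational: v128_32( 0x40414243 ) broadcasts the 32-bit word to all four lanes, which yields the same 128-bit pattern as the old v128_64( 0x4041424340414243 ) — a 64-bit constant that was already the 32-bit word stated twice. Assuming v128_32/v128_64 are set1-style broadcasts, the equivalence reduces to the scalar identity:

// The 64-bit IV constant is the 32-bit IV word doubled:
_Static_assert( 0x4041424340414243ULL ==
                ( (uint64_t)0x40414243 << 32 | 0x40414243 ),
                "64-bit IV constant equals the doubled 32-bit IV word" );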
@@ -478,10 +454,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
v128u32_t *vdata = (v128u32_t*)data;
v128u32_t *buf;
v128u32_t htmp[16];
v128u32_t *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len

@@ -503,7 +479,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
ptr += clen;
if ( ptr == buf_size )
{
v128u64_t *ht;
v128u32_t *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
@@ -521,14 +497,14 @@ static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
v128u32_t *buf;
v128u32_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len

buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
buf[ ptr>>2 ] = v128_32( 0x00000080 );
ptr += 4;
h = sc->H;

@@ -548,7 +524,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];

compress_small( buf, (v128u64_t*)final_s, h1 );
compress_small( buf, (v128u32_t*)final_s, h1 );

for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_v128( dst, u ) = h1[v];

@@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 );

__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
__m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL );
static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };

qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm256_add_epi64( K, Kincr );
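
The new starting K just constant-folds the old expression: multiplying 0x0555555555555555 by 16 is a shift left by four bits, giving 0x5555555555555550 with no 64-bit overflow, so each lane begins at the same value and only the spelling changes (and Kincr can now be a static initializer). The arithmetic:

// 0x0555555555555555 << 4 == 0x5555555555555550
_Static_assert( 16 * 0x0555555555555555ULL == 0x5555555555555550ULL,
                "folded BMW K constant is unchanged" );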
@@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] )
{
__m512i qt[32], xl, xh;
__m512i mh[16];
__m512i mh[16], mj[16];
int i;

for ( i = 0; i < 16; i++ )
@@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );

__m512i mj[16];

mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 );
mj[ 2] = mm512_rol_64( M[ 2], 3 );
@@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 );

__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
__m512i K = _mm512_set1_epi64( 0x5555555555555550ULL );
static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };

qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm512_add_epi64( K, Kincr );

@@ -39,7 +39,7 @@ static void transform( cubehashParam *sp )

#elif defined(__AVX2__)

register __m256i x0, x1, x2, x3, y0, y1;
register __m256i x0, x1, x2, x3, t0;

x0 = _mm256_load_si256( (__m256i*)sp->x );
x1 = _mm256_load_si256( (__m256i*)sp->x + 1 );
@@ -50,10 +50,10 @@ static void transform( cubehashParam *sp )
{
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = mm256_rol_32( x1, 7 );
y1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( y0, x2 );
x1 = _mm256_xor_si256( y1, x3 );
t0 = mm256_rol_32( x1, 7 );
x1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( t0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap128_64( x2 );
x3 = mm256_swap128_64( x3 );
x2 = _mm256_add_epi32( x0, x2 );
@@ -75,7 +75,7 @@ static void transform( cubehashParam *sp )

#else // AVX, SSE2, NEON

v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
v128_t x0, x1, x2, x3, x4, x5, x6, x7, t0, t1;

x0 = casti_v128( sp->x, 0 );
x1 = casti_v128( sp->x, 1 );
@@ -92,16 +92,12 @@ static void transform( cubehashParam *sp )
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = v128_rol32( y0, 7 );
x1 = v128_rol32( y1, 7 );
x2 = v128_rol32( y2, 7 );
x3 = v128_rol32( y3, 7 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
t0 = v128_rol32( x2, 7 );
t1 = v128_rol32( x3, 7 );
x2 = v128_rol32( x0, 7 );
x3 = v128_rol32( x1, 7 );
x0 = v128_xor( t0, x4 );
x1 = v128_xor( t1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64( x4 );
@@ -112,17 +108,13 @@ static void transform( cubehashParam *sp )
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = v128_rol32( y0, 11 );
x1 = v128_rol32( y1, 11 );
x2 = v128_rol32( y2, 11 );
x3 = v128_rol32( y3, 11 );
x0 = v128_xor( x0, x4 );
t0 = v128_rol32( x1, 11 );
x1 = v128_rol32( x0, 11 );
t1 = v128_rol32( x3, 11 );
x3 = v128_rol32( x2, 11 );
x0 = v128_xor( t0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( x2, x6 );
x2 = v128_xor( t1, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64_32( x4 );
x5 = v128_swap64_32( x5 );
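
The rewritten rotation stages drop the four-way y0..y3 copies. The old code copied all four inputs and then rotated; the new code saves only the two values that would otherwise be overwritten before they are read (t0, t1) and rotates the other two in place, cutting two temporaries and four moves per stage while keeping the same dataflow. For the rotate-7 stage:

// old: y0=x2; y1=x3; y2=x0; y3=x1;
//      x0=rol(y0,7); x1=rol(y1,7); x2=rol(y2,7); x3=rol(y3,7);
// new, same result:
t0 = v128_rol32( x2, 7 );   // old x2, still needed for the new x0
t1 = v128_rol32( x3, 7 );   // old x3, still needed for the new x1
x2 = v128_rol32( x0, 7 );
x3 = v128_rol32( x1, 7 );
x0 = v128_xor( t0, x4 );
x1 = v128_xor( t1, x5 );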

@@ -17,7 +17,7 @@ typedef struct {
#else
hashState_groestl groestl;
#endif
sha256_8way_context sha;
sha256_8x32_context sha;
} myrgr_8way_ctx_holder;

myrgr_8way_ctx_holder myrgr_8way_ctx;
@@ -29,7 +29,7 @@ void init_myrgr_8way_ctx()
#else
init_groestl( &myrgr_8way_ctx.groestl, 64 );
#endif
sha256_8way_init( &myrgr_8way_ctx.sha );
sha256_8x32_init( &myrgr_8way_ctx.sha );
}

void myriad_8way_hash( void *output, const void *input )
@@ -96,8 +96,8 @@ void myriad_8way_hash( void *output, const void *input )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );

sha256_8way_update( &ctx.sha, vhash, 64 );
sha256_8way_close( &ctx.sha, output );
sha256_8x32_update( &ctx.sha, vhash, 64 );
sha256_8x32_close( &ctx.sha, output );
}

int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
@@ -156,7 +156,7 @@ int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,

typedef struct {
hashState_groestl groestl;
sha256_4way_context sha;
sha256_4x32_context sha;
} myrgr_4way_ctx_holder;

myrgr_4way_ctx_holder myrgr_4way_ctx;
@@ -164,7 +164,7 @@ myrgr_4way_ctx_holder myrgr_4way_ctx;
void init_myrgr_4way_ctx()
{
init_groestl (&myrgr_4way_ctx.groestl, 64 );
sha256_4way_init( &myrgr_4way_ctx.sha );
sha256_4x32_init( &myrgr_4way_ctx.sha );
}

void myriad_4way_hash( void *output, const void *input )
@@ -189,8 +189,8 @@ void myriad_4way_hash( void *output, const void *input )

intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

sha256_4way_update( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, output );
sha256_4x32_update( &ctx.sha, vhash, 64 );
sha256_4x32_close( &ctx.sha, output );
}

int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,

@@ -503,32 +503,28 @@ do { \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \
L8( s0, t0, s9, t1 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \
L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \
t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \
L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \
L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
@@ -537,21 +533,20 @@ do { \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \
sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
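
_mm512_mask_shuffle_epi32( dst, k, src, 0xb1 ) shuffles src with immediate 0xb1 — which swaps the two 32-bit halves of every 64-bit element, the same permutation mm512_swap64_32 appears to perform — and merges the result into dst under mask k. That fuses the earlier swap-then-blend pairs into one instruction each; the second hunk shows the identity directly (assuming mm512_swap64_32 is that pairwise swap):

// before: separate swap + masked blend
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) );
// after: one masked shuffle, same result
t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 );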
|
||||
@@ -1059,7 +1054,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
WRITE_STATE_BIG8( sc );
}

void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
void hamsi_8way_big_final( hamsi512_8x64_context *sc, __m512i *buf )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;

@@ -1071,7 +1066,7 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
WRITE_STATE_BIG8( sc );
}

void hamsi512_8way_init( hamsi_8way_big_context *sc )
void hamsi512_8x64_init( hamsi512_8x64_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -1087,7 +1082,7 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
sc->h[7] = v512_64( iv[7] );
}

void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -1099,7 +1094,7 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
sc->partial_len = len;
}

void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst )
{
__m512i pad[1];
uint32_t ch, cl;
@@ -1142,7 +1137,7 @@ do { \
} \
} while (0)

// v3 ternary logic, 8 instructions, 2 local vars
// v4 ternary logic, 8 instructions, 2 local vars
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
@@ -1268,7 +1263,7 @@ do { \
} while (0)
#endif

// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
// v3, 15 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
@@ -1286,7 +1281,7 @@ do { \
#endif

/*
/ v2, 16 instructions, 10 TL equivalent instructions
/ v2, 16 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i t = mm256_xorand( d, a, c ); \
@@ -1944,7 +1939,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst,

////////////

void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
void hamsi_big( hamsi512_4x64_context *sc, __m256i *buf, size_t num )
{
DECL_STATE_BIG
uint32_t tmp;
@@ -1968,7 +1963,7 @@ void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
WRITE_STATE_BIG( sc );
}

void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
void hamsi_big_final( hamsi512_4x64_context *sc, __m256i *buf )
{
__m256i m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_BIG
@@ -1979,7 +1974,7 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
WRITE_STATE_BIG( sc );
}

void hamsi512_4way_init( hamsi_4way_big_context *sc )
void hamsi512_4x64_init( hamsi512_4x64_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -1994,7 +1989,7 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->h[7] = v256_64( iv[7] );
}

void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
@@ -2006,7 +2001,7 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
sc->partial_len = len;
}

void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst )
{
__m256i pad[1];
uint32_t ch, cl;

@@ -72,17 +72,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
typedef hamsi_4way_big_context hamsi512_4x64_context;

void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
void hamsi512_4x64_init( hamsi512_4x64_context *sc );
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst );

#define hamsi512_4x64_context hamsi512_4way_context
#define hamsi512_4x64_init hamsi512_4way_init
#define hamsi512_4x64_update hamsi512_4way_update
#define hamsi512_4x64_close hamsi512_4way_close
#define hamsi512_4way_context hamsi512_4x64_context
#define hamsi512_4way_init hamsi512_4x64_init
#define hamsi512_4way_update hamsi512_4x64_update
#define hamsi512_4way_close hamsi512_4x64_close

// Hamsi-512 8x32

@@ -115,17 +115,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
typedef hamsi_8way_big_context hamsi512_8x64_context;

void hamsi512_8way_init( hamsi512_8way_context *sc );
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
void hamsi512_8x64_init( hamsi512_8x64_context *sc );
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst );

#define hamsi512_8x64_context hamsi512_8way_context
#define hamsi512_8x64_init hamsi512_8way_init
#define hamsi512_8x64_update hamsi512_8way_update
#define hamsi512_8x64_close hamsi512_8way_close
#define hamsi512_8way_context hamsi512_8x64_context
#define hamsi512_8way_init hamsi512_8x64_init
#define hamsi512_8way_update hamsi512_8x64_update
#define hamsi512_8way_close hamsi512_8x64_close

// Hamsi-512 16x32

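Note that the direction of the compatibility aliases flips in this header. Before the change the new lane-geometry names (_4x64, _8x64) were #defined onto the old _4way/_8way symbols; afterwards the _x64 names carry the real typedefs and prototypes and the legacy names become the aliases, so existing callers keep compiling while new code uses the new names. The pattern, restated with the symbols from the diff (comments are mine):

/* Real definitions now carry the lane-geometry name ... */
typedef hamsi_4way_big_context hamsi512_4x64_context;
void hamsi512_4x64_init( hamsi512_4x64_context *sc );

/* ... while the legacy name survives as a pure alias for a gradual migration. */
#define hamsi512_4way_context hamsi512_4x64_context
#define hamsi512_4way_init    hamsi512_4x64_init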
@@ -82,12 +82,15 @@ typedef struct {
typedef haval_4way_context haval256_5_4way_context;

void haval256_5_4way_init( void *cc );

void haval256_5_4way_update( void *cc, const void *data, size_t len );
//#define haval256_5_4way haval256_5_4way_update

void haval256_5_4way_close( void *cc, void *dst );

#define haval256_4x32_context haval256_5_4way_context
#define haval256_4x32_init haval256_5_4way_init
#define haval256_4x32_update haval256_5_4way_update
#define haval256_4x32_close haval256_5_4way_close

#if defined(__AVX2__)

typedef struct {
@@ -100,11 +103,14 @@ typedef struct {
typedef haval_8way_context haval256_5_8way_context;

void haval256_5_8way_init( void *cc );

void haval256_5_8way_update( void *cc, const void *data, size_t len );

void haval256_5_8way_close( void *cc, void *dst );

#define haval256_8x32_context haval256_5_8way_context
#define haval256_8x32_init haval256_5_8way_init
#define haval256_8x32_update haval256_5_8way_update
#define haval256_8x32_close haval256_5_8way_close

#endif // AVX2

#if defined(SIMD512)
@@ -119,11 +125,14 @@ typedef struct {
typedef haval_16way_context haval256_5_16way_context;

void haval256_5_16way_init( void *cc );

void haval256_5_16way_update( void *cc, const void *data, size_t len );

void haval256_5_16way_close( void *cc, void *dst );

#define haval256_16x32_context haval256_5_16way_context
#define haval256_16x32_init haval256_5_16way_init
#define haval256_16x32_update haval256_5_16way_update
#define haval256_16x32_close haval256_5_16way_close

#endif // AVX512

#ifdef __cplusplus

@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t byte_len, size_t lim )
{
unsigned eb;
union {
__m512i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
size_t j;
size_t m512_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;

eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm512_set1_epi64( t );
tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
keccak64_8way_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
@@ -194,7 +190,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
memcpy_512( dst, kc->w, m512_len );
}

void keccak256_8way_init( void *kc )
void keccak256_8x64_init( void *kc )
{
keccak64_8way_init( kc, 256 );
}
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
size_t j;
size_t m256_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;

eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set1_epi64x( t );
tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
keccak64_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );

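Both close functions above drop the union-wrapped variable-length array in favour of a plain VLA with an explicit alignment attribute, and hoist eb into a const. The padding they build per lane is the standard Keccak pad10*1: the end-marker byte, zero fill, then the top bit of the last rate word. A scalar sketch of just that step, with the absorb and permutation left to the caller; names are illustrative:

#include <stdint.h>
#include <stddef.h>

// Fill 'tmp' with the pad block for a lane whose buffer pointer is 'ptr'
// and whose rate is 'lim' bytes, mirroring the branch structure above.
static void keccak_pad_sketch( uint64_t *tmp, size_t ptr, size_t lim,
                               uint64_t eb )
{
   if ( ptr == lim - 8 )                       // one rate word left:
      tmp[0] = eb | 0x8000000000000000ULL;     // both marks share it
   else
   {
      size_t j = lim - ptr;                    // padding bytes needed
      tmp[0] = eb;                             // first pad word carries eb
      for ( size_t k = 1; k < (j >> 3) - 1; k++ )
         tmp[k] = 0;                           // zero the middle words
      tmp[ (j >> 3) - 1 ] = 0x8000000000000000ULL;  // final bit of the rate
   }
}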
@@ -9,7 +9,7 @@
void sha3d_hash_8way(void *state, const void *input)
{
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
keccak256_8way_context ctx;
keccak256_8x64_context ctx;

keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, input, 80 );
@@ -69,7 +69,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
void sha3d_hash_4way(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4x64_context ctx;

keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, input, 80 );

@@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
__m512i t = a0; \
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512( a2, a3 ); \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0); \
a1 = _mm512_or_si512( a1, a3 ); \
a3 = _mm512_xor_si512( a3, a2 ); \
t = _mm512_xor_si512( t, a1 ); \
a2 = _mm512_and_si512( a2, a1 ); \
a1 = mm512_xnor( a1, a0 ); \
a1 = mm512_nxor( a1, a0 ); \
a0 = t; \
}

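The comment edits here ("xnor" to "nxor") track a helper rename; the ternary-logic immediate 0x87 is unchanged, and it does compute a negated XOR. The immediate is simply the eight-entry truth table of the desired function, indexed by the operand bits as (a<<2)|(b<<1)|c. A self-contained check that 0x87 encodes a1 XNOR (a3 & t), with a hypothetical builder function:

#include <stdint.h>
#include <stdio.h>

// Build a ternarylogic immediate from a scalar 3-input boolean:
// bit (a<<2 | b<<1 | c) of the immediate holds f(a,b,c).
static uint8_t tl_imm( int (*f)(int, int, int) )
{
   uint8_t imm = 0;
   for ( int i = 0; i < 8; i++ )
      if ( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) )
         imm |= 1u << i;
   return imm;
}

static int nxor_and( int a1, int a3, int t )
{
   return !( a1 ^ ( a3 & t ) );   // a1 xnor (a3 & t)
}

int main( void )
{
   printf( "0x%02x\n", tl_imm( nxor_and ) );   // prints 0x87
   return 0;
}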
@@ -273,8 +273,6 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
uint32_t hash[8*4] __attribute((aligned(128)));
__m512i* chainv = state->chainv;
__m512i t[2];
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

/*---- blank round with m=0 ----*/
rnd512_4way( state, NULL );
@@ -289,10 +287,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );

casti_m512i( b,0 ) = _mm512_shuffle_epi8(
casti_m512i( hash,0 ), shuff_bswap32 );
casti_m512i( b,1 ) = _mm512_shuffle_epi8(
casti_m512i( hash,1 ), shuff_bswap32 );
casti_m512i( b,0 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
casti_m512i( b,1 ) = mm512_bswap_32( casti_m512i( hash,1 ) );

rnd512_4way( state, NULL );

@@ -306,10 +302,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );

casti_m512i( b,2 ) = _mm512_shuffle_epi8(
casti_m512i( hash,0 ), shuff_bswap32 );
casti_m512i( b,3 ) = _mm512_shuffle_epi8(
casti_m512i( hash,1 ), shuff_bswap32 );
casti_m512i( b,2 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
casti_m512i( b,3 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
}

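Throughout these luffa changes, an explicit _mm512_shuffle_epi8 against a hand-built shuff_bswap32 constant gives way to the mm512_bswap_32 wrapper, so the byte-swap control vector lives in one place instead of being redeclared in every function. A plausible sketch of such a wrapper, assuming it uses the same shuffle (requires AVX512BW; the function name here is illustrative):

#include <immintrin.h>

// Byte-swap each 32-bit element of a 512-bit vector. The control vector
// reverses bytes within every 4-byte group, broadcast to all 128-bit lanes.
static inline __m512i bswap_32_sketch( __m512i v )
{
   const __m512i ctl = _mm512_broadcast_i32x4( _mm_set_epi64x(
                           0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
   return _mm512_shuffle_epi8( v, ctl );
}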
int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
@@ -349,16 +343,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
__m512i msg[2];
int i;
int blocks = (int)len >> 5;
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

state->rembytes = (int)len & 0x1F;

// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}

@@ -367,7 +359,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[0] = mm512_bswap_32( vdata[0] );
buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
}
return 0;
@@ -434,16 +426,14 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

state->rembytes = inlen & 0x1F;

// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}

@@ -451,7 +441,7 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
rnd512_4way( state, msg );
}
@@ -479,16 +469,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

state->rembytes = inlen & 0x1F;

// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}

@@ -496,7 +484,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
rnd512_4way( state, msg );
}
@@ -539,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m256i t = a0; \
a0 = mm256_xoror( a3, a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm256_xorand( a2, a3, t ); \
a2 = mm256_xorand( a1, a2, a0); \
a1 = _mm256_or_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
t = _mm256_xor_si256( t, a1 ); \
a2 = _mm256_and_si256( a2, a1 ); \
a1 = mm256_xnor( a1, a0 ); \
a1 = mm256_nxor( a1, a0 ); \
a0 = t; \
}

@@ -775,8 +763,6 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
uint32 hash[8*2] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t0, t1;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_2way( state, NULL );

@@ -791,10 +777,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );

casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );

rnd512_2way( state, NULL );

@@ -809,10 +793,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );

casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}

int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
@@ -847,15 +829,13 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
__m256i msg[2];
int i;
int blocks = (int)len >> 5;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state-> rembytes = (int)len & 0x1F;

// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}

@@ -864,7 +844,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[0] = mm256_bswap_32( vdata[0] );
buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
}
return 0;
@@ -916,16 +896,14 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );

state->rembytes = inlen & 0x1F;

// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}

@@ -933,7 +911,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
rnd512_2way( state, msg );
}
@@ -961,16 +939,14 @@ int luffa_2way_update_close( luffa_2way_context *state,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );

state->rembytes = inlen & 0x1F;

// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}

@@ -978,7 +954,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
rnd512_2way( state, msg );
}

@@ -69,18 +69,18 @@
v128_t t = a0; \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = v128_xnor( a1, a0 ); \
a1 = v128_nxor( a1, a0 ); \
a0 = t; \
}

#else
#elif defined(__ARM_NEON) || defined(__SSE2__)

#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

@@ -26,9 +26,9 @@
#if defined (ALLIUM_16WAY)

typedef union {
keccak256_8way_context keccak;
keccak256_8x64_context keccak;
cube_4way_2buf_context cube;
skein256_8way_context skein;
skein256_8x64_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
#else
@@ -60,7 +60,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));

blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -70,12 +70,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );

keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashB, 32 );
keccak256_8way_close( &ctx.keccak, vhashB);
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhashA, 32 );
keccak256_8x64_close( &ctx.keccak, vhashA);
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhashB, 32 );
keccak256_8x64_close( &ctx.keccak, vhashB);

dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
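
Each stage above marshals lane-separated 256-bit hashes into the interleaved layout the N-way cores consume; intrlv_8x64, dintrlv_8x64 and friends do that transposition. An illustrative, unoptimized version of the 4x64 interleave (not the project's implementation) to show the layout: lane i of word j lands at dst[4*j + i].

#include <stdint.h>

static void intrlv_4x64_sketch( uint64_t *dst, const uint64_t *h0,
          const uint64_t *h1, const uint64_t *h2, const uint64_t *h3,
          int bitlen )
{
   const int words = bitlen / 64;      // 256 bits -> 4 words per lane
   for ( int j = 0; j < words; j++ )
   {
      dst[ 4*j + 0 ] = h0[j];
      dst[ 4*j + 1 ] = h1[j];
      dst[ 4*j + 2 ] = h2[j];
      dst[ 4*j + 3 ] = h3[j];
   }
}

The deinterleave is the same loop with the assignments reversed.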
@@ -153,12 +153,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );

skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashA, 32 );
skein256_8way_close( &ctx.skein, vhashA );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashB, 32 );
skein256_8way_close( &ctx.skein, vhashB );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhashA, 32 );
skein256_8x64_close( &ctx.skein, vhashA );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhashB, 32 );
skein256_8x64_close( &ctx.skein, vhashB );

dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -251,7 +251,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );

// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -273,9 +273,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
#elif defined (ALLIUM_8WAY)

typedef union {
keccak256_4way_context keccak;
keccak256_4x64_context keccak;
cube_2way_context cube;
skein256_4way_context skein;
skein256_4x64_context skein;
#if defined(__VAES__)
groestl256_2way_context groestl;
#else
@@ -298,19 +298,19 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));

blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
blake256_8x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );

dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
keccak256_4way_close( &ctx.keccak, vhashA );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashB, 32 );
keccak256_4way_close( &ctx.keccak, vhashB );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhashA, 32 );
keccak256_4x64_close( &ctx.keccak, vhashA );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhashB, 32 );
keccak256_4x64_close( &ctx.keccak, vhashB );

dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -350,12 +350,12 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashA, 32 );
skein256_4way_close( &ctx.skein, vhashA );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashB, 32 );
skein256_4way_close( &ctx.skein, vhashB );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhashA, 32 );
skein256_4x64_close( &ctx.skein, vhashA );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhashB, 32 );
skein256_4x64_close( &ctx.skein, vhashB );

#if defined(__VAES__)

@@ -433,7 +433,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
n+ 3, n+ 2, n+ 1, n );

// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -483,7 +483,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
uint64_t *hash3 = (uint64_t*)hash+12;
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));

blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
blake256_4x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );

intrlv_2x64( vhashA, hash0, hash1, 256 );
@@ -588,7 +588,7 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );

// Partially prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -616,7 +616,6 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
//
// 1 way

typedef struct
{
blake256_context blake;

@@ -14,12 +14,12 @@ bool lyra2h_4way_thread_init()
return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}

static __thread blake256_4way_context l2h_4way_blake_mid;
static __thread blake256_4x32_context l2h_4way_blake_mid;

void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
blake256_4x32_init( &l2h_4way_blake_mid );
blake256_4x32_update( &l2h_4way_blake_mid, input, 64 );
}

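The midstate pattern above, in miniature: the first 64 bytes of the 80-byte header carry no nonce, so they are absorbed once per work unit and the saved context is cloned for every attempt, absorbing only the 16-byte interleaved tail. The context copy is far cheaper than re-hashing the first block each nonce. A sketch using the names from this diff; the wrapper function itself is illustrative.

#include <string.h>

static void lyra2h_mid_sketch( void *vhash, const void *input,
                               const blake256_4x32_context *mid )
{
   blake256_4x32_context c;
   memcpy( &c, mid, sizeof c );                          // resume midstate
   blake256_4x32_update( &c, (const char*)input + (64*4), 16 );  // tail
   blake256_4x32_close( &c, vhash );
}
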
void lyra2h_4way_hash( void *state, const void *input )
@@ -29,11 +29,11 @@ void lyra2h_4way_hash( void *state, const void *input )
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
blake256_4x32_context ctx_blake __attribute__ ((aligned (64)));

memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
blake256_4x32_update( &ctx_blake, input + (64*4), 16 );
blake256_4x32_close( &ctx_blake, vhash );

dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -7,25 +7,24 @@
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"

#if defined (LYRA2REV2_16WAY)

typedef struct {
blake256_16way_context blake;
keccak256_8way_context keccak;
blake256_16x32_context blake;
keccak256_8x64_context keccak;
cubehashParam cube;
skein256_8way_context skein;
bmw256_16way_context bmw;
skein256_8x64_context skein;
bmw256_16x32_context bmw;
} lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));

static lyra2v2_16way_ctx_holder l2v2_16way_ctx;

bool init_lyra2rev2_16way_ctx()
{
keccak256_8way_init( &l2v2_16way_ctx.keccak );
keccak256_8x64_init( &l2v2_16way_ctx.keccak );
cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_16way_ctx.skein );
bmw256_16way_init( &l2v2_16way_ctx.bmw );
skein256_8x64_init( &l2v2_16way_ctx.skein );
bmw256_16x32_init( &l2v2_16way_ctx.bmw );
return true;
}

@@ -51,8 +50,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );

blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16x32_update( &ctx.blake, input + (64<<4), 16 );
blake256_16x32_close( &ctx.blake, vhash );

dintrlv_16x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7,
@@ -62,17 +61,17 @@ void lyra2rev2_16way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );

keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
keccak256_8x64_close( &ctx.keccak, vhash );

dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );

keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
keccak256_8x64_close( &ctx.keccak, vhash );

dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
@@ -122,22 +121,21 @@ void lyra2rev2_16way_hash( void *state, const void *input )

intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
skein256_8x64_update( &ctx.skein, vhash, 32 );
skein256_8x64_close( &ctx.skein, vhash );

dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12,
hash13, hash14, hash15, 256 );

skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhash, 32 );
skein256_8x64_close( &ctx.skein, vhash );

dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );

cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
@@ -160,8 +158,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );

bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
bmw256_16x32_close( &ctx.bmw, state );
}

int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
@@ -186,8 +184,8 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
blake256_16way_init( &l2v2_16way_ctx.blake );
blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
blake256_16x32_init( &l2v2_16way_ctx.blake );
blake256_16x32_update( &l2v2_16way_ctx.blake, vdata, 64 );

do
{
@@ -214,21 +212,21 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_8WAY)

typedef struct {
blake256_8way_context blake;
keccak256_4way_context keccak;
blake256_8x32_context blake;
keccak256_4x64_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_8way_context bmw;
skein256_4x64_context skein;
bmw256_8x32_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));

static lyra2v2_8way_ctx_holder l2v2_8way_ctx;

bool init_lyra2rev2_8way_ctx()
{
keccak256_4way_init( &l2v2_8way_ctx.keccak );
keccak256_4x64_init( &l2v2_8way_ctx.keccak );
cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_8way_ctx.skein );
bmw256_8way_init( &l2v2_8way_ctx.bmw );
skein256_4x64_init( &l2v2_8way_ctx.skein );
bmw256_8x32_init( &l2v2_8way_ctx.bmw );
return true;
}

@@ -246,20 +244,20 @@ void lyra2rev2_8way_hash( void *state, const void *input )
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );

blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
blake256_8x32_update( &ctx.blake, input + (64<<3), 16 );
blake256_8x32_close( &ctx.blake, vhash );

dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );

intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );

cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -282,13 +280,13 @@ void lyra2rev2_8way_hash( void *state, const void *input )
LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );

intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
skein256_4x64_update( &ctx.skein, vhash, 32 );
skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhash, 32 );
skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );

cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -303,8 +301,8 @@ void lyra2rev2_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );

bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
bmw256_8x32_close( &ctx.bmw, state );
}

int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
@@ -328,8 +326,8 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,

mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v2_8way_ctx.blake );
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
blake256_8x32_init( &l2v2_8way_ctx.blake );
blake256_8x32_update( &l2v2_8way_ctx.blake, vdata, 64 );

do
{
@@ -356,21 +354,21 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_4WAY)

typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
blake256_4x32_context blake;
keccak256_4x64_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_4way_context bmw;
skein256_4x64_context skein;
bmw256_4x32_context bmw;
} lyra2v2_4way_ctx_holder;

static lyra2v2_4way_ctx_holder l2v2_4way_ctx;

bool init_lyra2rev2_4way_ctx()
{
keccak256_4way_init( &l2v2_4way_ctx.keccak );
keccak256_4x64_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
bmw256_4way_init( &l2v2_4way_ctx.bmw );
skein256_4x64_init( &l2v2_4way_ctx.skein );
bmw256_4x32_init( &l2v2_4way_ctx.bmw );
return true;
}

@@ -385,13 +383,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );

blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
blake256_4x32_update( &ctx.blake, input + (64<<2), 16 );
blake256_4x32_close( &ctx.blake, vhash );

rintrlv_4x32_4x64( vhash64, vhash, 256 );

keccak256_4way_update( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
keccak256_4x64_update( &ctx.keccak, vhash64, 32 );
keccak256_4x64_close( &ctx.keccak, vhash64 );

dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

@@ -410,8 +408,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )

intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );

skein256_4way_update( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
skein256_4x64_update( &ctx.skein, vhash64, 32 );
skein256_4x64_close( &ctx.skein, vhash64 );

dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

@@ -426,8 +424,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )

intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );

bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
bmw256_4x32_close( &ctx.bmw, state );
}

int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -451,8 +449,8 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,

v128_bswap32_intrlv80_4x32( vdata, pdata );

blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
blake256_4x32_init( &l2v2_4way_ctx.blake );
blake256_4x32_update( &l2v2_4way_ctx.blake, vdata, 64 );

do
{

@@ -9,18 +9,18 @@
#if defined (LYRA2REV3_16WAY)

typedef struct {
blake256_16way_context blake;
blake256_16x32_context blake;
cube_4way_context cube;
bmw256_16way_context bmw;
bmw256_16x32_context bmw;
} lyra2v3_16way_ctx_holder;

static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;

bool init_lyra2rev3_16way_ctx()
{
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16x32_init( &l2v3_16way_ctx.blake );
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
bmw256_16way_init( &l2v3_16way_ctx.bmw );
bmw256_16x32_init( &l2v3_16way_ctx.bmw );
return true;
}

@@ -46,8 +46,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );

blake256_16way_update( &ctx.blake, input + (64*16), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16x32_update( &ctx.blake, input + (64*16), 16 );
blake256_16x32_close( &ctx.blake, vhash );

dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -120,8 +120,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );

bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
bmw256_16x32_close( &ctx.bmw, state );
}

@@ -145,8 +145,8 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,

mm512_bswap32_intrlv80_16x32( vdata, pdata );

blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
blake256_16x32_init( &l2v3_16way_ctx.blake );
blake256_16x32_update( &l2v3_16way_ctx.blake, vdata, 64 );

do
{
@@ -178,18 +178,18 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV3_8WAY)

typedef struct {
blake256_8way_context blake;
blake256_8x32_context blake;
cubehashParam cube;
bmw256_8way_context bmw;
bmw256_8x32_context bmw;
} lyra2v3_8way_ctx_holder;

static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;

bool init_lyra2rev3_8way_ctx()
{
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8x32_init( &l2v3_8way_ctx.blake );
cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
bmw256_8way_init( &l2v3_8way_ctx.bmw );
bmw256_8x32_init( &l2v3_8way_ctx.bmw );
return true;
}

@@ -207,8 +207,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );

blake256_8way_update( &ctx.blake, input + (64*8), 16 );
blake256_8way_close( &ctx.blake, vhash );
blake256_8x32_update( &ctx.blake, input + (64*8), 16 );
blake256_8x32_close( &ctx.blake, vhash );

dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -243,8 +243,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );

bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
bmw256_8x32_close( &ctx.bmw, state );

}

@@ -269,8 +269,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,

mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
blake256_8x32_init( &l2v3_8way_ctx.blake );
blake256_8x32_update( &l2v3_8way_ctx.blake, vdata, 64 );

do
{
@@ -300,19 +300,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
#if defined (LYRA2REV3_4WAY)

typedef struct {
blake256_4way_context blake;
blake256_4x32_context blake;
cubehashParam cube;
bmw256_4way_context bmw;
bmw256_4x32_context bmw;
} lyra2v3_4way_ctx_holder;

//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;

bool init_lyra2rev3_4way_ctx()
{
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4x32_init( &l2v3_4way_ctx.blake );
cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
bmw256_4way_init( &l2v3_4way_ctx.bmw );
bmw256_4x32_init( &l2v3_4way_ctx.bmw );
return true;
}

@@ -326,8 +325,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );

blake256_4way_update( &ctx.blake, input + (64*4), 16 );
blake256_4way_close( &ctx.blake, vhash );
blake256_4x32_update( &ctx.blake, input + (64*4), 16 );
blake256_4x32_close( &ctx.blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -349,8 +348,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );

intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
bmw256_4x32_close( &ctx.bmw, state );
}

int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
@@ -374,8 +373,8 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );

blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
blake256_4x32_init( &l2v3_4way_ctx.blake );
blake256_4x32_update( &l2v3_4way_ctx.blake, vdata, 64 );

do
{

@@ -45,7 +45,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));

blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -139,7 +139,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -180,7 +180,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));

blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_8x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -246,7 +246,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -279,12 +279,12 @@ bool lyra2z_4way_thread_init()
return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}

static __thread blake256_4way_context l2z_4way_blake_mid;
static __thread blake256_4x32_context l2z_4way_blake_mid;

void lyra2z_4way_midstate( const void* input )
{
blake256_4way_init( &l2z_4way_blake_mid );
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
blake256_4x32_init( &l2z_4way_blake_mid );
blake256_4x32_update( &l2z_4way_blake_mid, input, 64 );
}

void lyra2z_4way_hash( void *hash, const void *midstate_vars,
@@ -295,15 +295,8 @@ void lyra2z_4way_hash( void *hash, const void *midstate_vars,
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
// blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

/*
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
*/
blake256_4x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -357,7 +350,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );

// Partially prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -454,11 +447,9 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;

@@ -45,7 +45,7 @@ static const uint64_t blake2b_IV[8] =

#if defined(SIMD512)

#define G2W_4X64(a,b,c,d) \
#define G2W(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
c = _mm512_add_epi64( c, d ); \
@@ -56,27 +56,15 @@ static const uint64_t blake2b_IV[8] =
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );

#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
G2W( s0, s1, s2, s3 ); \
|
||||
s0 = mm512_shufll256_64( s0 ); \
|
||||
s3 = mm512_swap256_128( s3); \
|
||||
s3 = mm512_swap256_128( s3 ); \
|
||||
s2 = mm512_shuflr256_64( s2 ); \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
G2W( s0, s1, s2, s3 ); \
|
||||
s0 = mm512_shuflr256_64( s0 ); \
|
||||
s3 = mm512_swap256_128( s3 ); \
|
||||
s2 = mm512_shufll256_64( s2 );
|
||||
|
||||
/*
|
||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s3 = mm512_shufll256_64( s3 ); \
|
||||
s1 = mm512_shuflr256_64( s1); \
|
||||
s2 = mm512_swap256_128( s2 ); \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s3 = mm512_shuflr256_64( s3 ); \
|
||||
s1 = mm512_shufll256_64( s1 ); \
|
||||
s2 = mm512_swap256_128( s2 );
|
||||
*/
|
||||
|
||||
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
@@ -95,7 +83,7 @@ static const uint64_t blake2b_IV[8] =
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define G_4X64(a,b,c,d) \
|
||||
#define G_AVX2(a,b,c,d) \
|
||||
a = _mm256_add_epi64( a, b ); \
|
||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
@@ -107,27 +95,15 @@ static const uint64_t blake2b_IV[8] =
|
||||
|
||||
// Pivot about s1 instead of s0 reduces latency.
|
||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
G_AVX2( s0, s1, s2, s3 ); \
|
||||
s0 = mm256_shufll_64( s0 ); \
|
||||
s3 = mm256_swap_128( s3); \
|
||||
s3 = mm256_swap_128( s3 ); \
|
||||
s2 = mm256_shuflr_64( s2 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
G_AVX2( s0, s1, s2, s3 ); \
|
||||
s0 = mm256_shuflr_64( s0 ); \
|
||||
s3 = mm256_swap_128( s3 ); \
|
||||
s2 = mm256_shufll_64( s2 );
|
||||
|
||||
/*
|
||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s3 = mm256_shufll_64( s3 ); \
|
||||
s1 = mm256_shuflr_64( s1); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s3 = mm256_shuflr_64( s3 ); \
|
||||
s1 = mm256_shufll_64( s1 ); \
|
||||
s2 = mm256_swap_128( s2 );
|
||||
*/
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
@@ -148,7 +124,7 @@ static const uint64_t blake2b_IV[8] =
|
||||
|
||||
// process 2 columns in parallel
|
||||
// returns void, all args updated
|
||||
#define G_2X64(a,b,c,d) \
|
||||
#define G_128(a,b,c,d) \
|
||||
a = v128_add64( a, b ); \
|
||||
d = v128_ror64xor( d, a, 32 ); \
|
||||
c = v128_add64( c, d ); \
|
||||
@@ -161,16 +137,16 @@ static const uint64_t blake2b_IV[8] =
|
||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
{ \
|
||||
v128u64_t t; \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
G_128( s0, s2, s4, s6 ); \
|
||||
G_128( s1, s3, s5, s7 ); \
|
||||
t = v128_alignr64( s7, s6, 1 ); \
|
||||
s6 = v128_alignr64( s6, s7, 1 ); \
|
||||
s7 = t; \
|
||||
t = v128_alignr64( s2, s3, 1 ); \
|
||||
s2 = v128_alignr64( s3, s2, 1 ); \
|
||||
s3 = t; \
|
||||
G_2X64( s0, s2, s5, s6 ); \
|
||||
G_2X64( s1, s3, s4, s7 ); \
|
||||
G_128( s0, s2, s5, s6 ); \
|
||||
G_128( s1, s3, s4, s7 ); \
|
||||
t = v128_alignr64( s6, s7, 1 ); \
|
||||
s6 = v128_alignr64( s7, s6, 1 ); \
|
||||
s7 = t; \
|
||||
|
||||
@@ -1,8 +1,6 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"

#if !defined(__APPLE__)

#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -33,6 +31,7 @@ static inline double exp_n( double xt )
        return exp( xt );
}

/*
static inline double exp_n2( double x1, double x2 )
{
    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
@@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 )
    else if ( xt > p6 - 1.e-200 )
        return 0.;
}
*/

double swit2_( double wvnmb )
{
@@ -298,14 +298,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
   return 0;
}

#endif // not apple

bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
   applog( LOG_ERR, "M7M algo is not supported on MacOS");
   return false;
#else
   gate->optimizations = SHA256_OPT;
   init_m7m_ctx();
   gate->scanhash              = (void*)&scanhash_m7m_hash;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
   gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
   opt_target_factor = 65536.0;
   return true;
#endif
}


@@ -18,11 +18,14 @@ typedef struct {
} panama_4way_context __attribute__ ((aligned (64)));

void panama_4way_init( void *cc );

void panama_4way_update( void *cc, const void *data, size_t len );

void panama_4way_close( void *cc, void *dst );

#define panama_4x32_context panama_4way_context
#define panama_4x32_init    panama_4way_init
#define panama_4x32_update  panama_4way_update
#define panama_4x32_close   panama_4way_close

#if defined(__AVX2__)

typedef struct {
@@ -34,10 +37,13 @@ typedef struct {
} panama_8way_context __attribute__ ((aligned (128)));

void panama_8way_init( void *cc );

void panama_8way_update( void *cc, const void *data, size_t len );

void panama_8way_close( void *cc, void *dst );

#define panama_8x32_context panama_8way_context
#define panama_8x32_init    panama_8way_init
#define panama_8x32_update  panama_8way_update
#define panama_8x32_close   panama_8way_close

#endif
#endif

@@ -11,7 +11,6 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
@@ -32,20 +31,20 @@

union _hmq1725_8way_context_overlay
{
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    blake512_8x64_context   blake;
    bmw512_8x64_context     bmw;
    skein512_8x64_context   skein;
    jh512_8x64_context      jh;
    keccak512_8x64_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    simd_4way_context       simd;
    hamsi512_8way_context   hamsi;
    hamsi512_8x64_context   hamsi;
    hashState_fugue         fugue;
    shabal512_8way_context  shabal;
    shabal512_8x32_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_8way_context     sha512;
    haval256_5_8way_context haval;
    sha512_8x64_context     sha512;
    haval256_8x32_context   haval;
#if defined(__VAES__)
    groestl512_4way_context groestl;
    shavite512_4way_context shavite;
@@ -82,7 +81,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
    __m512i* vhB = (__m512i*)vhashB;
    __m512i* vhC = (__m512i*)vhashC;

    bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
    bmw512_8x64_full( &ctx.bmw, vhash, input, 80 );

    dintrlv_8x64_512( hash0, hash1, hash2, hash3,
                      hash4, hash5, hash6, hash7, vhash );
@@ -142,26 +141,26 @@ extern void hmq1725_8way_hash(void *state, const void *input)

// B
    if ( likely( vh_mask & 0xff ) )
       skein512_8way_full( &ctx.skein, vhashB, vhash, 64 );
       skein512_8x64_full( &ctx.skein, vhashB, vhash, 64 );

    mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );

    jh512_8way_init( &ctx.jh );
    jh512_8way_update( &ctx.jh, vhash, 64 );
    jh512_8way_close( &ctx.jh, vhash );
    jh512_8x64_init( &ctx.jh );
    jh512_8x64_update( &ctx.jh, vhash, 64 );
    jh512_8x64_close( &ctx.jh, vhash );

    keccak512_8way_init( &ctx.keccak );
    keccak512_8way_update( &ctx.keccak, vhash, 64 );
    keccak512_8way_close( &ctx.keccak, vhash );
    keccak512_8x64_init( &ctx.keccak );
    keccak512_8x64_update( &ctx.keccak, vhash, 64 );
    keccak512_8x64_close( &ctx.keccak, vhash );

    vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

// A
    if ( ( vh_mask & 0xff ) != 0xff )
       blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
       blake512_8x64_full( &ctx.blake, vhashA, vhash, 64 );
// B
    if ( vh_mask & 0xff )
       bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 );
       bmw512_8x64_full( &ctx.bmw, vhashB, vhash, 64 );

    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
    rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -177,16 +176,16 @@ extern void hmq1725_8way_hash(void *state, const void *input)

    if ( likely( ( vh_mask & 0xff ) != 0xff ) )
    {
       keccak512_8way_init( &ctx.keccak );
       keccak512_8way_update( &ctx.keccak, vhash, 64 );
       keccak512_8way_close( &ctx.keccak, vhashA );
       keccak512_8x64_init( &ctx.keccak );
       keccak512_8x64_update( &ctx.keccak, vhash, 64 );
       keccak512_8x64_close( &ctx.keccak, vhashA );
    }

    if ( likely( vh_mask & 0xff ) )
    {
       jh512_8way_init( &ctx.jh );
       jh512_8way_update( &ctx.jh, vhash, 64 );
       jh512_8way_close( &ctx.jh, vhashB );
       jh512_8x64_init( &ctx.jh );
       jh512_8x64_update( &ctx.jh, vhash, 64 );
       jh512_8x64_close( &ctx.jh, vhashB );
    }

    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -252,9 +251,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
    if ( likely( vh_mask & 0xff ) )
    {
       haval256_5_8way_init( &ctx.haval );
       haval256_5_8way_update( &ctx.haval, vhash, 64 );
       haval256_5_8way_close( &ctx.haval, vhash );
       haval256_8x32_init( &ctx.haval );
       haval256_8x32_update( &ctx.haval, vhash, 64 );
       haval256_8x32_close( &ctx.haval, vhash );
       memset( &vhash[8<<3], 0, 32<<3 );
       rintrlv_8x32_8x64( vhashB, vhash, 512 );
    }
@@ -297,7 +296,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)

#endif

    blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
    blake512_8x64_full( &ctx.blake, vhash, vhash, 64 );

    vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

@@ -352,9 +351,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)

    mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );

    hamsi512_8way_init( &ctx.hamsi );
    hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
    hamsi512_8way_close( &ctx.hamsi, vhash );
    hamsi512_8x64_init( &ctx.hamsi );
    hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
    hamsi512_8x64_close( &ctx.hamsi, vhash );

    dintrlv_8x64_512( hash0, hash1, hash2, hash3,
                      hash4, hash5, hash6, hash7, vhash );
@@ -430,9 +429,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)

    rintrlv_8x64_8x32( vhashA, vhash, 512 );

    shabal512_8way_init( &ctx.shabal );
    shabal512_8way_update( &ctx.shabal, vhashA, 64 );
    shabal512_8way_close( &ctx.shabal, vhash );
    shabal512_8x32_init( &ctx.shabal );
    shabal512_8x32_update( &ctx.shabal, vhashA, 64 );
    shabal512_8x32_close( &ctx.shabal, vhash );

    dintrlv_8x32_512( hash0, hash1, hash2, hash3,
                      hash4, hash5, hash6, hash7, vhash );
@@ -475,9 +474,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
    if ( likely( vh_mask & 0xff ) )
    {
       sha512_8way_init( &ctx.sha512 );
       sha512_8way_update( &ctx.sha512, vhash, 64 );
       sha512_8way_close( &ctx.sha512, vhashB );
       sha512_8x64_init( &ctx.sha512 );
       sha512_8x64_update( &ctx.sha512, vhash, 64 );
       sha512_8x64_close( &ctx.sha512, vhashB );
    }

    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -510,9 +509,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)

#endif

    sha512_8way_init( &ctx.sha512 );
    sha512_8way_update( &ctx.sha512, vhash, 64 );
    sha512_8way_close( &ctx.sha512, vhash );
    sha512_8x64_init( &ctx.sha512 );
    sha512_8x64_update( &ctx.sha512, vhash, 64 );
    sha512_8x64_close( &ctx.sha512, vhash );

    vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
    dintrlv_8x64_512( hash0, hash1, hash2, hash3,
@@ -523,9 +522,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
    {
       intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                        hash7 );
       haval256_5_8way_init( &ctx.haval );
       haval256_5_8way_update( &ctx.haval, vhash, 64 );
       haval256_5_8way_close( &ctx.haval, vhash );
       haval256_8x32_init( &ctx.haval );
       haval256_8x32_update( &ctx.haval, vhash, 64 );
       haval256_8x32_close( &ctx.haval, vhash );
       memset( &vhash[8<<3], 0, 32<<3 );
       rintrlv_8x32_8x64( vhashA, vhash, 512 );
    }
@@ -552,9 +551,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
                     hash7 );
    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );

    bmw512_8way_init( &ctx.bmw );
    bmw512_8way_update( &ctx.bmw, vhash, 64 );
    bmw512_8way_close( &ctx.bmw, state );
    bmw512_8x64_init( &ctx.bmw );
    bmw512_8x64_update( &ctx.bmw, vhash, 64 );
    bmw512_8x64_close( &ctx.bmw, state );
}

int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
@@ -606,27 +605,27 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,

union _hmq1725_4way_context_overlay
{
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    blake512_4x64_context   blake;
    bmw512_4x64_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    skein512_4x64_context   skein;
    jh512_4x64_context      jh;
    keccak512_4x64_context  keccak;
    hashState_luffa         luffa;
    luffa_2way_context      luffa2;
    cubehashParam           cube;
    cube_2way_context       cube2;
    sph_shavite512_context  shavite;
    hashState_sd            sd;
    simd512_context         simd;
    shavite512_2way_context shavite2;
    simd_2way_context       simd;
    simd_2way_context       simd_2way;
    hashState_echo          echo;
    hamsi512_4way_context   hamsi;
    hamsi512_4x64_context   hamsi;
    hashState_fugue         fugue;
    shabal512_4way_context  shabal;
    shabal512_4x32_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
    haval256_5_4way_context haval;
    sha512_4x64_context     sha512;
    haval256_4x32_context   haval;
#if defined(__VAES__)
    groestl512_2way_context groestl2;
    echo_2way_context       echo2;
@@ -653,9 +652,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;

    bmw512_4way_init( &ctx.bmw );
    bmw512_4way_update( &ctx.bmw, input, 80 );
    bmw512_4way_close( &ctx.bmw, vhash );
    bmw512_4x64_init( &ctx.bmw );
    bmw512_4x64_update( &ctx.bmw, input, 80 );
    bmw512_4x64_close( &ctx.bmw, vhash );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -687,17 +686,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
// B

    if ( h_mask & 0xffffffff )
       skein512_4way_full( &ctx.skein, vhashB, vhash, 64 );
       skein512_4x64_full( &ctx.skein, vhashB, vhash, 64 );

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

    jh512_4way_init( &ctx.jh );
    jh512_4way_update( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
    jh512_4x64_init( &ctx.jh );
    jh512_4x64_update( &ctx.jh, vhash, 64 );
    jh512_4x64_close( &ctx.jh, vhash );

    keccak512_4way_init( &ctx.keccak );
    keccak512_4way_update( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
    keccak512_4x64_init( &ctx.keccak );
    keccak512_4x64_update( &ctx.keccak, vhash, 64 );
    keccak512_4x64_close( &ctx.keccak, vhash );

// second fork, A = blake parallel, B= bmw parallel.

@@ -705,13 +704,13 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    h_mask = _mm256_movemask_epi8( vh_mask );

    if ( ( h_mask & 0xffffffff ) != 0xffffffff )
       blake512_4way_full( &ctx.blake, vhashA, vhash, 64 );
       blake512_4x64_full( &ctx.blake, vhashA, vhash, 64 );

    if ( h_mask & 0xffffffff )
    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way_update( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
       bmw512_4x64_init( &ctx.bmw );
       bmw512_4x64_update( &ctx.bmw, vhash, 64 );
       bmw512_4x64_close( &ctx.bmw, vhashB );
    }

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -734,16 +733,16 @@ extern void hmq1725_4way_hash(void *state, const void *input)

    if ( ( h_mask & 0xffffffff ) != 0xffffffff )
    {
       keccak512_4way_init( &ctx.keccak );
       keccak512_4way_update( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
       keccak512_4x64_init( &ctx.keccak );
       keccak512_4x64_update( &ctx.keccak, vhash, 64 );
       keccak512_4x64_close( &ctx.keccak, vhashA );
    }

    if ( h_mask & 0xffffffff )
    {
       jh512_4way_init( &ctx.jh );
       jh512_4way_update( &ctx.jh, vhash, 64 );
       jh512_4way_close( &ctx.jh, vhashB );
       jh512_4x64_init( &ctx.jh );
       jh512_4x64_update( &ctx.jh, vhash, 64 );
       jh512_4x64_close( &ctx.jh, vhashB );
    }

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
    shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );

    simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
    simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
    simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
    simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );

    rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );

@@ -779,9 +778,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
// B
    if ( h_mask & 0xffffffff )
    {
       haval256_5_4way_init( &ctx.haval );
       haval256_5_4way_update( &ctx.haval, vhash, 64 );
       haval256_5_4way_close( &ctx.haval, vhash );
       haval256_4x32_init( &ctx.haval );
       haval256_4x32_update( &ctx.haval, vhash, 64 );
       haval256_4x32_close( &ctx.haval, vhash );
       memset( &vhash[8<<2], 0, 32<<2 );
       rintrlv_4x32_4x64( vhashB, vhash, 512 );
    }
@@ -814,7 +813,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)

#endif

    blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
    blake512_4x64_full( &ctx.blake, vhash, vhash, 64 );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -846,9 +845,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

    hamsi512_4way_init( &ctx.hamsi );
    hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
    hamsi512_4way_close( &ctx.hamsi, vhash );
    hamsi512_4x64_init( &ctx.hamsi );
    hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
    hamsi512_4x64_close( &ctx.hamsi, vhash );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -869,47 +868,31 @@ extern void hmq1725_4way_hash(void *state, const void *input)
       echo_full( &ctx.echo, (BitSequence *)hash0, 512,
                  (const BitSequence *)hash0, 64 );
    else
    {
       init_sd( &ctx.sd, 512 );
       update_final_sd( &ctx.sd, (BitSequence *)hash0,
                        (const BitSequence *)hash0, 512 );
    }
       simd512_ctx( &ctx.simd, hash0, hash0, 64 );

    if ( hash1[0] & mask ) //4
       echo_full( &ctx.echo, (BitSequence *)hash1, 512,
                  (const BitSequence *)hash1, 64 );
    else
    {
       init_sd( &ctx.sd, 512 );
       update_final_sd( &ctx.sd, (BitSequence *)hash1,
                        (const BitSequence *)hash1, 512 );
    }
       simd512_ctx( &ctx.simd, hash1, hash1, 64 );

    if ( hash2[0] & mask ) //4
       echo_full( &ctx.echo, (BitSequence *)hash2, 512,
                  (const BitSequence *)hash2, 64 );
    else
    {
       init_sd( &ctx.sd, 512 );
       update_final_sd( &ctx.sd, (BitSequence *)hash2,
                        (const BitSequence *)hash2, 512 );
    }
       simd512_ctx( &ctx.simd, hash2, hash2, 64 );

    if ( hash3[0] & mask ) //4
       echo_full( &ctx.echo, (BitSequence *)hash3, 512,
                  (const BitSequence *)hash3, 64 );
    else
    {
       init_sd( &ctx.sd, 512 );
       update_final_sd( &ctx.sd, (BitSequence *)hash3,
                        (const BitSequence *)hash3, 512 );
    }
       simd512_ctx( &ctx.simd, hash3, hash3, 64 );

    intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

    shabal512_4way_init( &ctx.shabal );
    shabal512_4way_update( &ctx.shabal, vhash, 64 );
    shabal512_4way_close( &ctx.shabal, vhash );
    shabal512_4x32_init( &ctx.shabal );
    shabal512_4x32_update( &ctx.shabal, vhash, 64 );
    shabal512_4x32_close( &ctx.shabal, vhash );

    dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -938,9 +921,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

    if ( h_mask & 0xffffffff )
    {
       sha512_4way_init( &ctx.sha512 );
       sha512_4way_update( &ctx.sha512, vhash, 64 );
       sha512_4way_close( &ctx.sha512, vhashB );
       sha512_4x64_init( &ctx.sha512 );
       sha512_4x64_update( &ctx.sha512, vhash, 64 );
       sha512_4x64_close( &ctx.sha512, vhashB );
    }

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -967,9 +950,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

#endif

    sha512_4way_init( &ctx.sha512 );
    sha512_4way_update( &ctx.sha512, vhash, 64 );
    sha512_4way_close( &ctx.sha512, vhash );
    sha512_4x64_init( &ctx.sha512 );
    sha512_4x64_update( &ctx.sha512, vhash, 64 );
    sha512_4x64_close( &ctx.sha512, vhash );

// A = haval parallel, B = Whirlpool serial

@@ -981,9 +964,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

    if ( ( h_mask & 0xffffffff ) != 0xffffffff )
    {
       haval256_5_4way_init( &ctx.haval );
       haval256_5_4way_update( &ctx.haval, vhash, 64 );
       haval256_5_4way_close( &ctx.haval, vhash );
       haval256_4x32_init( &ctx.haval );
       haval256_4x32_update( &ctx.haval, vhash, 64 );
       haval256_4x32_close( &ctx.haval, vhash );
       memset( &vhash[8<<2], 0, 32<<2 );
       rintrlv_4x32_4x64( vhashA, vhash, 512 );
    }
@@ -1001,9 +984,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

    bmw512_4way_init( &ctx.bmw );
    bmw512_4way_update( &ctx.bmw, vhash, 64 );
    bmw512_4way_close( &ctx.bmw, state );
    bmw512_4x64_init( &ctx.bmw );
    bmw512_4x64_update( &ctx.bmw, vhash, 64 );
    bmw512_4x64_close( &ctx.bmw, state );
}

int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,

@@ -13,7 +13,7 @@

#if defined(LBRY_16WAY)

static __thread sha256_16way_context sha256_16w_mid;
static __thread sha256_16x32_context sha256_16w_mid;

void lbry_16way_hash( void* output, const void* input )
{
@@ -36,17 +36,17 @@ void lbry_16way_hash( void* output, const void* input )
   uint32_t _ALIGN(64) h13[32];
   uint32_t _ALIGN(64) h14[32];
   uint32_t _ALIGN(64) h15[32];
   sha256_16way_context ctx_sha256 __attribute__ ((aligned (64)));
   sha512_8way_context ctx_sha512;
   ripemd160_16way_context ctx_ripemd;
   sha256_16x32_context ctx_sha256 __attribute__ ((aligned (64)));
   sha512_8x64_context ctx_sha512;
   ripemd160_16x32_context ctx_ripemd;

   memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
   sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
   sha256_16way_close( &ctx_sha256, vhashA );
   sha256_16x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
   sha256_16x32_close( &ctx_sha256, vhashA );

   sha256_16way_init( &ctx_sha256 );
   sha256_16way_update( &ctx_sha256, vhashA, 32 );
   sha256_16way_close( &ctx_sha256, vhashA );
   sha256_16x32_init( &ctx_sha256 );
   sha256_16x32_update( &ctx_sha256, vhashA, 32 );
   sha256_16x32_close( &ctx_sha256, vhashA );

   // reinterleave to do sha512 4-way 64 bit twice.
   dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
@@ -54,13 +54,13 @@ void lbry_16way_hash( void* output, const void* input )
   intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
   intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );

   sha512_8way_init( &ctx_sha512 );
   sha512_8way_update( &ctx_sha512, vhashA, 32 );
   sha512_8way_close( &ctx_sha512, vhashA );
   sha512_8x64_init( &ctx_sha512 );
   sha512_8x64_update( &ctx_sha512, vhashA, 32 );
   sha512_8x64_close( &ctx_sha512, vhashA );

   sha512_8way_init( &ctx_sha512 );
   sha512_8way_update( &ctx_sha512, vhashB, 32 );
   sha512_8way_close( &ctx_sha512, vhashB );
   sha512_8x64_init( &ctx_sha512 );
   sha512_8x64_update( &ctx_sha512, vhashB, 32 );
   sha512_8x64_close( &ctx_sha512, vhashB );

   // back to 8-way 32 bit
   dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
@@ -68,22 +68,22 @@ void lbry_16way_hash( void* output, const void* input )
   intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
                 h8, h9, h10, h11, h12, h13, h14, h15, 512 );

   ripemd160_16way_init( &ctx_ripemd );
   ripemd160_16way_update( &ctx_ripemd, vhashA, 32 );
   ripemd160_16way_close( &ctx_ripemd, vhashB );
   ripemd160_16x32_init( &ctx_ripemd );
   ripemd160_16x32_update( &ctx_ripemd, vhashA, 32 );
   ripemd160_16x32_close( &ctx_ripemd, vhashB );

   ripemd160_16way_init( &ctx_ripemd );
   ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 );
   ripemd160_16way_close( &ctx_ripemd, vhashC );
   ripemd160_16x32_init( &ctx_ripemd );
   ripemd160_16x32_update( &ctx_ripemd, vhashA+(8<<4), 32 );
   ripemd160_16x32_close( &ctx_ripemd, vhashC );

   sha256_16way_init( &ctx_sha256 );
   sha256_16way_update( &ctx_sha256, vhashB, 20 );
   sha256_16way_update( &ctx_sha256, vhashC, 20 );
   sha256_16way_close( &ctx_sha256, vhashA );
   sha256_16x32_init( &ctx_sha256 );
   sha256_16x32_update( &ctx_sha256, vhashB, 20 );
   sha256_16x32_update( &ctx_sha256, vhashC, 20 );
   sha256_16x32_close( &ctx_sha256, vhashA );

   sha256_16way_init( &ctx_sha256 );
   sha256_16way_update( &ctx_sha256, vhashA, 32 );
   sha256_16way_close( &ctx_sha256, output );
   sha256_16x32_init( &ctx_sha256 );
   sha256_16x32_update( &ctx_sha256, vhashA, 32 );
   sha256_16x32_close( &ctx_sha256, output );
}

int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
@@ -115,8 +115,8 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
   intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
          edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );

   sha256_16way_init( &sha256_16w_mid );
   sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
   sha256_16x32_init( &sha256_16w_mid );
   sha256_16x32_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );

   do
   {
@@ -144,7 +144,7 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,

#elif defined(LBRY_8WAY)

static __thread sha256_8way_context sha256_8w_mid;
static __thread sha256_8x32_context sha256_8w_mid;

void lbry_8way_hash( void* output, const void* input )
{
@@ -159,52 +159,52 @@ void lbry_8way_hash( void* output, const void* input )
   uint32_t _ALIGN(32) h5[32];
   uint32_t _ALIGN(32) h6[32];
   uint32_t _ALIGN(32) h7[32];
   sha256_8way_context ctx_sha256 __attribute__ ((aligned (64)));
   sha512_4way_context ctx_sha512;
   ripemd160_8way_context ctx_ripemd;
   sha256_8x32_context ctx_sha256 __attribute__ ((aligned (64)));
   sha512_4x64_context ctx_sha512;
   ripemd160_8x32_context ctx_ripemd;

   memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
   sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
   sha256_8way_close( &ctx_sha256, vhashA );
   sha256_8x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
   sha256_8x32_close( &ctx_sha256, vhashA );

   sha256_8way_init( &ctx_sha256 );
   sha256_8way_update( &ctx_sha256, vhashA, 32 );
   sha256_8way_close( &ctx_sha256, vhashA );
   sha256_8x32_init( &ctx_sha256 );
   sha256_8x32_update( &ctx_sha256, vhashA, 32 );
   sha256_8x32_close( &ctx_sha256, vhashA );

   // reinterleave to do sha512 4-way 64 bit twice.
   dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
   intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
   intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );

   sha512_4way_init( &ctx_sha512 );
   sha512_4way_update( &ctx_sha512, vhashA, 32 );
   sha512_4way_close( &ctx_sha512, vhashA );
   sha512_4x64_init( &ctx_sha512 );
   sha512_4x64_update( &ctx_sha512, vhashA, 32 );
   sha512_4x64_close( &ctx_sha512, vhashA );

   sha512_4way_init( &ctx_sha512 );
   sha512_4way_update( &ctx_sha512, vhashB, 32 );
   sha512_4way_close( &ctx_sha512, vhashB );
   sha512_4x64_init( &ctx_sha512 );
   sha512_4x64_update( &ctx_sha512, vhashB, 32 );
   sha512_4x64_close( &ctx_sha512, vhashB );

   // back to 8-way 32 bit
   dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
   dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
   intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );

   ripemd160_8way_init( &ctx_ripemd );
   ripemd160_8way_update( &ctx_ripemd, vhashA, 32 );
   ripemd160_8way_close( &ctx_ripemd, vhashB );
   ripemd160_8x32_init( &ctx_ripemd );
   ripemd160_8x32_update( &ctx_ripemd, vhashA, 32 );
   ripemd160_8x32_close( &ctx_ripemd, vhashB );

   ripemd160_8way_init( &ctx_ripemd );
   ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 );
   ripemd160_8way_close( &ctx_ripemd, vhashC );
   ripemd160_8x32_init( &ctx_ripemd );
   ripemd160_8x32_update( &ctx_ripemd, vhashA+(8<<3), 32 );
   ripemd160_8x32_close( &ctx_ripemd, vhashC );

   sha256_8way_init( &ctx_sha256 );
   sha256_8way_update( &ctx_sha256, vhashB, 20 );
   sha256_8way_update( &ctx_sha256, vhashC, 20 );
   sha256_8way_close( &ctx_sha256, vhashA );
   sha256_8x32_init( &ctx_sha256 );
   sha256_8x32_update( &ctx_sha256, vhashB, 20 );
   sha256_8x32_update( &ctx_sha256, vhashC, 20 );
   sha256_8x32_close( &ctx_sha256, vhashA );

   sha256_8way_init( &ctx_sha256 );
   sha256_8way_update( &ctx_sha256, vhashA, 32 );
   sha256_8way_close( &ctx_sha256, output );
   sha256_8x32_init( &ctx_sha256 );
   sha256_8x32_update( &ctx_sha256, vhashA, 32 );
   sha256_8x32_close( &ctx_sha256, output );
}

int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
@@ -235,8 +235,8 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   intrlv_8x32( vdata, edata, edata, edata, edata,
                       edata, edata, edata, edata, 1024 );

   sha256_8way_init( &sha256_8w_mid );
   sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
   sha256_8x32_init( &sha256_8w_mid );
   sha256_8x32_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );

   do
   {

@@ -57,7 +57,7 @@ do{ \
#define ROUND2(a, b, c, d, e, f, s, r, k) \
   RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)

static void ripemd160_4way_round( ripemd160_4way_context *sc )
static void ripemd160_4x32_round( ripemd160_4x32_context *sc )
{
   const __m128i *in = (__m128i*)sc->buf;
   __m128i *h = (__m128i*)sc->val;
@@ -249,7 +249,7 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc )
   h[0] = tmp;
}

void ripemd160_4way_init( ripemd160_4way_context *sc )
void ripemd160_4x32_init( ripemd160_4x32_context *sc )
{
   sc->val[0] = _mm_set1_epi64x( 0x6745230167452301 );
   sc->val[1] = _mm_set1_epi64x( 0xEFCDAB89EFCDAB89 );
@@ -259,7 +259,7 @@ void ripemd160_4way_init( ripemd160_4way_context *sc )
   sc->count_high = sc->count_low = 0;
}

void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data,
                            size_t len )
{
   __m128i *vdata = (__m128i*)data;
@@ -281,7 +281,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
      len -= clen;
      if ( ptr == block_size )
      {
         ripemd160_4way_round( sc );
         ripemd160_4x32_round( sc );
         ptr = 0;
      }
      clow = sc->count_low;
@@ -292,7 +292,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
   }
}

void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst )
{
   unsigned ptr, u;
   uint32_t low, high;
@@ -306,7 +306,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
   if ( ptr > pad )
   {
      memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
      ripemd160_4way_round( sc );
      ripemd160_4x32_round( sc );
      memset_zero_128( sc->buf, pad>>2 );
   }
   else
@@ -317,7 +317,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
   low = low << 3;
   sc->buf[ pad>>2 ] = _mm_set1_epi32( low );
   sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
   ripemd160_4way_round( sc );
   ripemd160_4x32_round( sc );
   for (u = 0; u < 5; u ++)
      casti_v128u32( dst, u ) = sc->val[u];
}
@@ -357,7 +357,7 @@ do{ \
#define ROUND2_8W(a, b, c, d, e, f, s, r, k) \
   RR_8W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)

static void ripemd160_8way_round( ripemd160_8way_context *sc )
static void ripemd160_8x32_round( ripemd160_8x32_context *sc )
{
   const __m256i *in = (__m256i*)sc->buf;
   __m256i *h = (__m256i*)sc->val;
@@ -550,7 +550,7 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc )
}


void ripemd160_8way_init( ripemd160_8way_context *sc )
void ripemd160_8x32_init( ripemd160_8x32_context *sc )
{
   sc->val[0] = _mm256_set1_epi64x( 0x6745230167452301 );
   sc->val[1] = _mm256_set1_epi64x( 0xEFCDAB89EFCDAB89 );
@@ -560,7 +560,7 @@ void ripemd160_8way_init( ripemd160_8way_context *sc )
   sc->count_high = sc->count_low = 0;
}

void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data,
                            size_t len )
{
   __m256i *vdata = (__m256i*)data;
@@ -582,7 +582,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
      len -= clen;
      if ( ptr == block_size )
      {
         ripemd160_8way_round( sc );
         ripemd160_8x32_round( sc );
         ptr = 0;
      }
      clow = sc->count_low;
@@ -593,7 +593,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
   }
}

void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst )
{
   unsigned ptr, u;
   uint32_t low, high;
@@ -607,7 +607,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
   if ( ptr > pad )
   {
      memset_zero_256( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
      ripemd160_8way_round( sc );
      ripemd160_8x32_round( sc );
      memset_zero_256( sc->buf, pad>>2 );
   }
   else
@@ -618,7 +618,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
   low = low << 3;
   sc->buf[ pad>>2 ] = _mm256_set1_epi32( low );
   sc->buf[ (pad>>2) + 1 ] = _mm256_set1_epi32( high );
   ripemd160_8way_round( sc );
   ripemd160_8x32_round( sc );
   for (u = 0; u < 5; u ++)
      casti_m256i( dst, u ) = sc->val[u];
}
@@ -629,7 +629,6 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )

// RIPEMD-160 16 way


#define F16W_1(x, y, z) \
   _mm512_xor_si512( _mm512_xor_si512( x, y ), z )

@@ -659,7 +658,7 @@ do{ \
#define ROUND2_16W(a, b, c, d, e, f, s, r, k) \
   RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)

static void ripemd160_16way_round( ripemd160_16way_context *sc )
static void ripemd160_16x32_round( ripemd160_16x32_context *sc )
{
   const __m512i *in = (__m512i*)sc->buf;
   __m512i *h = (__m512i*)sc->val;
@@ -851,7 +850,7 @@ static void ripemd160_16way_round( ripemd160_16way_context *sc )
   h[0] = tmp;
}

void ripemd160_16way_init( ripemd160_16way_context *sc )
void ripemd160_16x32_init( ripemd160_16x32_context *sc )
{
   sc->val[0] = _mm512_set1_epi64( 0x6745230167452301 );
   sc->val[1] = _mm512_set1_epi64( 0xEFCDAB89EFCDAB89 );
@@ -861,7 +860,7 @@ void ripemd160_16way_init( ripemd160_16way_context *sc )
   sc->count_high = sc->count_low = 0;
}

void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data,
                             size_t len )
{
   __m512i *vdata = (__m512i*)data;
@@ -883,7 +882,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
      len -= clen;
      if ( ptr == block_size )
      {
         ripemd160_16way_round( sc );
         ripemd160_16x32_round( sc );
         ptr = 0;
      }
      clow = sc->count_low;
@@ -894,7 +893,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
   }
}

void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst )
{
   unsigned ptr, u;
   uint32_t low, high;
@@ -908,7 +907,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
   if ( ptr > pad )
   {
      memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
      ripemd160_16way_round( sc );
      ripemd160_16x32_round( sc );
      memset_zero_512( sc->buf, pad>>2 );
   }
   else
@@ -919,7 +918,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
   low = low << 3;
   sc->buf[ pad>>2 ] = _mm512_set1_epi32( low );
   sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
   ripemd160_16way_round( sc );
   ripemd160_16x32_round( sc );
   for (u = 0; u < 5; u ++)
      casti_m512i( dst, u ) = sc->val[u];
}

@@ -12,12 +12,12 @@ typedef struct
   __m128i buf[64>>2];
   __m128i val[5];
   uint32_t count_high, count_low;
} __attribute__ ((aligned (64))) ripemd160_4way_context;
} __attribute__ ((aligned (64))) ripemd160_4x32_context;

void ripemd160_4way_init( ripemd160_4way_context *sc );
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
void ripemd160_4x32_init( ripemd160_4x32_context *sc );
void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data,
                            size_t len );
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst );

#if defined (__AVX2__)

@@ -26,12 +26,12 @@ typedef struct
   __m256i buf[64>>2];
   __m256i val[5];
   uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_8way_context;
} __attribute__ ((aligned (128))) ripemd160_8x32_context;

void ripemd160_8way_init( ripemd160_8way_context *sc );
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
void ripemd160_8x32_init( ripemd160_8x32_context *sc );
void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data,
                            size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst );

#if defined(SIMD512)

@@ -40,12 +40,12 @@ typedef struct
   __m512i buf[64>>2];
   __m512i val[5];
   uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_16way_context;
} __attribute__ ((aligned (128))) ripemd160_16x32_context;

void ripemd160_16way_init( ripemd160_16way_context *sc );
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
void ripemd160_16x32_init( ripemd160_16x32_context *sc );
void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data,
                             size_t len );
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst );

#endif // AVX512
#endif // __AVX2__

@@ -46,7 +46,7 @@
#endif

#ifdef __GNUC__
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
  #define ASM 0
#else
  #define ASM 1
@@ -597,6 +597,45 @@ static void blake2s_compress(blake2s_state *S, const void *buf) {
  v[13] = S->t[1] ^ blake2s_IV[5];
  v[14] = S->f[0] ^ blake2s_IV[6];
  v[15] = S->f[1] ^ blake2s_IV[7];

#if defined(__SSE2__) || defined(__ARM_NEON)

  v128_t *V = (v128_t*)v;

#define ROUND( r ) \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                 m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
                 m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                 m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
                 m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
   V[0] = v128_shufll32( V[0] ); \
   V[3] = v128_swap64( V[3] ); \
   V[2] = v128_shuflr32( V[2] ); \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                 m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
                 m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                 m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
                 m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
   V[0] = v128_shuflr32( V[0] ); \
   V[3] = v128_swap64( V[3] ); \
   V[2] = v128_shufll32( V[2] )

#else

#define G(r,i,a,b,c,d) \
   do { \
      a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -619,6 +658,9 @@ static void blake2s_compress(blake2s_state *S, const void *buf) {
      G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
      G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
   } while(0)

#endif

   ROUND(0);
   ROUND(1);
   ROUND(2);

@@ -37,8 +37,8 @@

#if defined(SIMD512)
  #define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
  #define SCRYPT_THROUGHPUT 2
//#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
//  #define SCRYPT_THROUGHPUT 2
#elif defined(__AVX2__)
  #define SCRYPT_THROUGHPUT 8
#elif defined(__SSE2__) || defined(__ARM_NEON)
@@ -162,7 +162,7 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
}

#endif // throughput 1
//

#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)

static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
@@ -274,9 +274,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,

#endif // SHA




static const uint32_t keypad_4way[ 4*12 ] __attribute((aligned(32))) =
{
   0x80000000, 0x80000000, 0x80000000, 0x80000000,
@@ -339,7 +336,7 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = {
};
*/

static inline void sha256_4way_init_state( void *state )
static inline void sha256_4x32_init_state( void *state )
{
   casti_v128( state, 0 ) = v128_32( 0x6A09E667 );
   casti_v128( state, 1 ) = v128_32( 0xBB67AE85 );
@@ -362,21 +359,21 @@ static inline void HMAC_SHA256_80_init_4way( const uint32_t *key,
   memcpy( pad, key + 4*16, 4*16 );
   memcpy( pad + 4*4, keypad_4way, 4*48 );

   sha256_4way_transform_le( (v128_t*)ihash, (v128_t*)pad,
   sha256_4x32_transform_le( (v128_t*)ihash, (v128_t*)pad,
                             (const v128_t*)tstate );

   sha256_4way_init_state( tstate );
   sha256_4x32_init_state( tstate );

   for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
   for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c;

   sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)pad,
   sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)pad,
                             (const v128_t*)tstate );

   for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
   for ( ; i < 4*16; i++ ) pad[i] = 0x36363636;

   sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)pad,
   sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)pad,
                             (const v128_t*)tstate );
}

@@ -389,7 +386,7 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
   uint32_t _ALIGN(16) obuf[4 * 16];
   int i, j;

   sha256_4way_transform_le( (v128_t*)istate, (v128_t*)salt,
   sha256_4x32_transform_le( (v128_t*)istate, (v128_t*)salt,
                             (const v128_t*)tstate );

   memcpy(ibuf, salt + 4 * 16, 4 * 16);
@@ -403,10 +400,10 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
      ibuf[4 * 4 + 2] = i + 1;
      ibuf[4 * 4 + 3] = i + 1;

      sha256_4way_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
      sha256_4x32_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
                                (const v128_t*)istate );

      sha256_4way_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
      sha256_4x32_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
                                (const v128_t*)ostate );

      for ( j = 0; j < 4 * 8; j++ )
@@ -421,9 +418,9 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
   uint32_t _ALIGN(64) buf[4 * 16];
   int i;

   sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)salt,
   sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)salt,
                             (const v128_t*)tstate );
   sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
   sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
                             (const v128_t*)tstate );

   final[ 0] = v128_32( 0x00000001 );
@@ -434,20 +431,20 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
            = v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128();
   final[15] = v128_32 ( 0x00000620 );

   sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)final,
   sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)final,
                             (const v128_t*)tstate );

   memcpy(buf, tstate, 4 * 32);
   memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);

   sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)buf,
   sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)buf,
                             (const v128_t*)ostate );

   for ( i = 0; i < 4 * 8; i++ )
      output[i] = bswap_32( ostate[i] );
}

#ifdef HAVE_SHA256_8WAY
#if defined(__AVX2__)

/*
static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
@@ -470,7 +467,7 @@ static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
};
*/

static inline void sha256_8way_init_state( void *state )
static inline void sha256_8x32_init_state( void *state )
{
   casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 );
   casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 );
@@ -494,21 +491,21 @@ static inline void HMAC_SHA256_80_init_8way( const uint32_t *key,
   memset( pad + 8*5, 0x00, 8*40 );
   for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280;

   sha256_8way_transform_le( (__m256i*)ihash, (__m256i*)pad,
   sha256_8x32_transform_le( (__m256i*)ihash, (__m256i*)pad,
                             (const __m256i*)tstate );

   sha256_8way_init_state( tstate );
   sha256_8x32_init_state( tstate );

   for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
   for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c;

   sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)pad,
   sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)pad,
                             (const __m256i*)tstate );

   for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
   for ( ; i < 8*16; i++ ) pad[i] = 0x36363636;

   sha256_8way_transform_le( (__m256i*)tstate, (__m256i*)pad,
   sha256_8x32_transform_le( (__m256i*)tstate, (__m256i*)pad,
                             (const __m256i*)tstate );
}

@@ -521,7 +518,7 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate,
   uint32_t _ALIGN(32) obuf[8 * 16];
   int i, j;

   sha256_8way_transform_le( (__m256i*)istate, (__m256i*)salt,
   sha256_8x32_transform_le( (__m256i*)istate, (__m256i*)salt,
                             (const __m256i*)tstate );

   memcpy( ibuf, salt + 8*16, 8*16 );
@@ -544,10 +541,10 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate,
      ibuf[8 * 4 + 6] = i + 1;
      ibuf[8 * 4 + 7] = i + 1;

      sha256_8way_transform_le( (__m256i*)obuf, (__m256i*)ibuf,
      sha256_8x32_transform_le( (__m256i*)obuf, (__m256i*)ibuf,
                                (const __m256i*)istate );

      sha256_8way_transform_le( (__m256i*)ostate2, (__m256i*)obuf,
      sha256_8x32_transform_le( (__m256i*)ostate2, (__m256i*)obuf,
                                (const __m256i*)ostate );

      for ( j = 0; j < 8*8; j++ )
@@ -562,9 +559,9 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
   uint32_t _ALIGN(128) buf[ 8*16 ];
   int i;

   sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)salt,
   sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)salt,
                             (const __m256i*)tstate );
   sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16),
   sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16),
                             (const __m256i*)tstate );

   final[ 0] = _mm256_set1_epi32( 0x00000001 );
@@ -575,7 +572,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
            = _mm256_setzero_si256();
   final[15] = _mm256_set1_epi32 ( 0x00000620 );

   sha256_8way_transform_le( (__m256i*)tstate, final,
   sha256_8x32_transform_le( (__m256i*)tstate, final,
                             (const __m256i*)tstate );

   memcpy( buf, tstate, 8*32 );
@@ -583,18 +580,18 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
   memset( buf + 8*9, 0x00, 8*24 );
   for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300;

   sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)buf,
   sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)buf,
                             (const __m256i*)ostate );

   for (i = 0; i < 8 * 8; i++)
      output[i] = bswap_32(ostate[i]);
}

#endif /* HAVE_SHA256_8WAY */
#endif //AVX2

#if defined(SIMD512)

static inline void sha256_16way_init_state( void *state )
static inline void sha256_16x32_init_state( void *state )
{
   casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 );
   casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 );
@@ -618,21 +615,21 @@ static inline void HMAC_SHA256_80_init_16way( const uint32_t *key,
   memset( pad + 16*5, 0x00, 16*40 );
   for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280;

   sha256_16way_transform_le( (__m512i*)ihash, (__m512i*)pad,
   sha256_16x32_transform_le( (__m512i*)ihash, (__m512i*)pad,
                              (const __m512i*)tstate );

   sha256_16way_init_state( tstate );
   sha256_16x32_init_state( tstate );

   for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
   for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c;

   sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)pad,
   sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)pad,
                              (const __m512i*)tstate );

   for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
   for ( ; i < 16*16; i++ ) pad[i] = 0x36363636;

   sha256_16way_transform_le( (__m512i*)tstate, (__m512i*)pad,
   sha256_16x32_transform_le( (__m512i*)tstate, (__m512i*)pad,
                              (const __m512i*)tstate );
}

@@ -645,7 +642,7 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate,
   uint32_t _ALIGN(128) ostate2[ 16*8 ];
   int i, j;

   sha256_16way_transform_le( (__m512i*)istate, (__m512i*)salt,
   sha256_16x32_transform_le( (__m512i*)istate, (__m512i*)salt,
                              (const __m512i*)tstate );

   memcpy( ibuf, salt + 16*16, 16*16 );
@@ -676,10 +673,10 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate,
      ibuf[ 16*4 + 14 ] = i + 1;
      ibuf[ 16*4 + 15 ] = i + 1;

      sha256_16way_transform_le( (__m512i*)obuf, (__m512i*)ibuf,
      sha256_16x32_transform_le( (__m512i*)obuf, (__m512i*)ibuf,
                                 (const __m512i*)istate );

      sha256_16way_transform_le( (__m512i*)ostate2, (__m512i*)obuf,
      sha256_16x32_transform_le( (__m512i*)ostate2, (__m512i*)obuf,
                                 (const __m512i*)ostate );

      for ( j = 0; j < 16*8; j++ )
@@ -694,9 +691,9 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
   uint32_t _ALIGN(128) buf[ 16*16 ];
   int i;

   sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)salt,
   sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)salt,
                              (const __m512i*)tstate );
   sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16),
   sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16),
                              (const __m512i*)tstate );

   final[ 0] = _mm512_set1_epi32( 0x00000001 );
@@ -707,7 +704,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
            = _mm512_setzero_si512();
   final[15] = _mm512_set1_epi32 ( 0x00000620 );

   sha256_16way_transform_le( (__m512i*)tstate, final,
   sha256_16x32_transform_le( (__m512i*)tstate, final,
                              (const __m512i*)tstate );

   memcpy( buf, tstate, 16*32 );
@@ -715,7 +712,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
   memset( buf + 16*9, 0x00, 16*24 );
   for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300;

   sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)buf,
   sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)buf,
                              (const __m512i*)ostate );

   for ( i = 0; i < 16*8; i++ )
@@ -724,25 +721,10 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,

#endif // AVX512

#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);

#if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#endif

#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
#endif

#include "scrypt-core-4way.h"

/*
#if ( SCRYPT_THROUGHPUT == 1 )

static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
                              uint32_t *midstate, int N, int thr_id )
{
@@ -752,15 +734,12 @@ static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
   memcpy(tstate, midstate, 32);
   HMAC_SHA256_80_init(input, tstate, ostate);
   PBKDF2_SHA256_80_128(tstate, ostate, input, X);

   scrypt_core_simd128( X, scratchbuf, N );   // woring
//   scrypt_core_1way( X, V, N );  // working
//   scrypt_core(X, V, N);

   scrypt_core_1way( X, scratchbuf, N );
   PBKDF2_SHA256_128_32(tstate, ostate, X, output);
   return true;
}
*/

#endif

#if ( SCRYPT_THROUGHPUT == 8 )

@@ -1201,20 +1180,6 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_simd128_2buf( X+448, V, N );
********************/
/*
   scrypt_core_3way( X, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_3way( X+ 96, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_2buf( X+192, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_3way( X+256, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_3way( X+352, V, N );
   if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+448, V, N );
|
||||
*/
|
||||
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
@@ -1265,7 +1230,8 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input,
|
||||
|
||||
#if ( SCRYPT_THROUGHPUT == 4 )
|
||||
|
||||
#if defined(__SHA__)
|
||||
#if 0
|
||||
//#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
||||
|
||||
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
|
||||
uint32_t *midstate, int N, int thrid )
|
||||
@@ -1279,6 +1245,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
|
||||
memcpy( tstate+16, midstate, 32 );
|
||||
memcpy( tstate+24, midstate, 32 );
|
||||
|
||||
HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8,
|
||||
ostate, ostate+8 );
|
||||
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
|
||||
input, input+20, W, W+32 );
|
||||
HMAC_SHA256_80_init_SHA_2BUF( input+40, input+60, tstate+16, tstate+24,
|
||||
ostate+16, ostate+24 );
|
||||
PBKDF2_SHA256_80_128_SHA_2BUF( tstate+16, tstate+24, ostate+16, ostate+24,
|
||||
input+40, input+60, W+64, W+96 );
|
||||
/*
|
||||
HMAC_SHA256_80_init( input, tstate, ostate );
|
||||
PBKDF2_SHA256_80_128( tstate, ostate, input, W );
|
||||
HMAC_SHA256_80_init( input +20, tstate+ 8, ostate+ 8 );
|
||||
@@ -1287,7 +1262,7 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
|
||||
PBKDF2_SHA256_80_128( tstate+16, ostate+16, input +40, W+64 );
|
||||
HMAC_SHA256_80_init( input +60, tstate+24, ostate+24 );
|
||||
PBKDF2_SHA256_80_128( tstate+24, ostate+24, input +60, W+96 );
|
||||
|
||||
*/
|
||||
/*
|
||||
// Working Linear single threaded SIMD
|
||||
scrypt_core_simd128( W, V, N );
|
||||
@@ -1313,16 +1288,20 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+ 8, ostate, ostate+ 8,
|
||||
W, W+32, output, output+ 8 );
|
||||
PBKDF2_SHA256_128_32_SHA_2BUF( tstate+16, tstate+24, ostate+16, ostate+24,
|
||||
W+64, W+96, output+16, output+24 );
|
||||
/*
|
||||
PBKDF2_SHA256_128_32( tstate, ostate, W, output );
|
||||
PBKDF2_SHA256_128_32( tstate+ 8, ostate+ 8, W+32, output+ 8 );
|
||||
PBKDF2_SHA256_128_32( tstate+16, ostate+16, W+64, output+16 );
|
||||
PBKDF2_SHA256_128_32( tstate+24, ostate+24, W+96, output+24 );
|
||||
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
|
||||
#else
|
||||
// SSE2
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
|
||||
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
|
||||
uint32_t *midstate, int N, int thrid )
|
||||
@@ -1426,13 +1405,13 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
|
||||
rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
|
||||
thr_id );
|
||||
#elif ( SCRYPT_THROUGHPUT == 4 )
|
||||
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
||||
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
|
||||
thr_id );
|
||||
#else
|
||||
// #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
||||
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
|
||||
// thr_id );
|
||||
// #else
|
||||
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
|
||||
thr_id );
|
||||
#endif
|
||||
// #endif
|
||||
#elif ( SCRYPT_THROUGHPUT == 2 ) && ( defined(__SHA__) || defined(__ARM_FEATURE_SHA2) )
|
||||
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
|
||||
thr_id );
|
||||
@@ -1480,42 +1459,38 @@ bool scrypt_miner_thread_init( int thr_id )
|
||||
|
||||
bool register_scrypt_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
||||
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
|
||||
#else
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
#endif
|
||||
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_scrypt;
|
||||
opt_target_factor = 65536.0;
|
||||
opt_param_n = opt_param_n ? opt_param_n : 1024;
|
||||
applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n );
|
||||
|
||||
// scrypt_throughput defined at compile time and used to replace
|
||||
// MAX_WAYS to reduce memory usage.
|
||||
|
||||
#if defined(SIMD512)
|
||||
// scrypt_throughput = 16;
|
||||
switch ( SCRYPT_THROUGHPUT )
|
||||
{
|
||||
case 16: // AVX512
|
||||
if ( opt_param_n > 0x4000 )
|
||||
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
||||
else
|
||||
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
|
||||
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
||||
// scrypt_throughput = 2;
|
||||
break;
|
||||
case 2: // SHA256
|
||||
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
|
||||
#elif defined(__AVX2__)
|
||||
// scrypt_throughput = 8;
|
||||
break;
|
||||
case 8: // AVX2
|
||||
if ( opt_param_n > 0x4000 )
|
||||
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
||||
else
|
||||
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
|
||||
#else
|
||||
// scrypt_throughput = 4;
|
||||
break;
|
||||
case 4: // SSE2, NEON
|
||||
if ( opt_param_n > 0x4000 )
|
||||
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
|
||||
else
|
||||
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
|
||||
#endif
|
||||
break;
|
||||
default:
|
||||
scratchbuf_size = opt_param_n * 128; // 1 way
|
||||
}
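// Worked sizing example (editor's note, not part of this commit): each
// scrypt lane needs N * 128 bytes of V for r = 1. With the default
// N = 1024 and SCRYPT_THROUGHPUT = 4 the switch above selects the 4 way
// size, scratchbuf_size = 1024 * 4 * 128 = 524288 bytes = 512 KiB per
// miner thread. The N > 0x4000 cases select the multi-buffer cores,
// which run fewer concurrent lanes and so allocate less per thread.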

char t_units[4] = {0};
char d_units[4] = {0};

@@ -31,7 +31,7 @@
#include "hmac-sha256-hash-4way.h"
#include "compat.h"

#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
// HMAC 4-way SSE2

/**
@@ -62,30 +62,30 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_4way_init( &ctx->ictx );
sha256_4way_update( &ctx->ictx, K, Klen );
sha256_4way_close( &ctx->ictx, khash );
sha256_4x32_init( &ctx->ictx );
sha256_4x32_update( &ctx->ictx, K, Klen );
sha256_4x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}

/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_4way_init( &ctx->ictx );
sha256_4x32_init( &ctx->ictx );
memset( pad, 0x36, 64*4 );

for ( i = 0; i < Klen; i++ )
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );

sha256_4way_update( &ctx->ictx, pad, 64 );
sha256_4x32_update( &ctx->ictx, pad, 64 );

/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_4way_init( &ctx->octx );
sha256_4x32_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
sha256_4x32_update( &ctx->octx, pad, 64 );
}
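
// Editor's scalar reference for the pad setup above (illustrative sketch,
// not part of this commit): HMAC-SHA256 preloads one block into each of
// the inner and outer contexts, K xor ipad (0x36) and K xor opad (0x5c),
// with the key zero padded to the 64 byte block size.
#include <stdint.h>
#include <string.h>

static void hmac_sha256_pads_sketch( const uint8_t *key, size_t klen,
                                     uint8_t ipad[64], uint8_t opad[64] )
{
   memset( ipad, 0x36, 64 );            // block of 0x36
   memset( opad, 0x5c, 64 );            // block of 0x5c
   for ( size_t i = 0; i < klen; i++ )  // klen <= 64 here; longer keys
   {                                    // are first replaced by SHA256(key)
      ipad[i] ^= key[i];
      opad[i] ^= key[i];
   }
}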

/* Add bytes to the HMAC-SHA256 operation. */
@@ -94,7 +94,7 @@ hmac_sha256_4way_update( hmac_sha256_4way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_4way_update( &ctx->ictx, in, len );
sha256_4x32_update( &ctx->ictx, in, len );
}

/* Finish an HMAC-SHA256 operation. */
@@ -104,13 +104,13 @@ hmac_sha256_4way_close( hmac_sha256_4way_context *ctx, void *digest )
unsigned char ihash[32*4] __attribute__ ((aligned (64)));

/* Finish the inner SHA256 operation. */
sha256_4way_close( &ctx->ictx, ihash );
sha256_4x32_close( &ctx->ictx, ihash );

/* Feed the inner hash to the outer SHA256 operation. */
sha256_4way_update( &ctx->octx, ihash, 32 );
sha256_4x32_update( &ctx->octx, ihash, 32 );

/* Finish the outer SHA256 operation. */
sha256_4way_close( &ctx->octx, digest );
sha256_4x32_close( &ctx->octx, digest );
}

/**
@@ -126,7 +126,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
hmac_sha256_4way_context PShctx, hctx;
uint8_t _ALIGN(128) T[32*4];
uint8_t _ALIGN(128) U[32*4];
__m128i ivec;
v128u32_t ivec;
size_t i, clen;
uint64_t j;
int k;
@@ -139,7 +139,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
for ( i = 0; i * 32 < dkLen; i++ )
{
/* Generate INT(i + 1). */
ivec = _mm_set1_epi32( bswap_32( i+1 ) );
ivec = v128_32( bswap_32( i+1 ) );

/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy( &hctx, &PShctx, sizeof(hmac_sha256_4way_context) );
@@ -158,7 +158,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,

/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( T, k ) = v128_xor( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
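
// Editor's note (not part of this commit): the loop above implements the
// PBKDF2 block function F(P, S, c, i) = U_1 ^ U_2 ^ ... ^ U_c, where
// U_1 = HMAC-SHA256(P, S || INT(i+1)) and U_j = HMAC-SHA256(P, U_{j-1}).
// INT() is a 32 bit big endian block counter, which is why ivec is built
// with bswap_32 on this little endian code path. The 4 way version runs
// four independent lanes of the same recurrence in each 128 bit register.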

@@ -199,30 +199,30 @@ hmac_sha256_8way_init( hmac_sha256_8way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_8way_init( &ctx->ictx );
sha256_8way_update( &ctx->ictx, K, Klen );
sha256_8way_close( &ctx->ictx, khash );
sha256_8x32_init( &ctx->ictx );
sha256_8x32_update( &ctx->ictx, K, Klen );
sha256_8x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}

/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_8way_init( &ctx->ictx );
sha256_8x32_init( &ctx->ictx );
memset( pad, 0x36, 64*8);

for ( i = 0; i < Klen/4; i++ )
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
casti_m256i( K, i ) );

sha256_8way_update( &ctx->ictx, pad, 64 );
sha256_8x32_update( &ctx->ictx, pad, 64 );

/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_8way_init( &ctx->octx );
sha256_8x32_init( &ctx->octx );
memset( pad, 0x5c, 64*8 );
for ( i = 0; i < Klen/4; i++ )
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
casti_m256i( K, i ) );
sha256_8way_update( &ctx->octx, pad, 64 );
sha256_8x32_update( &ctx->octx, pad, 64 );
}

void
@@ -230,7 +230,7 @@ hmac_sha256_8way_update( hmac_sha256_8way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_8way_update( &ctx->ictx, in, len );
sha256_8x32_update( &ctx->ictx, in, len );
}

/* Finish an HMAC-SHA256 operation. */
@@ -240,13 +240,13 @@ hmac_sha256_8way_close( hmac_sha256_8way_context *ctx, void *digest )
unsigned char ihash[32*8] __attribute__ ((aligned (128)));

/* Finish the inner SHA256 operation. */
sha256_8way_close( &ctx->ictx, ihash );
sha256_8x32_close( &ctx->ictx, ihash );

/* Feed the inner hash to the outer SHA256 operation. */
sha256_8way_update( &ctx->octx, ihash, 32 );
sha256_8x32_update( &ctx->octx, ihash, 32 );

/* Finish the outer SHA256 operation. */
sha256_8way_close( &ctx->octx, digest );
sha256_8x32_close( &ctx->octx, digest );
}

/**
@@ -332,21 +332,21 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_16way_init( &ctx->ictx );
sha256_16way_update( &ctx->ictx, K, Klen );
sha256_16way_close( &ctx->ictx, khash );
sha256_16x32_init( &ctx->ictx );
sha256_16x32_update( &ctx->ictx, K, Klen );
sha256_16x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}

/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_16way_init( &ctx->ictx );
sha256_16x32_init( &ctx->ictx );
memset( pad, 0x36, 64*16 );

for ( i = 0; i < Klen; i++ )
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
casti_m512i( K, i ) );
sha256_16way_update( &ctx->ictx, pad, 64 );
sha256_16x32_update( &ctx->ictx, pad, 64 );

/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_16way_init( &ctx->octx );
@@ -354,7 +354,7 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
for ( i = 0; i < Klen/4; i++ )
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
casti_m512i( K, i ) );
sha256_16way_update( &ctx->octx, pad, 64 );
sha256_16x32_update( &ctx->octx, pad, 64 );
}

void
@@ -362,7 +362,7 @@ hmac_sha256_16way_update( hmac_sha256_16way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_16way_update( &ctx->ictx, in, len );
sha256_16x32_update( &ctx->ictx, in, len );
}

/* Finish an HMAC-SHA256 operation. */
@@ -372,13 +372,13 @@ hmac_sha256_16way_close( hmac_sha256_16way_context *ctx, void *digest )
unsigned char ihash[32*16] __attribute__ ((aligned (128)));

/* Finish the inner SHA256 operation. */
sha256_16way_close( &ctx->ictx, ihash );
sha256_16x32_close( &ctx->ictx, ihash );

/* Feed the inner hash to the outer SHA256 operation. */
sha256_16way_update( &ctx->octx, ihash, 32 );
sha256_16x32_update( &ctx->octx, ihash, 32 );

/* Finish the outer SHA256 operation. */
sha256_16way_close( &ctx->octx, digest );
sha256_16x32_close( &ctx->octx, digest );
}

/**

@@ -1,6 +1,6 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* Copyright 2020 JayDDee@gmailcom
* Copyright 2020 JayDDee246@gmailcom
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -38,11 +38,12 @@
#include "simd-utils.h"
#include "sha256-hash.h"

#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)

typedef struct _hmac_sha256_4way_context
{
sha256_4way_context ictx;
sha256_4way_context octx;
sha256_4x32_context ictx;
sha256_4x32_context octx;
} hmac_sha256_4way_context;

//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -67,8 +68,8 @@ void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t,

typedef struct _hmac_sha256_8way_context
{
sha256_8way_context ictx;
sha256_8way_context octx;
sha256_8x32_context ictx;
sha256_8x32_context octx;
} hmac_sha256_8way_context;

//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -88,8 +89,8 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,

typedef struct _hmac_sha256_16way_context
{
sha256_16way_context ictx;
sha256_16way_context octx;
sha256_16x32_context ictx;
sha256_16x32_context octx;
} hmac_sha256_16way_context;

//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );

@@ -30,6 +30,7 @@ static const uint32_t K256[64] =
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};

#if defined(__SSE2__) || defined(__ARM_NEON)
// SHA-256 4 way SSE2

#define CHs(X, Y, Z) \
@@ -309,142 +310,6 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
v128_store( state_out + 7, H );
}


# if 0

// Working correctly but still slower
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target )
{
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
v128_t W[16]; v128_memcpy( W, data, 16 );

A = v128_load( state_in );
B = v128_load( state_in+1 );
C = v128_load( state_in+2 );
D = v128_load( state_in+3 );
E = v128_load( state_in+4 );
F = v128_load( state_in+5 );
G = v128_load( state_in+6 );
H = v128_load( state_in+7 );

const v128_t IV7 = H;
const v128_t IV6 = G;

SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );

W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] );

v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );

SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 3, 48 );
SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, 48 );
SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 9, 48 );

T0 = v128_add32( v128_32( K256[58] ),
v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = v128_add32( B, T0 );

T1 = v128_add32( v128_32( K256[59] ),
v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = v128_add32( A, T1 );

T2 = v128_add32( v128_32( K256[60] ),
v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = v128_add32( H, T2 );

targ = v128_32( target[7] );
hash = v128_bswap32( v128_add32( H, IV7 ) );

flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );

if ( likely(
0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) ))
return 0;

t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) );

// round 58 part 2
F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );

// round 61 part 1
W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = v128_add32( v128_32( K256[61] ),
v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = v128_add32( G, T0 );

if ( t6_mask )
{
targ = v128_and( vmask, v128_32( target[6] ) );
hash = v128_bswap32( v128_add32( G, IV6 ) );

if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) ))
return 0;
else
{
flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( 0 != ( t6_mask & ( flip ^ v128_movmask32(
v128_cmpgt32( hash, targ ) ) ) ) )
return 0;
else if ( target[6] == 0x80000000 )
{
if ( 0 == ( t6_mask & v128_movmask32(
v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) )
return 0;
}
}
}

// rounds 59 to 61 part 2
E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) );
D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) );
C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );

// rounds 62 & 63
W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );

SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 48 );

state_out[0] = v128_add32( state_in[0], A );
state_out[1] = v128_add32( state_in[1], B );
state_out[2] = v128_add32( state_in[2], C );
state_out[3] = v128_add32( state_in[3], D );
state_out[4] = v128_add32( state_in[4], E );
state_out[5] = v128_add32( state_in[5], F );
state_out[6] = v128_add32( state_in[6], G );
state_out[7] = v128_add32( state_in[7], H );
return 1;
}

#endif

void sha256_4x32_init( sha256_4x32_context *sc )
{
sc->count_high = sc->count_low = 0;
@@ -529,28 +394,30 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
sha256_4x32_close( &ctx, dst );
}

#endif // SSE2 || NEON

#if defined(__AVX2__)

// SHA-256 8 way

#define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \
mm256_ror_32( x, 13 ) ), \
mm256_xor3( mm256_ror_32( x, 2 ), \
mm256_ror_32( x, 13 ), \
mm256_ror_32( x, 22 ) )

#define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \
mm256_ror_32( x, 11 ) ), \
mm256_xor3( mm256_ror_32( x, 6 ), \
mm256_ror_32( x, 11 ), \
mm256_ror_32( x, 25 ) )

#define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \
mm256_ror_32( x, 18 ) ), \
mm256_xor3( mm256_ror_32( x, 7 ), \
mm256_ror_32( x, 18 ), \
_mm256_srli_epi32( x, 3 ) )

#define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \
mm256_ror_32( x, 19 ) ), \
mm256_xor3( mm256_ror_32( x, 17 ), \
mm256_ror_32( x, 19 ), \
_mm256_srli_epi32( x, 10 ) )

#define SHA256_8WAY_MEXP( a, b, c, d ) \
@@ -574,62 +441,6 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );


// With AVX512VL ternary logic optimizations are available.
// If not, optimize by forwarding the result of X^Y in MAJ to the next round
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
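
// Editor's illustration of the forwarding described above (a scalar
// sketch, not part of this commit): MAJ(X,Y,Z) = Y ^ ((X^Y) & (Y^Z)),
// and because the working variables rotate one position per round, this
// round's X^Y is exactly the next round's Y^Z and can be carried forward
// instead of recomputed.
static inline uint32_t maj_fwd_sketch( uint32_t x, uint32_t y,
                                       uint32_t *x_xor_y, uint32_t y_xor_z )
{
   *x_xor_y = x ^ y;                    // carried into the next round
   return y ^ ( *x_xor_y & y_xor_z );   // == (x&y) ^ (x&z) ^ (y&z)
}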

#if defined(VL256)

#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )

#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )

#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
W[ i ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)

#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, j ); \
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, j ); \
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, j ); \
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, j ); \
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, j ); \
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, j ); \
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, j ); \
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, j ); \
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, j ); \
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, j ); \
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, j ); \
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, j ); \
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j );

// Not used with AVX512, needed to satisfy the compiler
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
}

#else // AVX2

#define CHx(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )

@@ -641,61 +452,58 @@ do { \

#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
H = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
__m256i T = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
D = _mm256_add_epi32( D, H ); \
H = _mm256_add_epi32( H, T ); \
}

#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
__m256i T1 = BSG2_1x( E ); \
{ \
__m256i T1 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
H = _mm256_add_epi32( H, BSG2_1x( E ) ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
H = _mm256_add_epi32( H, T1 ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
D = _mm256_add_epi32( D, H ); \
H = _mm256_add_epi32( H, T2 ); \
}

// read Y_xor_Z, update X_xor_Y
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )

// start with toc initialized to y^z: toc = B ^ C
// start with toc initialized to y^z, toc = B ^ C for the first round.
// First round reads toc as Y_xor_Z and saves X_xor_Y as tic.
// Second round reads tic as Y_xor_Z and saves X_xor_Y as toc.

#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
do { \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
{ \
__m256i T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
W[ i0 ] ); \
__m256i T1 = BSG2_1x( E ); \
H = _mm256_add_epi32( H, BSG2_1x( E ) ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \
T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
H = _mm256_add_epi32( H, T1 ); \
D = _mm256_add_epi32( D, H ); \
H = _mm256_add_epi32( H, T2 ); \
\
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
W[ (i1) ] ); \
T1 = BSG2_1x( D ); \
G = _mm256_add_epi32( G, BSG2_1x( D ) ); \
T2 = BSG2_0x( H ); \
T0 = _mm256_add_epi32( T0, CHx( D, E, F ) ); \
T1 = _mm256_add_epi32( T1, G ); \
T1 = _mm256_add_epi32( T1, CHx( D, E, F ) ); \
T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
C = _mm256_add_epi32( C, T1 ); \
G = _mm256_add_epi32( T1, T2 ); \
} while (0)
G = _mm256_add_epi32( G, T1 ); \
C = _mm256_add_epi32( C, G ); \
G = _mm256_add_epi32( G, T2 ); \
}

#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
{ \
@@ -710,8 +518,6 @@ do { \
SHA256_8WAY_2ROUNDS( C, D, E, F, G, H, A, B, 14, 15, j ); \
}

#endif // AVX512VL else AVX2

static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
const __m256i *in ) \
{
@@ -745,7 +551,7 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
}

// accepts LE input data
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8x32_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i W[16];
@@ -754,7 +560,7 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
}

// Accepts BE input data, need to bswap
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
void sha256_8x32_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i W[16];
@@ -764,7 +570,7 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
}

// Aggressive prehashing, LE byte order
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H, T1;
@@ -788,9 +594,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );

#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif

SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
@@ -813,7 +617,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
_mm256_store_si256( state_mid + 7, H );
}

void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X )
{
__m256i A, B, C, D, E, F, G, H;
@@ -830,9 +634,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );

#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif

// round 3 part 2, add nonces
A = _mm256_add_epi32( A, W[3] );
@@ -914,15 +716,13 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
_mm256_store_si256( state_out + 7, H );
}

int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
__m256i A, B, C, D, E, F, G, H, G57, H56;
__m256i vmask, targ, hash;
__m256i W[16]; memcpy_256( W, data, 16 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
uint8_t flip, t6_mask;
uint8_t flip, t6_mask, t7_mask;

A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in+1 );
@@ -933,12 +733,10 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_in+6 );
H = _mm256_load_si256( state_in+7 );

const __m256i IV7 = H;
const __m256i IV6 = G;
const __m256i istate6 = G;
const __m256i istate7 = H;

#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif

// rounds 0 to 16, ignore zero padding W[9..14]
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
@@ -981,11 +779,9 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );

#if !defined(VL256)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif

// rounds 48 to 57
// Rounds 48 to 55
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
@@ -994,77 +790,83 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );

// round 58 to 60 part 1
T0 = _mm256_add_epi32( v256_32( K256[58] ),
// Round 56
H = _mm256_add_epi32( v256_32( K256[56] ),
mm256_add4_32( BSG2_1x( E ), CHx( E, F, G ), W[ 8], H ) );
D = _mm256_add_epi32( D, H );
H56 = _mm256_add_epi32( H, _mm256_add_epi32( BSG2_0x( A ),
MAJx( A, B, C ) ) );
Y_xor_Z = X_xor_Y;

// Rounds 57 to 60 part 1
G = _mm256_add_epi32( v256_32( K256[57] ),
mm256_add4_32( BSG2_1x( D ), CHx( D, E, F ), W[ 9], G ) );
C = _mm256_add_epi32( C, G );
G57 = _mm256_add_epi32( G, MAJx( H56, A, B ) );

F = _mm256_add_epi32( v256_32( K256[58] ),
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
B = _mm256_add_epi32( B, T0 );
B = _mm256_add_epi32( B, F );

T1 = _mm256_add_epi32( v256_32( K256[59] ),
E = _mm256_add_epi32( v256_32( K256[59] ),
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
A = _mm256_add_epi32( A, T1 );
A = _mm256_add_epi32( A, E );

T2 = _mm256_add_epi32( v256_32( K256[60] ),
D = _mm256_add_epi32( v256_32( K256[60] ),
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
H = _mm256_add_epi32( H, T2 );
H = _mm256_add_epi32( H56, D );

// Got H, test it.
hash = mm256_bswap_32( _mm256_add_epi32( H, istate7 ) );
targ = v256_32( target[7] );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
if ( target[7] )
{
// A simple unsigned LE test is complicated by the lack of a cmple
// instruction, and lack of unsigned compares in AVX2.
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
if ( likely( 0xff == ( t7_mask = ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) )))
return 0;
}
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );

// round 58 part 2
F = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( G ),
MAJx( G, H, A ) ) );
// round 61 part 1
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm256_add_epi32( v256_32( K256[61] ),
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
G = _mm256_add_epi32( G, T0 );
// Round 57 part 2
G57 = _mm256_add_epi32( G57, BSG2_0x( H56 ) );
Y_xor_Z = X_xor_Y;

if ( t6_mask )
// Round 61 part 1
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
C = _mm256_add_epi32( v256_32( K256[61] ),
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
G = _mm256_add_epi32( G57, C );

if ( t6_mask == (0xff & ~t7_mask ) )
{
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );

if ( likely( 0 == ( t6_mask & mm256_movmask_32(
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
{
hash = mm256_bswap_32( _mm256_add_epi32( G, istate6 ) );
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0 != ( t6_mask & ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
return 0;
if ( likely( ( target[6] == 0x80000000 )
&& ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
hash, _mm256_xor_si256( hash, hash ) ) ) ) ) ))
return 0;
}
// else inconclusive, testing targ5 isn't practical, finish hashing
}

// At this point either the hash will be good or the test was inconclusive.
// If the latter it's probably a high target difficulty with a nearly equal
// high difficulty hash that has a good chance of being good.
// Rounds 58 to 61 part 2
F = _mm256_add_epi32( F, _mm256_add_epi32( BSG2_0x( G57 ),
MAJx( G57, H, A ) ) );
Y_xor_Z = X_xor_Y;

// rounds 59 to 61 part 2
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ),
MAJx( F, G, H ) ) );
D = _mm256_add_epi32( T2, _mm256_add_epi32( BSG2_0x( E ),
MAJx( E, F, G ) ) );
C = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( D ),
E = _mm256_add_epi32( E, _mm256_add_epi32( BSG2_0x( F ),
MAJx( F, G57, H ) ) );
Y_xor_Z = X_xor_Y;

D = _mm256_add_epi32( D, _mm256_add_epi32( BSG2_0x( E ),
MAJx( E, F, G57 ) ) );
Y_xor_Z = X_xor_Y;

C = _mm256_add_epi32( C, _mm256_add_epi32( BSG2_0x( D ),
MAJx( D, E, F ) ) );
Y_xor_Z = X_xor_Y;

// rounds 62 & 63
// Rounds 62 & 63
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );

@@ -1083,8 +885,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
return 1;
}
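
// Editor's illustration, not part of this commit: the comment inside the
// function above notes that AVX2 has no unsigned 32 bit compare. The
// generic workaround is to flip the sign bits of both operands, turning
// an unsigned compare into the signed compare AVX2 does provide:
// (a >u b) == ((a ^ 0x80000000) >s (b ^ 0x80000000)).
static inline __m256i cmpgt_epu32_sketch( const __m256i a, const __m256i b )
{
   const __m256i signbit = _mm256_set1_epi32( 0x80000000 );
   return _mm256_cmpgt_epi32( _mm256_xor_si256( a, signbit ),
                              _mm256_xor_si256( b, signbit ) );
}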


void sha256_8way_init( sha256_8way_context *sc )
void sha256_8x32_init( sha256_8x32_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = v256_32( sha256_iv[0] );
@@ -1100,7 +901,7 @@ void sha256_8way_init( sha256_8way_context *sc )
// need to handle odd byte length for yespower.
// Assume only last update is odd.

void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
void sha256_8x32_update( sha256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
@@ -1121,7 +922,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
len -= clen;
if ( ptr == buf_size )
{
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -1132,7 +933,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
}
}

void sha256_8way_close( sha256_8way_context *sc, void *dst )
void sha256_8x32_close( sha256_8x32_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
@@ -1146,7 +947,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
memset_zero_256( sc->buf, pad >> 2 );
}
else
@@ -1159,17 +960,17 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );

sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );

mm256_block_bswap_32( dst, sc->val );
}

void sha256_8way_full( void *dst, const void *data, size_t len )
void sha256_8x32_full( void *dst, const void *data, size_t len )
{
sha256_8way_context ctx;
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, data, len );
sha256_8way_close( &ctx, dst );
sha256_8x32_context ctx;
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, data, len );
sha256_8x32_close( &ctx, dst );
}

#if defined(SIMD512)
@@ -1218,40 +1019,26 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );

#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
__m512i T1 = BSG2_1x16( E ); \
{ \
__m512i T1 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
H = _mm512_add_epi32( H, BSG2_1x16( E ) ); \
__m512i T2 = BSG2_0x16( A ); \
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
T1 = _mm512_add_epi32( T1, H ); \
T1 = _mm512_add_epi32( T1, CHx16( E, F, G ) ); \
T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \
T1 = _mm512_add_epi32( T1, T0 ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
H = _mm512_add_epi32( H, T1 ); \
D = _mm512_add_epi32( D, H ); \
H = _mm512_add_epi32( H, T2 ); \
}

#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
H = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
v512_32( K256[(i)+(j)] ) ); \
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
__m512i T = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, H ); \
H = _mm512_add_epi32( H, T ); \
}

/*
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
do { \
__m512i T1, T2; \
__m512i K = v512_32( K256[( (j)+(i) )] ); \
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
*/

#define SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
@@ -1302,7 +1089,7 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W,
}

// accepts LE input data
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
void sha256_16x32_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i W[16];
@@ -1311,7 +1098,7 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
}

// Accepts BE input data, need to bswap
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
void sha256_16x32_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i W[16];
@@ -1321,7 +1108,7 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
}

// Aggressive prehashing, LE byte order
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
void sha256_16x32_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H, T1;
@@ -1369,7 +1156,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
_mm512_store_si512( state_mid + 7, H );
}

void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
{
__m512i A, B, C, D, E, F, G, H;
@@ -1470,15 +1257,12 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,

// returns 0 if hash aborted early and invalid,
// returns 1 for completed hash with at least one valid candidate.
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target )
{
__m512i A, B, C, D, E, F, G, H, hash, targ;
__m512i T0, T1, T2;
__m512i A, B, C, D, E, F, G, H, hash, targ, G57, H56;
__m512i W[16]; memcpy_512( W, data, 16 );
__mmask16 t6_mask;
const __m512i bswap_shuf = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__mmask16 mask;

A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in+1 );
@@ -1489,8 +1273,8 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
G = _mm512_load_si512( state_in+6 );
H = _mm512_load_si512( state_in+7 );

const __m512i IV6 = G;
const __m512i IV7 = H;
const __m512i istate6 = G;
const __m512i istate7 = H;

// rounds 0 to 8
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
@@ -1562,7 +1346,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
W[11] = SHA256_16WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_16WAY_MEXP( W[10], W[ 5], W[13], W[12] );

// Rounds 48 to 57
// Rounds 48 to 55
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
@@ -1571,62 +1355,67 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
SHA256_16WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_16WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );

// rounds 58 to 60 part 1
T0 = _mm512_add_epi32( v512_32( K256[58] ),
// Round 56
H = _mm512_add_epi32( v512_32( K256[56] ),
mm512_add4_32( BSG2_1x16( E ), CHx16( E, F, G ), W[ 8], H ) );
D = _mm512_add_epi32( D, H );
H56 = _mm512_add_epi32( H, _mm512_add_epi32( BSG2_0x16( A ),
MAJx16( A, B, C ) ) );

// Rounds 57 to 60 part 1
G = _mm512_add_epi32( v512_32( K256[57] ),
mm512_add4_32( BSG2_1x16( D ), CHx16( D, E, F ), W[ 9], G ) );
C = _mm512_add_epi32( C, G );
G57 = _mm512_add_epi32( G, MAJx16( H56, A, B ) );

F = _mm512_add_epi32( v512_32( K256[58] ),
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
B = _mm512_add_epi32( B, T0 );
B = _mm512_add_epi32( B, F );

T1 = _mm512_add_epi32( v512_32( K256[59] ),
E = _mm512_add_epi32( v512_32( K256[59] ),
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
A = _mm512_add_epi32( A, T1 );
A = _mm512_add_epi32( A, E );

T2 = _mm512_add_epi32( v512_32( K256[60] ),
D = _mm512_add_epi32( v512_32( K256[60] ),
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
H = _mm512_add_epi32( H, T2 );
H = _mm512_add_epi32( H56, D );

// got H, test it against target[7]
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
// got final H, test it against target[7]
hash = mm512_bswap_32( _mm512_add_epi32( H , istate7 ) );
targ = v512_32( target[7] );
if ( target[7] )
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
if ( likely( 0 == ( mask = _mm512_cmple_epu32_mask( hash, targ ) ) ))
return 0;
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );

// round 58 part 2
F = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( G ),
MAJx16( G, H, A ) ) );
// Round 57 part 2
G57 = _mm512_add_epi32( G57, BSG2_0x16( H56 ) );

// round 61 part 1
// Round 61 part 1
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm512_add_epi32( v512_32( K256[61] ),
C = _mm512_add_epi32( v512_32( K256[61] ),
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
G = _mm512_add_epi32( G, T0 );
G = _mm512_add_epi32( G57, C );

// got G, test it against target[6] if indicated
if ( (uint16_t)t6_mask )
// got final G, test it against target[6] if indicated.
if ( mask == _mm512_cmpeq_epi32_mask( hash, targ ) )
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
hash = mm512_bswap_32( _mm512_add_epi32( G, istate6 ) );
targ = v512_32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( mask, hash, targ ) ))
return 0;
}

// round 59 part 2
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16( F ),
MAJx16( F, G, H ) ) );

// round 60 part 2
D = _mm512_add_epi32( T2, _mm512_add_epi32( BSG2_0x16( E ),
MAJx16( E, F, G ) ) );

// round 61 part 2
C = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( D ),
// Rounds 58 to 61 part 2
F = _mm512_add_epi32( F, _mm512_add_epi32( BSG2_0x16( G57 ),
MAJx16( G57, H, A ) ) );
E = _mm512_add_epi32( E, _mm512_add_epi32( BSG2_0x16( F ),
MAJx16( F, G57, H ) ) );
D = _mm512_add_epi32( D, _mm512_add_epi32( BSG2_0x16( E ),
MAJx16( E, F, G57 ) ) );
C = _mm512_add_epi32( C, _mm512_add_epi32( BSG2_0x16( D ),
MAJx16( D, E, F ) ) );

// rounds 62, 63
// Rounds 62, 63
W[14] = SHA256_16WAY_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );

@@ -1644,7 +1433,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
return 1;
}
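
// Editor's usage sketch, not part of this commit: typical caller shape for
// the short transform in a scanhash loop. The helper names shown
// (extr_lane_16x32, valid_hash, submit_solution) follow patterns used
// elsewhere in this codebase and are assumptions here, not an exact API.
static inline void scan_lanes_sketch( const uint32_t *hash32,
                        const uint32_t *ptarget, struct work *work,
                        struct thr_info *mythr )
{
   uint32_t lane_hash[8];
   for ( int lane = 0; lane < 16; lane++ )
   {
      extr_lane_16x32( lane_hash, hash32, lane, 256 ); // deinterleave lane
      if ( valid_hash( lane_hash, ptarget ) )          // full 256 bit test
         submit_solution( work, lane_hash, mythr );
   }
}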

void sha256_16way_init( sha256_16way_context *sc )
void sha256_16x32_init( sha256_16x32_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = v512_32( sha256_iv[0] );
@@ -1657,7 +1446,7 @@ void sha256_16way_init( sha256_16way_context *sc )
sc->val[7] = v512_32( sha256_iv[7] );
}

void sha256_16way_update( sha256_16way_context *sc, const void *data,
void sha256_16x32_update( sha256_16x32_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -1679,7 +1468,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
len -= clen;
if ( ptr == buf_size )
{
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -1690,7 +1479,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
}
}

void sha256_16way_close( sha256_16way_context *sc, void *dst )
void sha256_16x32_close( sha256_16x32_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
@@ -1704,7 +1493,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
memset_zero_512( sc->buf, pad >> 2 );
}
else
@@ -1717,17 +1506,17 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );

sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );

mm512_block_bswap_32( dst, sc->val );
}

void sha256_16way_full( void *dst, const void *data, size_t len )
void sha256_16x32_full( void *dst, const void *data, size_t len )
{
sha256_16way_context ctx;
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, data, len );
sha256_16way_close( &ctx, dst );
sha256_16x32_context ctx;
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, data, len );
sha256_16x32_close( &ctx, dst );
}

#undef CH

@@ -180,20 +180,9 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );

// Temporary API during naming transition
#define sha256_8way_context sha256_8x32_context
#define sha256_8way_init sha256_8x32_init
#define sha256_8way_update sha256_8x32_update
#define sha256_8way_close sha256_8x32_close
#define sha256_8way_full sha256_8x32_full
#define sha256_8way_transform_le sha256_8x32_transform_le
#define sha256_8way_transform_be sha256_8x32_transform_be
#define sha256_8way_prehash_3rounds sha256_8x32_prehash_3rounds
#define sha256_8way_final_rounds sha256_8x32_final_rounds
#define sha256_8way_transform_le_short sha256_8x32_transform_le_short

#endif // AVX2

#if defined(__SSE2__) || defined(__ARM_NEON)
// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON

typedef struct
@@ -219,16 +208,5 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target );

// Temporary API during naming transition
#define sha256_4way_context sha256_4x32_context
#define sha256_4way_init sha256_4x32_init
#define sha256_4way_update sha256_4x32_update
#define sha256_4way_close sha256_4x32_close
#define sha256_4way_full sha256_4x32_full
#define sha256_4way_transform_le sha256_4x32_transform_le
#define sha256_4way_transform_be sha256_4x32_transform_be
#define sha256_4way_prehash_3rounds sha256_4x32_prehash_3rounds
#define sha256_4way_final_rounds sha256_4x32_final_rounds
#define sha256_4way_transform_le_short sha256_4x32_transform_le_short

#endif
#endif // SSE2 || NEON
#endif // SHA256_HASH_H__
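The "Temporary API during naming transition" macros let existing call sites keep the old *_way spellings while the implementation moves to the *_x32 names. A hedged sketch of a pre-rename caller that compiles unchanged through the transition (buffer sizing elided, header include assumed):

#include <stddef.h>

// The aliases above expand each old name to its 8x32 equivalent,
// so this caller needs no source changes after the rename.
void hash_8lanes( void *vhash, const void *vdata, size_t len )
{
   sha256_8way_context ctx;              // -> sha256_8x32_context
   sha256_8way_init( &ctx );             // -> sha256_8x32_init
   sha256_8way_update( &ctx, vdata, len );
   sha256_8way_close( &ctx, vhash );
}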

@@ -32,8 +32,6 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );

// hash first 64 byte block of data
sha256_transform_le( mstatea, pdata, sha256_iv );
@@ -69,10 +67,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,

if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) =
_mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 );
casti_v128( hasha, 1 ) =
_mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -81,10 +77,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) =
_mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 );
casti_v128( hashb, 1 ) =
_mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
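Both branches above share the same cheap filter: one byte-swapped word comparison rejects almost every candidate before the eight-word check. A scalar model of that ordering (using the GCC __builtin_bswap32; candidate() is an illustrative name, not the miner's API):

#include <stdint.h>
#include <stdbool.h>

static bool candidate( const uint32_t hash[8], const uint32_t targ[8] )
{
   // hash[7] is the most significant word once byte-swapped, so a single
   // compare filters out nearly all nonces before the full test.
   if ( __builtin_bswap32( hash[7] ) > targ[7] ) return false;
   for ( int i = 7; i >= 0; i-- )          // full big-endian comparison
   {
      const uint32_t h = __builtin_bswap32( hash[i] );
      if ( h < targ[i] ) return true;
      if ( h > targ[i] ) return false;
   }
   return true;                            // equal hash still meets target
}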
@@ -204,8 +198,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256_iv );
@@ -231,7 +223,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
buf[15] = v512_32( 80*8 ); // bit count

// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );

// vectorize IV for second hash
istate[0] = v512_32( sha256_iv[0] );
@@ -250,15 +242,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,

do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16way_transform_le_short(
sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16x32_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
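The prehash comment above is the point of the prehash_3rounds/final_rounds pairing: three of SHA-256's 64 rounds (plus part of the message expansion) depend only on the nonce-free words, so they run once per job instead of once per nonce batch. A back-of-envelope check of the saving:

#include <stdio.h>

int main( void )
{
   // 3 of 64 compression rounds are hoisted out of the per-nonce loop;
   // the pre-expanded message schedule saving comes on top of this.
   printf( "round work saved per nonce: %.1f%%\n", 100.0 * 3.0 / 64.0 );
   return 0;
}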
@@ -299,8 +290,6 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -325,22 +314,22 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );

sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8x32_transform_le( mstate1, vdata, istate );

// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );

do
{
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8x32_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
casti_m256i( lane_hash, 0 ) = mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;

@@ -12,7 +12,7 @@
#define SHA256D_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA256D_4WAY 1
#endif


@@ -17,7 +17,6 @@
#elif defined (__SSE2__) || defined(__ARM_NEON)
#define SHA256DT_4X32 1
#endif
// else ref, should never happen

static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
{
@@ -205,8 +204,6 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256dt_iv );
@@ -258,8 +255,7 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
for ( int lane = 0; lane < 16; lane++ )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -298,8 +294,6 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -339,7 +333,7 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
casti_m256i( lane_hash, 0 ) = mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -406,7 +400,6 @@ int scanhash_sha256dt_4x32( struct work *work, const uint32_t max_nonce,
do
{
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
// sha256_4x32_transform_le( block, vdata+16, mhash1 );
sha256_4x32_transform_le( hash32, block, iv );

for ( int lane = 0; lane < 4; lane++ )

@@ -7,28 +7,28 @@

#if defined(SHA256T_16WAY)

static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
static __thread sha256_16x32_context sha256_ctx16 __attribute__ ((aligned (64)));

void sha256q_16way_hash( void* output, const void* input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (64)));
sha256_16way_context ctx;
sha256_16x32_context ctx;
memcpy( &ctx, &sha256_ctx16, sizeof ctx );

sha256_16way_update( &ctx, input + (64<<4), 16 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_update( &ctx, input + (64<<4), 16 );
sha256_16x32_close( &ctx, vhash );

sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, vhash );

sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, vhash );

sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, output );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, output );
}
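sha256q is four chained SHA-256 passes: the 80-byte header once, then the 32-byte digest three more times, which is what the update/close sequence above does sixteen lanes at a time. A scalar model, assuming a one-shot sha256( out, in, len ) helper (an assumption for illustration, not the vector API above):

#include <stddef.h>
#include <stdint.h>

extern void sha256( uint8_t *out, const uint8_t *in, size_t len ); // assumed

static void sha256q_scalar( uint8_t out[32], const uint8_t header[80] )
{
   uint8_t h[32];
   sha256( h, header, 80 );   // pass 1: full 80-byte block header
   sha256( h, h, 32 );        // pass 2: rehash the digest
   sha256( h, h, 32 );        // pass 3
   sha256( out, h, 32 );      // pass 4
}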

int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
@@ -51,8 +51,8 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
sha256_16way_init( &sha256_ctx16 );
sha256_16way_update( &sha256_ctx16, vdata, 64 );
sha256_16x32_init( &sha256_ctx16 );
sha256_16x32_update( &sha256_ctx16, vdata, 64 );

do
{
@@ -80,28 +80,28 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,

#if defined(SHA256T_8WAY)

static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
static __thread sha256_8x32_context sha256_ctx8 __attribute__ ((aligned (64)));

void sha256q_8way_hash( void* output, const void* input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
sha256_8way_context ctx;
sha256_8x32_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );

sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_update( &ctx, input + (64<<3), 16 );
sha256_8x32_close( &ctx, vhash );

sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, vhash );

sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, vhash );

sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, output );
}

int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
@@ -123,8 +123,8 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,

mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
sha256_8way_init( &sha256_ctx8 );
sha256_8way_update( &sha256_ctx8, vdata, 64 );
sha256_8x32_init( &sha256_ctx8 );
sha256_8x32_update( &sha256_ctx8, vdata, 64 );

do
{
@@ -152,28 +152,28 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,

#if defined(SHA256T_4WAY)

static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
static __thread sha256_4x32_context sha256_ctx4 __attribute__ ((aligned (64)));

void sha256q_4way_hash( void* output, const void* input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx;
sha256_4x32_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );

sha256_4way_update( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_update( &ctx, input + (64<<2), 16 );
sha256_4x32_close( &ctx, vhash );

sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, vhash );

sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, vhash );

sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, output );
}

int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
@@ -205,8 +205,8 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
0 };

v128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
sha256_4x32_init( &sha256_ctx4 );
sha256_4x32_update( &sha256_ctx4, vdata, 64 );

for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{

@@ -35,8 +35,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256_iv );
@@ -62,7 +60,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
buf[15] = v512_32( 80*8 ); // bit count

// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );

// vectorize IV for 2nd & 3rd sha256
istate[0] = v512_32( sha256_iv[0] );
@@ -81,18 +79,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,

do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );

sha256_16way_transform_le( block, block, istate );
sha256_16x32_transform_le( block, block, istate );

if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
if ( sha256_16x32_transform_le_short( hash32, block, istate, ptarget ) )
{
for ( int lane = 0; lane < 16; lane++ )
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -301,8 +298,6 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -327,29 +322,29 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );

sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8x32_transform_le( mstate1, vdata, istate );

// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );

do
{
// 1. final 16 bytes of data, with padding
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2,
mexp_pre );

// 2. 32 byte hash from 1.
sha256_8way_transform_le( block, block, istate );
sha256_8x32_transform_le( block, block, istate );

// 3. 32 byte hash from 2.
if ( unlikely( sha256_8way_transform_le_short(
if ( unlikely( sha256_8x32_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
casti_m256i( lane_hash, 0 ) = mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -419,8 +414,8 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
do
{
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
sha256_4way_transform_le( block, block, iv );
sha256_4way_transform_le( hash32, block, iv );
sha256_4x32_transform_le( block, block, iv );
sha256_4x32_transform_le( hash32, block, iv );

for ( int lane = 0; lane < 4; lane++ )
{

@@ -67,7 +67,7 @@ static const uint64_t K512[80] =
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};

#if defined(__AVX2__) && defined(__SHA512__)
#if defined(__AVX__) && defined(__SHA512__)

// SHA-512 implemented using SHA512 CPU extension.

@@ -83,15 +83,13 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i MSG, TMP;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;

// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
@@ -103,7 +101,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,

// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
TMSG0 = mm256_bswap_64( TMSG0 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128 (MSG ) );
@@ -113,7 +111,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,

// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
TMSG1 = mm256_bswap_64( TMSG1 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -124,7 +122,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,

// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
TMSG2 = mm256_bswap_64( TMSG2 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -135,7 +133,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,

// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
TMSG3 = mm256_bswap_64( TMSG3 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -735,8 +733,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m512i shuff_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = v512_64( 0x80 );
@@ -750,10 +746,8 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
else
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );

sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
v512_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
v512_64( sc->count << 3 ), shuff_bswap64 );
sc->buf[ pad >> 3 ] = v512_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v512_64( bswap_64( sc->count << 3 ) );
sha512_8x64_round( sc, sc->buf, sc->val );

mm512_block_bswap_64( dst, sc->val );
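The two padded words encode SHA-512's 128-bit message length: sc->count holds bytes, so count << 3 is the low 64 bits of the bit count and count >> 61 is the carry into the high word. A quick scalar check of the arithmetic:

#include <stdint.h>
#include <stdio.h>

int main( void )
{
   const uint64_t count = 0x3000000000000005ULL; // arbitrary byte count
   const uint64_t bits_lo = count << 3;          // low 64 bits of count*8
   const uint64_t bits_hi = count >> 61;         // overflow above 2^64
   printf( "bit length = %llu * 2^64 + %llu\n",
           (unsigned long long)bits_hi, (unsigned long long)bits_lo );
   return 0;
}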
@@ -789,29 +783,6 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
mm256_ror_64( x, 61 ), \
_mm256_srli_epi64( x, 6 ) )

#if defined(VL256)
// 4 way is not used with AVX512 but will be with AVX10_256 when it
// becomes available.

#define CH( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xca )

#define MAJ( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xe8 )

#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
T1 = _mm256_add_epi64( T1, H ); \
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
T1 = _mm256_add_epi64( T1, T0 ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
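The immediates 0xca and 0xe8 are the 8-entry truth tables of CH and MAJ: VPTERNLOG uses bit (a<<2 | b<<1 | c) of the immediate as the output for inputs (a,b,c). A scalar self-check of both constants:

#include <assert.h>
#include <stdint.h>

static int ternlog( uint8_t imm, int a, int b, int c )
{
   return ( imm >> ( (a << 2) | (b << 1) | c ) ) & 1;  // VPTERNLOG semantics
}

int main( void )
{
   for ( int a = 0; a < 2; a++ )
   for ( int b = 0; b < 2; b++ )
   for ( int c = 0; c < 2; c++ )
   {
      assert( ternlog( 0xca, a, b, c ) == ( a ? b : c ) );               // CH
      assert( ternlog( 0xe8, a, b, c ) == ( (a & b) | (c & (a | b)) ) ); // MAJ
   }
   return 0;
}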

#else // AVX2 only

#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )

@@ -833,19 +804,12 @@ do { \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)

#endif // AVX512VL AVX10_256

static void
sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
{
int i;
register __m256i A, B, C, D, E, F, G, H;

#if !defined(VL256)
// Disable for AVX10_256
__m256i X_xor_Y, Y_xor_Z;
#endif

__m256i W[80];

mm256_block_bswap_64( W , in );
@@ -878,10 +842,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
H = v256_64( 0x5BE0CD19137E2179 );
}

#if !defined(VL256)
// Disable for AVX10_256
Y_xor_Z = _mm256_xor_si256( B, C );
#endif

for ( i = 0; i < 80; i += 8 )
{
@@ -957,8 +918,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m256i shuff_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = v256_64( 0x80 );
@@ -972,10 +931,8 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
else
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );

sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
v256_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
v256_64( sc->count << 3 ), shuff_bswap64 );
sc->buf[ pad >> 3 ] = v256_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v256_64( bswap_64( sc->count << 3 ) );
sha512_4x64_round( sc, sc->buf, sc->val );

mm256_block_bswap_64( dst, sc->val );
@@ -1138,8 +1095,8 @@ void sha512_2x64_close( sha512_2x64_context *sc, void *dst )
else
v128_memset_zero( sc->buf + (ptr>>3), (pad - ptr) >> 3 );

sc->buf[ pad >> 3 ] = v128_bswap64( v128_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
sc->buf[ pad >> 3 ] = v128_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_64( bswap_64( sc->count << 3 ) );
sha512_2x64_round( sc, sc->buf, sc->val );

v128_block_bswap64( castp_v128u64( dst ), sc->val );

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#include "sph_sha2.h"

#if defined(__SHA512__) && defined(__AVX2__)
#if defined(__SHA512__) && defined(__AVX__)

// Experimental, untested
// Need to substitute for sph_sha512
@@ -36,7 +36,6 @@ typedef struct
uint64_t count;
bool initialized;
} sha512_8x64_context __attribute__ ((aligned (128)));
#define sha512_8way_context sha512_8x64_context

void sha512_8x64_init( sha512_8x64_context *sc);
void sha512_8x64_update( sha512_8x64_context *sc, const void *data,
@@ -45,10 +44,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst );
void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
size_t len );

#define sha512_8way_init sha512_8x64_init
#define sha512_8way_update sha512_8x64_update
#define sha512_8way_close sha512_8x64_close

#endif // AVX512

#if defined (__AVX2__)
@@ -62,7 +57,6 @@ typedef struct
uint64_t count;
bool initialized;
} sha512_4x64_context __attribute__ ((aligned (64)));
#define sha512_4way_context sha512_4x64_context

void sha512_4x64_init( sha512_4x64_context *sc);
void sha512_4x64_update( sha512_4x64_context *sc, const void *data,
@@ -71,10 +65,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst );
void sha512_4x64_ctx( sha512_4x64_context *sc, void *dst, const void *data,
size_t len );

#define sha512_4way_init sha512_4x64_init
#define sha512_4way_update sha512_4x64_update
#define sha512_4way_close sha512_4x64_close

#endif // AVX2

typedef struct

@@ -14,7 +14,7 @@

#if defined(SHA512256D_8WAY)

static void sha512256d_8way_init( sha512_8way_context *ctx )
static void sha512256d_8x64_init( sha512_8x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
@@ -33,7 +33,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
{
uint64_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
sha512_8way_context ctx;
sha512_8x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*8]);
uint32_t *pdata = work->data;
@@ -53,13 +53,13 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do
{
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, vdata, 80 );
sha512_8way_close( &ctx, hash );
sha512256d_8x64_init( &ctx );
sha512_8x64_update( &ctx, vdata, 80 );
sha512_8x64_close( &ctx, hash );

sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, hash, 32 );
sha512_8way_close( &ctx, hash );
sha512256d_8x64_init( &ctx );
sha512_8x64_update( &ctx, hash, 32 );
sha512_8x64_close( &ctx, hash );

for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
@@ -82,7 +82,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,

#elif defined(SHA512256D_4WAY)

static void sha512256d_4way_init( sha512_4way_context *ctx )
static void sha512256d_4x64_init( sha512_4x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
@@ -101,7 +101,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
{
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
sha512_4way_context ctx;
sha512_4x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*4]);
uint32_t *pdata = work->data;
@@ -119,13 +119,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, vdata, 80 );
sha512_4way_close( &ctx, hash );
sha512256d_4x64_init( &ctx );
sha512_4x64_update( &ctx, vdata, 80 );
sha512_4x64_close( &ctx, hash );

sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, hash, 32 );
sha512_4way_close( &ctx, hash );
sha512256d_4x64_init( &ctx );
sha512_4x64_update( &ctx, hash, 32 );
sha512_4x64_close( &ctx, hash );

for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )

@@ -305,7 +305,7 @@ do { \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
xb0 = mm512_nxor( xa0, xb0 ); \
} while (0)

#define PERM_STEP_0_16 do { \
@@ -430,9 +430,9 @@ do { \
} while (0)

static void
shabal_16way_init( void *cc, unsigned size )
shabal_16x32_init( void *cc, unsigned size )
{
shabal_16way_context *sc = (shabal_16way_context*)cc;
shabal_16x32_context *sc = (shabal_16x32_context*)cc;

if ( size == 512 )
{ // copy immediate constants directly to working registers later.
@@ -494,9 +494,9 @@ shabal_16way_init( void *cc, unsigned size )
}

static void
shabal_16way_core( void *cc, const unsigned char *data, size_t len )
shabal_16x32_core( void *cc, const unsigned char *data, size_t len )
{
shabal_16way_context *sc = (shabal_16way_context*)cc;
shabal_16x32_context *sc = (shabal_16x32_context*)cc;
__m512i *buf;
__m512i *vdata = (__m512i*)data;
const int buf_size = 64;
@@ -544,10 +544,10 @@ shabal_16way_core( void *cc, const unsigned char *data, size_t len )
}

static void
shabal_16way_close( void *cc, unsigned ub, unsigned n, void *dst,
shabal_16x32_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_16way_context *sc = (shabal_16way_context*)cc;
shabal_16x32_context *sc = (shabal_16x32_context*)cc;
__m512i *buf;
const int buf_size = 64;
size_t ptr;
@@ -590,52 +590,39 @@ shabal_16way_close( void *cc, unsigned ub, unsigned n, void *dst,
}

void
shabal256_16way_init( void *cc )
shabal256_16x32_init( void *cc )
{
shabal_16way_init(cc, 256);
shabal_16x32_init(cc, 256);
}

void
shabal256_16way_update( void *cc, const void *data, size_t len )
shabal256_16x32_update( void *cc, const void *data, size_t len )
{
shabal_16way_core( cc, data, len );
shabal_16x32_core( cc, data, len );
}

void
shabal256_16way_close( void *cc, void *dst )
shabal256_16x32_close( void *cc, void *dst )
{
shabal_16way_close(cc, 0, 0, dst, 8);
shabal_16x32_close(cc, 0, 0, dst, 8);
}

void
shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
shabal512_16x32_init(void *cc)
{
shabal_16way_close(cc, ub, n, dst, 8);
shabal_16x32_init(cc, 512);
}

void
shabal512_16way_init(void *cc)
shabal512_16x32_update(void *cc, const void *data, size_t len)
{
shabal_16way_init(cc, 512);
shabal_16x32_core(cc, data, len);
}

void
shabal512_16way_update(void *cc, const void *data, size_t len)
shabal512_16x32_close(void *cc, void *dst)
{
shabal_16way_core(cc, data, len);
}

void
shabal512_16way_close(void *cc, void *dst)
{
shabal_16way_close(cc, 0, 0, dst, 16);
}

void
shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_16way_close(cc, ub, n, dst, 16);
shabal_16x32_close(cc, 0, 0, dst, 16);
}

#endif
@@ -911,7 +898,7 @@ do { \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
xb0 = mm256_nxor( xa0, xb0 ); \
} while (0)

#define PERM_STEP_0_8 do { \
@@ -1031,9 +1018,9 @@ do { \
} while (0)

static void
shabal_8way_init( void *cc, unsigned size )
shabal_8x32_init( void *cc, unsigned size )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
shabal_8x32_context *sc = (shabal_8x32_context*)cc;

if ( size == 512 )
{ // copy immediate constants directly to working registers later.
@@ -1095,9 +1082,9 @@ shabal_8way_init( void *cc, unsigned size )
}

static void
shabal_8way_core( void *cc, const unsigned char *data, size_t len )
shabal_8x32_core( void *cc, const unsigned char *data, size_t len )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
shabal_8x32_context *sc = (shabal_8x32_context*)cc;
__m256i *buf;
__m256i *vdata = (__m256i*)data;
const int buf_size = 64;
@@ -1146,10 +1133,10 @@ shabal_8way_core( void *cc, const unsigned char *data, size_t len )
}

static void
shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
shabal_8x32_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
shabal_8x32_context *sc = (shabal_8x32_context*)cc;
__m256i *buf;
const int buf_size = 64;
size_t ptr;
@@ -1192,52 +1179,39 @@ shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
}

void
shabal256_8way_init( void *cc )
shabal256_8x32_init( void *cc )
{
shabal_8way_init(cc, 256);
shabal_8x32_init(cc, 256);
}

void
shabal256_8way_update( void *cc, const void *data, size_t len )
shabal256_8x32_update( void *cc, const void *data, size_t len )
{
shabal_8way_core( cc, data, len );
shabal_8x32_core( cc, data, len );
}

void
shabal256_8way_close( void *cc, void *dst )
shabal256_8x32_close( void *cc, void *dst )
{
shabal_8way_close(cc, 0, 0, dst, 8);
shabal_8x32_close(cc, 0, 0, dst, 8);
}

void
shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
shabal512_8x32_init(void *cc)
{
shabal_8way_close(cc, ub, n, dst, 8);
shabal_8x32_init(cc, 512);
}

void
shabal512_8way_init(void *cc)
shabal512_8x32_update(void *cc, const void *data, size_t len)
{
shabal_8way_init(cc, 512);
shabal_8x32_core(cc, data, len);
}

void
shabal512_8way_update(void *cc, const void *data, size_t len)
shabal512_8x32_close(void *cc, void *dst)
{
shabal_8way_core(cc, data, len);
}

void
shabal512_8way_close(void *cc, void *dst)
{
shabal_8way_close(cc, 0, 0, dst, 16);
}

void
shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_8way_close(cc, ub, n, dst, 16);
shabal_8x32_close(cc, 0, 0, dst, 16);
}

#endif // AVX2
@@ -1674,9 +1648,9 @@ static const sph_u32 C_init_512[] = {
*/

static void
shabal_4way_init( void *cc, unsigned size )
shabal_4x32_init( void *cc, unsigned size )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
shabal_4x32_context *sc = (shabal_4x32_context*)cc;

if ( size == 512 )
{ // copy immediate constants directly to working registers later.
@@ -1786,9 +1760,9 @@ shabal_4way_init( void *cc, unsigned size )
}

static void
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
shabal_4x32_core( void *cc, const unsigned char *data, size_t len )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
shabal_4x32_context *sc = (shabal_4x32_context*)cc;
v128_t *buf;
v128_t *vdata = (v128_t*)data;
const int buf_size = 64;
@@ -1838,10 +1812,10 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
}

static void
shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
shabal_4x32_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
shabal_4x32_context *sc = (shabal_4x32_context*)cc;
v128_t *buf;
const int buf_size = 64;
size_t ptr;
@@ -1884,52 +1858,39 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
}

void
shabal256_4way_init( void *cc )
shabal256_4x32_init( void *cc )
{
shabal_4way_init(cc, 256);
shabal_4x32_init(cc, 256);
}

void
shabal256_4way_update( void *cc, const void *data, size_t len )
shabal256_4x32_update( void *cc, const void *data, size_t len )
{
shabal_4way_core( cc, data, len );
shabal_4x32_core( cc, data, len );
}

void
shabal256_4way_close( void *cc, void *dst )
shabal256_4x32_close( void *cc, void *dst )
{
shabal_4way_close(cc, 0, 0, dst, 8);
shabal_4x32_close(cc, 0, 0, dst, 8);
}

void
shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
shabal512_4x32_init(void *cc)
{
shabal_4way_close(cc, ub, n, dst, 8);
shabal_4x32_init(cc, 512);
}

void
shabal512_4way_init(void *cc)
shabal512_4x32_update(void *cc, const void *data, size_t len)
{
shabal_4way_init(cc, 512);
shabal_4x32_core(cc, data, len);
}

void
shabal512_4way_update(void *cc, const void *data, size_t len)
shabal512_4x32_close(void *cc, void *dst)
{
shabal_4way_core(cc, data, len);
}

void
shabal512_4way_close(void *cc, void *dst)
{
shabal_4way_close(cc, 0, 0, dst, 16);
}

void
shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_4way_close(cc, ub, n, dst, 16);
shabal_4x32_close(cc, 0, 0, dst, 16);
}

#endif

@@ -4,10 +4,6 @@
#include <stddef.h>
#include "simd-utils.h"

#define SPH_SIZE_shabal256 256

#define SPH_SIZE_shabal512 512

#if defined(SIMD512)

typedef struct {
@@ -16,22 +12,27 @@ typedef struct {
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_16way_context __attribute__ ((aligned (64)));
} shabal_16x32_context __attribute__ ((aligned (64)));

typedef shabal_16way_context shabal256_16way_context;
typedef shabal_16way_context shabal512_16way_context;
typedef shabal_16x32_context shabal256_16x32_context;
typedef shabal_16x32_context shabal512_16x32_context;

void shabal256_16way_init( void *cc );
void shabal256_16way_update( void *cc, const void *data, size_t len );
void shabal256_16way_close( void *cc, void *dst );
void shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal256_16x32_init( void *cc );
void shabal256_16x32_update( void *cc, const void *data, size_t len );
void shabal256_16x32_close( void *cc, void *dst );

void shabal512_16way_init( void *cc );
void shabal512_16way_update( void *cc, const void *data, size_t len );
void shabal512_16way_close( void *cc, void *dst );
void shabal512_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_16x32_init( void *cc );
void shabal512_16x32_update( void *cc, const void *data, size_t len );
void shabal512_16x32_close( void *cc, void *dst );

#define shabal256_16way_context shabal256_16x32_context
#define shabal256_16way_init shabal256_16x32_init
#define shabal256_16way_update shabal256_16x32_update
#define shabal256_16way_close shabal256_16x32_close
#define shabal512_16way_context shabal512_16x32_context
#define shabal512_16way_init shabal512_16x32_init
#define shabal512_16way_update shabal512_16x32_update
#define shabal512_16way_close shabal512_16x32_close

#endif

@@ -43,22 +44,27 @@ typedef struct {
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_8way_context __attribute__ ((aligned (64)));
} shabal_8x32_context __attribute__ ((aligned (64)));

typedef shabal_8way_context shabal256_8way_context;
typedef shabal_8way_context shabal512_8way_context;
typedef shabal_8x32_context shabal256_8x32_context;
typedef shabal_8x32_context shabal512_8x32_context;

void shabal256_8way_init( void *cc );
void shabal256_8way_update( void *cc, const void *data, size_t len );
void shabal256_8way_close( void *cc, void *dst );
void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal256_8x32_init( void *cc );
void shabal256_8x32_update( void *cc, const void *data, size_t len );
void shabal256_8x32_close( void *cc, void *dst );

void shabal512_8way_init( void *cc );
void shabal512_8way_update( void *cc, const void *data, size_t len );
void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_8x32_init( void *cc );
void shabal512_8x32_update( void *cc, const void *data, size_t len );
void shabal512_8x32_close( void *cc, void *dst );

#define shabal256_8way_context shabal256_8x32_context
#define shabal256_8way_init shabal256_8x32_init
#define shabal256_8way_update shabal256_8x32_update
#define shabal256_8way_close shabal256_8x32_close
#define shabal512_8way_context shabal512_8x32_context
#define shabal512_8way_init shabal512_8x32_init
#define shabal512_8way_update shabal512_8x32_update
#define shabal512_8way_close shabal512_8x32_close

#endif

@@ -70,59 +76,29 @@ typedef struct {
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_4way_context;
} shabal_4x32_context;

typedef shabal_4way_context shabal256_4way_context;
typedef shabal_4way_context shabal512_4way_context;
typedef shabal_4x32_context shabal256_4x32_context;
typedef shabal_4x32_context shabal512_4x32_context;

void shabal256_4way_init( void *cc );
void shabal256_4way_update( void *cc, const void *data, size_t len );
void shabal256_4way_close( void *cc, void *dst );
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal256_4x32_init( void *cc );
void shabal256_4x32_update( void *cc, const void *data, size_t len );
void shabal256_4x32_close( void *cc, void *dst );

void shabal512_4way_init( void *cc );
void shabal512_4way_update( void *cc, const void *data, size_t len );
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_4x32_init( void *cc );
void shabal512_4x32_update( void *cc, const void *data, size_t len );
void shabal512_4x32_close( void *cc, void *dst );

#define shabal256_4way_context shabal256_4x32_context
#define shabal256_4way_init shabal256_4x32_init
#define shabal256_4way_update shabal256_4x32_update
#define shabal256_4way_close shabal256_4x32_close
#define shabal512_4way_context shabal512_4x32_context
#define shabal512_4way_init shabal512_4x32_init
#define shabal512_4way_update shabal512_4x32_update
#define shabal512_4way_close shabal512_4x32_close

#endif

// SSE or NEON

/* No __mullo_pi32

typedef struct
{
v64_t buf[16] __attribute__ ((aligned (64)));
v64_t A[12], B[16], C[16];
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_2x32_context;

typedef shabal_2x32_context shabal256_2x32_context;
typedef shabal_2x32_context shabal512_2x32_context;

void shabal256_2x32_init( void *cc );
void shabal256_2x32_update( void *cc, const void *data, size_t len );
void shabal256_2x32_close( void *cc, void *dst );
void shabal256_2x32_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );

void shabal512_2x32_init( shabal512_2x32_context *cc );
void shabal512_2x32_update( shabal512_2x32_context *cc, const void *data,
size_t len );
void shabal512_2x32_close( shabal512_2x32_context *cc, void *dst );
void shabal512_2x32_addbits_and_close( shabal512_2x32_context *cc,
unsigned ub, unsigned n, void *dst );
void shabal512_2x32_ctx( shabal512_2x32_context *cc, void *dst,
const void *data, size_t len );
void shabal512_2x32( shabal512_2x32_context *dst, const void *data,
size_t len );

*/

#endif

@@ -21,7 +21,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
__m512i *H = (__m512i*)ctx->h;
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
ctx->count1, ctx->count0 );
int r;
const __m512i zero = _mm512_setzero_si512();

P0 = H[0];
P1 = H[1];
@@ -37,182 +37,160 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
K6 = M[6];
K7 = M[7];

X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
// round 0

P0 = _mm512_xor_si512( P0, X );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );

X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );

P2 = _mm512_xor_si512( P2, X );

// round
for ( r = 0; r < 3; r ++ )
for ( int r = 0; r < 3; r ++ )
{
// round 1, 5, 9

K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ) );
_mm512_aesenc_epi128( K0, zero ) ) );

if ( r == 0 )
K0 = _mm512_xor_si512( K0,
_mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
_mm512_mask_ternarylogic_epi32( count, 0x8888, count, count, 1 ) );

X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
K1 = _mm512_xor_si512( K0,
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, zero ) ) );

if ( r == 1 )
K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
_mm512_mask_ternarylogic_epi32( count, 0x1111, count, count, 1 ) ) );

X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K1,
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K2,
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

P3 = _mm512_xor_si512( P3, X );
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, zero ) ) );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );

K4 = _mm512_xor_si512( K3,
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
K5 = _mm512_xor_si512( K4,
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K5,
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K6,
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, zero ) ) );

if ( r == 2 )
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
_mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
_mm512_mask_ternarylogic_epi32( count, 0x2222, count, count, 1 ) ) );

X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );

// round 2, 6, 10

K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), zero );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

P2 = _mm512_xor_si512( P2, X );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P2 );

K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), zero );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

P0 = _mm512_xor_si512( P0, X );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P0 );

// round 3, 7, 11

K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
_mm512_aesenc_epi128( K0, zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), zero );
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
_mm512_aesenc_epi128( K1, zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
_mm512_aesenc_epi128( K2, zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

P1 = _mm512_xor_si512( P1, X );
_mm512_aesenc_epi128( K3, zero ) ), K2 );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P1 );

K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
_mm512_aesenc_epi128( K4, zero ) ), K3 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), zero );
|
||||
K5 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
_mm512_aesenc_epi128( K5, zero ) ), K4 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
|
||||
K6 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
_mm512_aesenc_epi128( K6, zero ) ), K5 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
|
||||
K7 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
P3 = _mm512_xor_si512( P3, X );
|
||||
_mm512_aesenc_epi128( K7, zero ) ), K6 );
|
||||
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P3 );
|
||||
|
||||
// round 4, 8, 12
|
||||
|
||||
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
|
||||
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
|
||||
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
|
||||
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P0 = _mm512_xor_si512( P0, X );
|
||||
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
|
||||
|
||||
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
|
||||
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
|
||||
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
|
||||
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
P2 = _mm512_xor_si512( P2, X );
|
||||
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
|
||||
}
|
||||
|
||||
// round 13
|
||||
|
||||
K0 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
|
||||
_mm512_aesenc_epi128( K0, zero ) ), K7 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
|
||||
K1 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
_mm512_aesenc_epi128( K1, zero ) ), K0 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
|
||||
K2 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
_mm512_aesenc_epi128( K2, zero ) ), K1 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
|
||||
K3 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P3 = _mm512_xor_si512( P3, X );
|
||||
_mm512_aesenc_epi128( K3, zero ) ), K2 );
|
||||
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
|
||||
|
||||
K4 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
|
||||
_mm512_aesenc_epi128( K4, zero ) ), K3 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
|
||||
K5 = _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
|
||||
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, mm512_swap64_32(
|
||||
_mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
_mm512_aesenc_epi128( K5, zero ) ), K4 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
|
||||
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) );
|
||||
K6 = mm512_xor3( K6, K5, mm512_swap64_32(
|
||||
_mm512_mask_ternarylogic_epi32( count, 0x4444, count, count, 1 ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
|
||||
K7= _mm512_xor_si512( mm512_shuflr128_32(
|
||||
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
P1 = _mm512_xor_si512( P1, X );
|
||||
_mm512_aesenc_epi128( K7, zero ) ), K6 );
|
||||
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
|
||||
|
||||
H[0] = _mm512_xor_si512( H[0], P2 );
|
||||
H[1] = _mm512_xor_si512( H[1], P3 );
|
||||
|
||||
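A note on the counter injection above: the diff swaps _mm512_mask_xor_epi32 with all-ones for _mm512_mask_ternarylogic_epi32 with immediate 1. Both complement only the mask-selected 32-bit lanes of count. A minimal scalar sketch (mine, not repo code) of why the two forms agree:

// Ternary logic with imm8 = 1 is ~(a | b | c): the truth-table bit 0 is the
// only one set, so the result is 1 exactly when a = b = c = 0. With all three
// operands bound to the same value it degenerates to plain bitwise NOT,
// i.e. the same thing XOR with all-ones computes on the masked lanes.
#include <stdint.h>
#include <assert.h>

static uint32_t ternlog_imm1( uint32_t a, uint32_t b, uint32_t c )
{
   return ~( a | b | c );
}

int main(void)
{
   uint32_t count = 0xdeadbeef;
   assert( ternlog_imm1( count, count, count ) == ( count ^ 0xffffffff ) );
   return 0;
}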
@@ -1,159 +0,0 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>

#include "sph_shavite.h"

extern void inkhash(void *state, const void *input)
{
sph_shavite512_context ctx_shavite;
uint32_t hash[16];

sph_shavite512_init(&ctx_shavite);
sph_shavite512 (&ctx_shavite, (const void*) input, 80);
sph_shavite512_close(&ctx_shavite, (void*) hash);

sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, (const void*) hash, 64);
sph_shavite512_close(&ctx_shavite, (void*) hash);

memcpy(state, hash, 32);

/*
int ii;
printf("result: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)state)[ii]);
};
printf ("\n");
*/
}

int scanhash_ink( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;

uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
//const uint32_t Htarg = ptarget[7];

uint32_t _ALIGN(32) hash64[8];
uint32_t endiandata[32];

//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};

//we need bigendian data...
//lessons learned: do NOT endian-change directly in pdata, or all proofs of work will be considered stale by minerd....
int kk=0;
for (; kk < 32; kk++)
{
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
};

// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }

/* I'm too lazy to put the loop in an inline function... so dirty copy'n'paste.... */
/* I know I could set a variable instead, but I don't know how the compiler would optimize it; the CPU might then need to load the value into a register every time. */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFFFF)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFFF0)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xFF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xFFF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFF000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);

}
else if (ptarget[7]<=0xFFFF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFF0000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);

}
else
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}

*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

bool register_shavite_algo( algo_gate_t* gate )
{
algo_not_implemented();
return false;

// gate->scanhash = (void*)&scanhash_ink;
// gate->hash = (void*)&inkhash;
// return true;
};
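The six copy-pasted loops in the deleted scanhash_ink differ only in the high-word mask used for the quick reject. A hypothetical consolidation (not in the repository; it reuses the identifiers of the deleted function above) would pick the mask once and run a single loop:

// Sketch only: derive the early-reject mask from ptarget[7], then one loop.
uint32_t mask;
if      (ptarget[7] == 0)      mask = 0xFFFFFFFF;
else if (ptarget[7] <= 0xF)    mask = 0xFFFFFFF0;
else if (ptarget[7] <= 0xFF)   mask = 0xFFFFFF00;
else if (ptarget[7] <= 0xFFF)  mask = 0xFFFFF000;
else if (ptarget[7] <= 0xFFFF) mask = 0xFFFF0000;
else                           mask = 0;           // no cheap reject possible

do {
   pdata[19] = ++n;
   be32enc(&endiandata[19], n);
   inkhash(hash64, endiandata);
   // (h & 0) == 0 is always true, so mask == 0 falls through to fulltest.
   if ( (hash64[7] & mask) == 0 && fulltest(hash64, ptarget) ) {
      *hashes_done = n - first_nonce + 1;
      return true;
   }
} while (n < max_nonce && !work_restart[thr_id].restart);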
@@ -50,7 +50,8 @@ extern "C"{
#pragma warning (disable: 4146)
#endif

static const sph_u32 IV512[] = {
static const sph_u32 IV512[] =
{
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
@@ -71,38 +72,26 @@ c512( sph_shavite_big_context *sc, const void *msg )
p2 = h[2];
p3 = h[3];

// round

k00 = m[0];
x = v128_xor( p1, k00 );
x = v128_aesenc_nokey( x );

k01 = m[1];
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = m[2];
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = m[3];
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );

p0 = v128_xor( p0, x );

k10 = m[4];
x = v128_xor( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = m[5];
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = m[6];
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = m[7];
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );

p2 = v128_xor( p2, x );
// round 0

x = v128_xoraesenc( p1, k00 );
x = v128_xoraesenc( x, k01 );
x = v128_xoraesenc( x, k02 );
p0 = v128_xoraesencxor( x, k03, p0 );

x = v128_xoraesenc( p3, k10 );
x = v128_xoraesenc( x, k11 );
x = v128_xoraesenc( x, k12 );
p2 = v128_xoraesencxor( x, k13, p2 );

for ( r = 0; r < 3; r ++ )
{
@@ -113,198 +102,165 @@ c512( sph_shavite_big_context *sc, const void *msg )
if ( r == 0 )
k00 = v128_xor( k00, v128_set32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = v128_xoraesenc( p0, k00 );

x = v128_xor( p0, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );

if ( r == 1 )
k01 = v128_xor( k01, v128_set32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = v128_xoraesenc( x, k01 );

x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );

k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );

p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k03, p3 );

k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xoraesenc( p2, k10 );

x = v128_xor( p2, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );

k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );

k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );

if ( r == 2 )
k13 = v128_xor( k13, v128_set32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) );

x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k13, p1 );

// round 2, 6, 10

k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p3, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p3, k00 );

p2 = v128_xor( p2, x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );

k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );

k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p2 = v128_xoraesencxor( x, k03, p2 );

k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p1, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p1, k10 );

p0 = v128_xor( p0, x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );

k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );

k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p0 = v128_xoraesencxor( x, k13, p0 );

// round 3, 7, 11

k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p2, k00 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p2, k00 );

k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k01 );

k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );

k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );

p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k03, p1 );

k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p0, k10 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p0, k10 );

k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );

k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );

k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );

p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k13, p3 );

// round 4, 8, 12

k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p1, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p1, k00 );

p0 = v128_xor( p0, x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );

k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );

k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p0 = v128_xoraesencxor( x, k03, p0 );

k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p3, k10 );

p2 = v128_xor( p2, x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );

k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );

k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p2 = v128_xoraesencxor( x, k13, p2 );
}

// round 13

k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p0, k00 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p0, k00 );

k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k01 );

k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );

k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );

p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k03, p3 );

k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p2, k10 );

k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );

k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, v128_xor( k11, v128_set32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );

k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );

p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k13, p1 );

h[0] = v128_xor( h[0], p2 );
h[1] = v128_xor( h[1], p3 );
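The rewritten rounds above lean on two fused helpers that this hunk does not show. Plausible definitions, inferred from the before/after pairs in the diff and assuming v128_xor and v128_aesenc_nokey from simd-utils (a sketch of the intent, not necessarily the repo's exact macros):

// xor the round key in, then one key-less AES round
#define v128_xoraesenc( x, k ) \
   v128_aesenc_nokey( v128_xor( x, k ) )

// same, then fold the result into the chaining value p
#define v128_xoraesencxor( x, k, p ) \
   v128_xor( v128_aesenc_nokey( v128_xor( x, k ) ), p )

Folding the xor/aesenc pairs this way keeps each round to one expression per key word, which is what lets the diff delete roughly a third of the lines without changing behavior.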
472  algo/simd/nist.c
@@ -1,472 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "nist.h"
#include "simd_iv.h"


/* #define NO_PRECOMPUTED_IV */
#if defined(__SSE2__) // || defined(__ARM_NEON)

/*
 * Increase the counter.
 */
void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
#ifdef HAS_64
state->count += databitlen;
#else
uint32_t old_count = state->count_low;
state->count_low += databitlen;
if (state->count_low < old_count)
state->count_high++;
#endif
}


/*
 * Initialize the hashState_sd with a given IV.
 * If the IV is NULL, initialize with zeros.
 */
int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {

int n = 8;

state->hashbitlen = hashbitlen;
state->n_feistels = n;
state->blocksize = 128*8;

#ifdef HAS_64
state->count = 0;
#else
state->count_low = 0;
state->count_high = 0;
#endif

// state->buffer = malloc(16*n + 16);
/*
 * Align the buffer to a 128 bit boundary.
 */
// state->buffer += ((unsigned char*)NULL - state->buffer)&15;

// state->A = malloc((4*n+4)*sizeof(u32));
/*
 * Align the buffer to a 128 bit boundary.
 */
// state->A += ((u32*)NULL - state->A)&3;

state->B = state->A+n;
state->C = state->B+n;
state->D = state->C+n;

if (IV)
memcpy(state->A, IV, 4*n*sizeof(u32));
else
memset(state->A, 0, 4*n*sizeof(u32));

// free(state->buffer);
// free(state->A);
return 0;

}

/*
 * Initialize the hashState_sd.
 */
int init_sd(hashState_sd *state, int hashbitlen) {
int r;
char *init;

#ifndef NO_PRECOMPUTED_IV
// if (hashbitlen == 224)
// r=InitIV(state, hashbitlen, IV_224);
// else if (hashbitlen == 256)
// r=InitIV(state, hashbitlen, IV_256);
// else if (hashbitlen == 384)
// r=InitIV(state, hashbitlen, IV_384);
// else
if (hashbitlen == 512)
r = InitIV(state, hashbitlen, IV_512);
else
#endif
{
/*
 * Nonstandard length: IV is not precomputed.
 */
r=InitIV(state, hashbitlen, NULL);
if (r != 0)
return r;

init = malloc(state->blocksize);
memset(init, 0, state->blocksize);
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
#else
sprintf(init, "SIMD-%i v1.1", hashbitlen);
#endif
SIMD_Compress(state, (unsigned char*) init, 0);
free(init);
}
return r;
}

int update_sd( hashState_sd *state, const BitSequence *data,
DataLength databitlen )
{
unsigned current;
unsigned int bs = state->blocksize;
static int align = -1;

if (align == -1)
align = RequiredAlignment();

#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif

if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}

while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
return 0;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
return 0;
}
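The update loop above follows the classic streaming-hash policy: compress full, aligned blocks straight from the caller's buffer, and stage everything else. A generic standalone sketch of the same policy, with hypothetical names (stream_t, process_block, BLOCK_BYTES are mine, not this file's):

#include <stdint.h>
#include <string.h>

#define BLOCK_BYTES 128

typedef struct {
   uint8_t buf[BLOCK_BYTES];
   size_t  fill;                        // bytes currently staged in buf
} stream_t;

static void process_block( const uint8_t *block );   // hypothetical compressor

static void stream_update( stream_t *s, const uint8_t *data, size_t len )
{
   while ( len > 0 )
   {
      if ( s->fill == 0 && len >= BLOCK_BYTES )
      {  // fast path: hash directly from the input, no copy
         process_block( data );
         data += BLOCK_BYTES;  len -= BLOCK_BYTES;
      }
      else
      {  // slow path: stage a partial chunk, flush when the buffer fills
         size_t take = BLOCK_BYTES - s->fill;
         if ( take > len ) take = len;
         memcpy( s->buf + s->fill, data, take );
         s->fill += take;  data += take;  len -= take;
         if ( s->fill == BLOCK_BYTES )
         {
            process_block( s->buf );
            s->fill = 0;
         }
      }
   }
}

update_sd additionally tracks the length in bits and rejects non-byte-aligned tails, which the sketch omits.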

int final_sd( hashState_sd *state, BitSequence *hashval )
{
#ifdef HAS_64
uint64_t l;
int current = state->count & (state->blocksize - 1);
#else
uint32_t l;
int current = state->count_low & (state->blocksize - 1);
#endif
unsigned int i;
BitSequence bs[64];
int isshort = 1;

// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}

//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
#ifdef HAS_64
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
#else
l = state->count_low;
for ( i=0; i<4; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
l = state->count_high;
for ( i=0; i<4; i++ )
{
state->buffer[4+i] = l & 0xff;
l >>= 8;
}
if ( state->count_high == 0 && state->count_low < 16384 )
isshort = 2;
#endif

SIMD_Compress( state, state->buffer, isshort );

// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
bs[4*i  ] = x&0xff;
x >>= 8;
bs[4*i+1] = x&0xff;
x >>= 8;
bs[4*i+2] = x&0xff;
x >>= 8;
bs[4*i+3] = x&0xff;
}

memcpy( hashval, bs, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = bs[state->hashbitlen/8 + 1] & mask;
}
return 0;
}

int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;

if (align == -1)
align = RequiredAlignment();

#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif

if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}

while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}

current = state->count & (state->blocksize - 1);

// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}

//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;

SIMD_Compress( state, state->buffer, isshort );

// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i  ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}

memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}

int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{

InitIV( state, 512, IV_512 );

int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;

if (align == -1)
align = RequiredAlignment();

#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif

if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}

while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}

current = state->count & (state->blocksize - 1);

// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}

//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;

SIMD_Compress( state, state->buffer, isshort );

// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i  ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}

memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}

#endif
@@ -1,64 +0,0 @@
#ifndef __NIST_H__
#define __NIST_H__

/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN(x) x __attribute__((aligned(16)))
#else
#define DATA_ALIGN(x) __declspec(align(16)) x
#endif

#include "simd-compat.h"
#include "compat/sha3-defs.h"
/*
 * NIST API Specific types.
 */

typedef struct {
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;

#ifdef HAS_64
uint64_t count;
#else
uint32_t count_low;
uint32_t count_high;
#endif

DATA_ALIGN(uint32_t A[32]);
uint32_t *B;
uint32_t *C;
uint32_t *D;
DATA_ALIGN(unsigned char buffer[128]);

} hashState_sd;

/*
 * NIST API
 */

int init_sd(hashState_sd *state, int hashbitlen);

int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);

int final_sd(hashState_sd *state, BitSequence *hashval);

int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );

int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );

/*
 * Internal API
 */

//int SupportedLength(int hashbitlen);
int RequiredAlignment(void);
void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);

void fft128_natural(fft_t *a, unsigned char *x);
void fft256_natural(fft_t *a, unsigned char *x);

#endif
@@ -1,198 +0,0 @@
#ifndef __SIMD_COMPAT_H__
#define __SIMD_COMPAT_H__

#include <limits.h>


/*
 * This file defines some helper functions for cross-platform compatibility.
 */

#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
#define GNU_EXT
#endif

/*
 * First define some integer types.
 */

#if defined __STDC__ && __STDC_VERSION__ >= 199901L

/*
 * On C99 implementations, we can use <stdint.h> to get an exact 32-bit
 * type, if any, or otherwise use a wider type.
 */

#include <stdint.h>
#include "compat/brg_types.h"

#define C32(x) ((u32)(x))

#define HAS_64 1

#else

/*
 * On non-C99 systems, we use "unsigned int" if it is wide enough,
 * "unsigned long" otherwise. This supports all "reasonable" architectures.
 * We have to be cautious: pre-C99 preprocessors handle constants
 * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
 */

#if ((UINT_MAX >> 11) >> 11) >= 0x3FF

typedef unsigned int u32;

#define C32(x) ((u32)(x ## U))

#else

typedef unsigned long u32;

#define C32(x) ((u32)(x ## UL))

#endif

/*
 * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
 * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
 * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
 * test whether "unsigned long long" is available; we also know that
 * gcc features this type, even if the libc headers do not know it.
 */

#if ((ULONG_MAX >> 31) >> 31) >= 3

typedef unsigned long u64;

#define HAS_64 1

#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__

typedef unsigned long long u64;

#define HAS_64 1

#else

/*
 * No 64-bit type...
 */

#endif

#endif


/*
 * fft_t should be at least 16 bits wide.
 * using short int will require less memory, but int is faster...
 */

typedef int fft_t;


/*
 * Implementation note: some processors have specific opcodes to perform
 * a rotation. Recent versions of gcc recognize the rotation expression
 * below and use the relevant opcodes, when appropriate.
 */

#define T32(x) ((x) & C32(0xFFFFFFFF))
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) ROTL32(x, (32 - (n)))


/*
 * The macro MAYBE_INLINE expands to an inline qualifier, if available.
 */

#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
#define MAYBE_INLINE static inline
#elif defined _MSC_VER
#define MAYBE_INLINE __inline
#else
#define MAYBE_INLINE
#endif


/* */

#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )

#define rdtsc() \
({ \
u32 lo, hi; \
__asm__ __volatile__ ( /* serialize */ \
"xorl %%eax,%%eax \n cpuid" \
::: "%rax", "%rbx", "%rcx", "%rdx"); \
/* We cannot use "=A", since this would use %rax on x86_64 */ \
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); \
(u64)hi << 32 | lo; \
}) \

#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)

#define rdtsc __rdtsc

#endif

/*
 * The IS_ALIGNED macro tests if a char* pointer is aligned to an
 * n-byte boundary.
 * It is defined as false on unknown architectures.
 */

#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)

#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
/*
 * Unaligned 32-bit accesses are not expensive on x86, so we don't care
 */
#define IS_ALIGNED(p,n) (n<=4 || CHECK_ALIGNED(p,n))

#elif defined __sparcv9 || defined __sparc || defined __arm || \
defined __ia64 || defined __ia64__ || \
defined __itanium__ || defined __M_IA64 || \
defined __powerpc__ || defined __powerpc
#define IS_ALIGNED(p,n) CHECK_ALIGNED(p,n)

#else
/*
 * Unknown architecture: play safe
 */
#define IS_ALIGNED(p,n) 0
#endif
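For reference, a tiny standalone illustration (mine, not the header's) of what CHECK_ALIGNED tests; the pointer-minus-NULL idiom above is the pre-uintptr_t way of writing the same low-bits check:

#include <assert.h>
#include <stdint.h>

// modern equivalent of the header's CHECK_ALIGNED
#define CHECK_ALIGNED(p,n) ((((uintptr_t)(p)) & ((n)-1)) == 0)

int main(void)
{
   _Alignas(16) unsigned char buf[32];
   assert(  CHECK_ALIGNED( buf,     16 ) );   // 16-byte aligned start
   assert( !CHECK_ALIGNED( buf + 1, 16 ) );   // off by one byte
   return 0;
}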


/* checks for endianness */

#if defined (__linux__) || defined (__GLIBC__)
# include <endian.h>
#elif defined (__FreeBSD__)
# include <machine/endian.h>
#elif defined (__OpenBSD__)
# include <sys/endian.h>
#endif

#ifdef __BYTE_ORDER

# if __BYTE_ORDER == __LITTLE_ENDIAN
# define SIMD_LITTLE_ENDIAN
# elif __BYTE_ORDER == __BIG_ENDIAN
# define SIMD_BIG_ENDIAN
# endif

#else

# if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
# define SIMD_LITTLE_ENDIAN
# endif

#endif


#endif
@@ -171,6 +171,53 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
{{ -30, 55, -58, -65, -95, -40, -98, 94 }},
};

#if defined(__AVX2__)

static const __m256i V256_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff,
0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#define V128_00FF _mm256_castsi256_si128( V256_00FF )

#elif defined(__SSE2__) || defined(__ARM_NEON )

static const v128u64_t V128_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };

#endif

#if defined(SIMD512)

static const __m512i V512_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V256_0101 _mm512_castsi512_si256( V512_0101 )
#define V128_0101 _mm512_castsi512_si128( V512_0101 )


static const __m512i V512_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V256_0080 _mm512_castsi512_si256( V512_0080 )
#define V128_0080 _mm512_castsi512_si128( V512_0080 )

#elif defined(__AVX2__)

static const __m256i V256_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V128_0101 _mm256_castsi256_si128( V256_0101 )

static const __m256i V256_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V128_0080 _mm256_castsi256_si128( V256_0080 )

#elif defined(__SSE2__) || defined(__ARM_NEON )

static const v128u64_t V128_0101 = { 0x0101010101010101, 0x0101010101010101 };

static const v128u64_t V128_0080 = { 0x0080008000800080, 0x0080008000800080 };

#endif
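The point of the cast macros above is to keep a single copy of each constant in .rodata: the 256- and 128-bit views are register reinterpretations, so the cast costs no instruction. A minimal sketch of the same pattern under the assumption of an AVX-512 target and GCC/Clang vector initializers (which the constants above already use):

#include <immintrin.h>

static const __m512i K512 = { 0x0101010101010101, 0x0101010101010101,
                              0x0101010101010101, 0x0101010101010101,
                              0x0101010101010101, 0x0101010101010101,
                              0x0101010101010101, 0x0101010101010101 };

static inline __m128i k_low128( void )
{
   return _mm512_castsi512_si128( K512 );   // reinterpret; no code emitted
}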

#if defined(__x86_64__)

#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
@@ -190,13 +237,10 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
#define shufxor(x,s) XCAT(SHUFXOR_,s)(x)

#define REDUCE(x) \
v128_sub16( v128_and( x, v128_64( \
0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
v128_sub16( v128_and( x, V128_00FF ), v128_sra16( x, 8 ) )

#define EXTRA_REDUCE_S(x)\
v128_sub16( x, v128_and( \
v128_64( 0x0101010101010101 ), \
v128_cmpgt16( x, v128_64( 0x0080008000800080 ) ) ) )
v128_sub16( x, v128_and( V128_0101, v128_cmpgt16( x, V128_0080 ) ) )

#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
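Why REDUCE works: each 16-bit lane holds a value x, and since 256 ≡ -1 (mod 257), (x & 255) - (x >> 8) has the same residue mod 257 while shrinking the range; EXTRA_REDUCE_S then conditionally subtracts 257 to land in [-128, 128]. A scalar self-check of that identity (illustrative only; the vector code uses the arithmetic shift v128_sra16, which the int cast below mirrors):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   for ( int v = -32768; v <= 32767; v++ )
   {
      int16_t x = (int16_t) v;
      int r = ( x & 255 ) - ( x >> 8 );   // REDUCE: uses 256 == -1 (mod 257)
      if ( r > 128 ) r -= 257;            // EXTRA_REDUCE_S
      assert( ( r - v ) % 257 == 0 );     // same residue mod 257
      assert( r >= -128 && r <= 128 );    // fully reduced range
   }
   return 0;
}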
@@ -293,10 +337,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
v128u16_t t1= X(i); \
v128u16_t t2= X(j); \
X(i) = v128_unpacklo16( t1, t2 ); \
X(j) = v128_unpackhi16( t1, t2 ); \
v128u16_t t = X(i); \
X(i) = v128_unpacklo16( t, X(j) ); \
X(j) = v128_unpackhi16( t, X(j) ); \
} while(0)

INTERLEAVE( 1, 0 );
@@ -803,23 +846,12 @@ static const m256_v16 FFT256_Twiddle[] =

#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)

#if defined(VL256)

#define REDUCE(x) \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
_mm256_srai_epi16( x, 8 ) )
#else

#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )

#endif
_mm256_sub_epi16( _mm256_and_si256( x, V256_00FF ), _mm256_srai_epi16( x, 8 ) )

#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \
_mm256_set1_epi64x( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
_mm256_sub_epi16( x, _mm256_and_si256( V256_0101, \
_mm256_cmpgt_epi16( x, V256_0080 ) ) )

#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )

@@ -917,10 +949,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
__m256i t1= X(i); \
__m256i t2= X(j); \
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
__m256i t = X(i); \
X(i) = _mm256_unpacklo_epi16( t, X(j) ); \
X(j) = _mm256_unpackhi_epi16( t, X(j) ); \
} while(0)

INTERLEAVE( 1, 0 );
@@ -1658,10 +1689,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
_mm512_srai_epi16( x, 8 ) )

#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
_mm512_set1_epi64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
_mm512_sub_epi16( x, _mm512_and_si512( V512_0101, \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( x, V512_0080 ) ) ) )

// generic, except it calls targeted macros
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )

@@ -1,7 +1,6 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1

#include "simd-compat.h"
#include "simd-utils.h"

#if defined(__SSE2__) || defined (__ARM_NEON)
@@ -34,7 +33,7 @@ typedef struct
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_2way_context __attribute__((aligned(128)));
} simd512_2way_context __attribute__((aligned(64)));
#define simd_2way_context simd512_2way_context

// databitlen is bits

@@ -1,948 +0,0 @@
#include <stdlib.h>
#include <stdio.h>

#include "nist.h"
#include "vector.h"


//#if defined(__SSE2__) || defined(__ARM_NEON)
#if defined(__SSE2__)

#define PRINT_SOME 0

/*
int SupportedLength(int hashbitlen) {
if (hashbitlen <= 0 || hashbitlen > 512)
return 0;
else
return 1;
}
*/

int RequiredAlignment(void) {
return 16;
}

static const union cv V128 = CV(128);
static const union cv V255 = CV(255);
static const union cv V257 = CV(257);
static const union cv8 V0 = CV(0);


/*
 * Reduce modulo 257; result is in [-127; 383]
 * REDUCE(x) := (x&255) - (x>>8)
 */
#define REDUCE(x) \
v16_sub(v16_and(x, V255.v16), v16_shift_r (x, 8))

/*
 * Reduce from [-127; 383] to [-128; 128]
 * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
 */
#define EXTRA_REDUCE_S(x) \
v16_sub(x, v16_and(V257.v16, v16_cmp(x, V128.v16)))

/*
 * Reduce modulo 257; result is in [-128; 128]
 */
#define REDUCE_FULL_S(x) \
EXTRA_REDUCE_S(REDUCE(x))

#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))

#define DO_REDUCE_FULL_S(i) \
do { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)

#define MAYBE_VOLATILE

MAYBE_INLINE void fft64(void *a) {

v16* const A = a;

register v16 X0, X1, X2, X3, X4, X5, X6, X7;
/*
#if V16_SIZE == 8
#define X(i) A[i]
#elif V16_SIZE == 4
#define X(i) A[2*i]
#endif
*/
#define X(i) X##i

X0 = A[0];
X1 = A[1];
X2 = A[2];
X3 = A[3];
X4 = A[4];
X5 = A[5];
X6 = A[6];
X7 = A[7];

#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))

/*
 * Begin with 8 parallel DIF FFT_8
 *
 * FFT_8 using w=4 as 8th root of unity
 * Unrolled decimation in frequency (DIF) radix-2 NTT.
 * Output data is in revbin_permuted order.
 */

static const int w[] = {0, 2, 4, 6};
// v16 *Twiddle = (v16*)FFT64_Twiddle;

#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 v = X(j); \
X(j) = v16_add(X(i), X(j)); \
if (n) \
X(i) = v16_shift_l(v16_sub(X(i), v), w[n]); \
else \
X(i) = v16_sub(X(i), v); \
} while(0)

BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);

DO_REDUCE(2);
DO_REDUCE(3);

BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);

DO_REDUCE(1);

BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);

/* We don't need to reduce X(7) */
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);

#undef BUTTERFLY

/*
 * Multiply by twiddle factors
 */

X(6) = v16_mul(X(6), FFT64_Twiddle[0].v16);
X(5) = v16_mul(X(5), FFT64_Twiddle[1].v16);
X(4) = v16_mul(X(4), FFT64_Twiddle[2].v16);
X(3) = v16_mul(X(3), FFT64_Twiddle[3].v16);
X(2) = v16_mul(X(2), FFT64_Twiddle[4].v16);
X(1) = v16_mul(X(1), FFT64_Twiddle[5].v16);
X(0) = v16_mul(X(0), FFT64_Twiddle[6].v16);

/*
 * Transpose the FFT state with a revbin order permutation
 * on the rows and the columns.
 * This will make the full FFT_64 in order.
 */

#define INTERLEAVE(i,j) \
do { \
v16 t1= X(i); \
v16 t2= X(j); \
X(i) = v16_interleavel(t1, t2); \
X(j) = v16_interleaveh(t1, t2); \
} while(0)

INTERLEAVE(1, 0);
INTERLEAVE(3, 2);
INTERLEAVE(5, 4);
INTERLEAVE(7, 6);

INTERLEAVE(2, 0);
INTERLEAVE(3, 1);
INTERLEAVE(6, 4);
INTERLEAVE(7, 5);

INTERLEAVE(4, 0);
INTERLEAVE(5, 1);
INTERLEAVE(6, 2);
INTERLEAVE(7, 3);

#undef INTERLEAVE

/*
 * Finish with 8 parallel DIT FFT_8
 *
 * FFT_8 using w=4 as 8th root of unity
 * Unrolled decimation in time (DIT) radix-2 NTT.
 * Input data is in revbin_permuted order.
 */

#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 u = X(j); \
if (n) \
X(i) = v16_shift_l(X(i), w[n]); \
X(j) = v16_sub(X(j), X(i)); \
X(i) = v16_add(u, X(i)); \
} while(0)

DO_REDUCE(0);
DO_REDUCE(1);
DO_REDUCE(2);
DO_REDUCE(3);
DO_REDUCE(4);
DO_REDUCE(5);
DO_REDUCE(6);
DO_REDUCE(7);

BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);

BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);

DO_REDUCE(3);

BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);

DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
DO_REDUCE_FULL_S(7);

#undef BUTTERFLY

A[0] = X0;
A[1] = X1;
A[2] = X2;
A[3] = X3;
A[4] = X4;
A[5] = X5;
A[6] = X6;
A[7] = X7;

#undef X

}

|
||||
MAYBE_INLINE void fft128(void *a) {
|
||||
|
||||
int i;
|
||||
|
||||
// Temp space to help for interleaving in the end
|
||||
v16 B[8];
|
||||
|
||||
v16 *A = (v16*) a;
|
||||
// v16 *Twiddle = (v16*)FFT128_Twiddle;
|
||||
|
||||
/* Size-2 butterflies */
|
||||
|
||||
for (i = 0; i<8; i++) {
|
||||
B[i] = v16_add(A[i], A[i+8]);
|
||||
B[i] = REDUCE_FULL_S(B[i]);
|
||||
A[i+8] = v16_sub(A[i], A[i+8]);
|
||||
A[i+8] = REDUCE_FULL_S(A[i+8]);
|
||||
A[i+8] = v16_mul(A[i+8], FFT128_Twiddle[i].v16);
|
||||
A[i+8] = REDUCE_FULL_S(A[i+8]);
|
||||
}
|
||||
|
||||
fft64(B);
|
||||
fft64(A+8);
|
||||
|
||||
/* Transpose (i.e. interleave) */
|
||||
|
||||
for (i=0; i<8; i++) {
|
||||
A[2*i] = v16_interleavel (B[i], A[i+8]);
|
||||
A[2*i+1] = v16_interleaveh (B[i], A[i+8]);
|
||||
}
|
||||
}
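/* Sketch of the recurrence fft128 implements, under the same Z/257Z
 * assumption (hypothetical helper, scalar for clarity): a size-2N NTT
 * splits into the NTT of the pairwise sums (even outputs) and the NTT
 * of the twiddled pairwise differences (odd outputs), which is exactly
 * the loop above with N = 8 v16 vectors and omega from FFT128_Twiddle. */
#if 0
static void size_2n_split(const int *a, int *b, int *c,
                          const int *omega, int n)
{
   for (int i = 0; i < n; i++)
   {
      b[i] = (a[i] + a[i+n]) % 257;               /* feeds even outputs */
      c[i] = ((a[i] - a[i+n]) * omega[i]) % 257;  /* feeds odd outputs  */
   }
}
#endif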

#ifdef v16_broadcast
/* Compute the FFT using a table.
 * The function works if the message values are smaller than 2^14.
 */
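/* The table trick below relies on linearity of the transform: for the
 * final block only the two message words in x vary, and the transform of
 * a delta at position p is the geometric sequence omega^(p*k), so
 * per-output rows can be precomputed once. Sketch with hypothetical
 * names; for p = 0 every output is 1, which is presumably why msg1 is
 * added below without a table lookup. */
#if 0
static void delta_transform(int omega, int p, int out[128])
{
   int w = 1, step = 1;
   for (int j = 0; j < p; j++) step = (step * omega) % 257;
   for (int k = 0; k < 128; k++) { out[k] = w; w = (w * step) % 257; }
}
#endif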
void fft128_msg_final(short *a, const unsigned char *x) {

   static const union cv FFT128_Final_Table[] = {
      {{   1, -211,   60,  -67,    2,   92, -137,  123}},
      {{   2,  118,   45,  111,   97,  -46,   49, -106}},
      {{   4,  -73,  -17,  -11,    8,  111,  -34,  -22}},
      {{ -68,   -4,   76,  -25,   96,  -96,  -68,   -9}},
      {{  16,  -35,  -68,  -44,   32,  -70, -136,  -88}},
      {{   0, -124,   17,   12,   -6,   57,   47,   -8}},
      {{  64,  117,  -15,   81,  128,  -23,  -30,  -95}},
      {{ -68,  -53,  -52,  -70,  -10, -117,   77,   21}},
      {{  -1,  -46,  -60,   67,   -2,  -92, -120, -123}},
      {{  -2, -118,  -45, -111,  -97,   46,  -49,  106}},
      {{  -4,   73,   17,   11,   -8, -111,   34,   22}},
      {{  68,    4,  -76,   25,  -96,   96,   68,    9}},
      {{ -16, -222,   68,   44,  -32,   70, -121,   88}},
      {{   0,  124,  -17,  -12,    6,  -57,  -47,    8}},
      {{ -64, -117,   15,  -81, -128, -234,   30,   95}},
      {{  68,   53,   52,   70,   10,  117,  -77,  -21}},
      {{-118,  -31,  116,  -61,   21,  -62,  -25, -122}},
      {{-101,  107,  -45,  -95,   -8,    3,  101,  -34}},
      {{  42, -124,  -50,   13,   84,    9, -100, -231}},
      {{ -79,  -53,   82,   65,  -81,   47,   61,  107}},
      {{ -89, -239,   57, -205, -178,   36, -143,  104}},
      {{-126,  113,   33,  111,  103, -109,   65, -114}},
      {{ -99,   72,  -29,  -49, -198, -113,  -58,  -98}},
      {{   8,  -27, -106,  -30,  111,    6,   10, -108}},
      {{-139,   31, -116, -196,  -21,   62,   25, -135}},
      {{ 101, -107,   45,   95,    8,   -3, -101,   34}},
      {{ -42, -133,   50,  -13,  -84,   -9,  100,  -26}},
      {{  79,   53,  -82,  -65,   81,  -47,  -61, -107}},
      {{-168,  -18,  -57,  -52,  -79,  -36, -114, -104}},
      {{ 126, -113,  -33, -111, -103,  109,  -65,  114}},
      {{  99,  -72, -228,   49,  -59,  113,   58, -159}},
      {{  -8,   27,  106,   30, -111,   -6,  -10,  108}}
   };

   // v16 *Table = (v16*)FFT128_Final_Table;
   v16 *A = (v16*) a;
   v16 msg1 = v16_broadcast(x[0]>128 ? x[0]-257 : x[0]);
   v16 msg2 = v16_broadcast(x[1]>128 ? x[1]-257 : x[1]);
   // v16 msg2 = v16_broadcast(x[1]);

#if 0
   int i;
   for (i=0; i<16; i++) {
      v16 tmp = v16_mul(FFT128_Final_Table[2*i].v16,   msg2);
      v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);
      sum = v16_add(sum, tmp);
      A[i] = REDUCE_FULL_S(sum);
   }

#else

#define FFT_FINAL(i) \
   v16 tmp##i = v16_mul(FFT128_Final_Table[2*i].v16,   msg2); \
   v16 sum##i = v16_add(FFT128_Final_Table[2*i+1].v16, msg1); \
   sum##i = v16_add(sum##i, tmp##i); \
   A[i] = REDUCE_FULL_S(sum##i);

   FFT_FINAL(0)
   FFT_FINAL(1)
   FFT_FINAL(2)
   FFT_FINAL(3)
   FFT_FINAL(4)
   FFT_FINAL(5)
   FFT_FINAL(6)
   FFT_FINAL(7)
   FFT_FINAL(8)
   FFT_FINAL(9)
   FFT_FINAL(10)
   FFT_FINAL(11)
   FFT_FINAL(12)
   FFT_FINAL(13)
   FFT_FINAL(14)
   FFT_FINAL(15)

#endif

}
#endif

void fft128_msg(short *a, const unsigned char *x, int final) {

   static const union cv Tweak =
      {{0,0,0,0,0,0,0,1}};
   static const union cv FinalTweak =
      {{0,0,0,0,0,1,0,1}};


   v8 *X = (v8*) x;
   v16 *A = (v16*) a;
   // v16 *Twiddle = (v16*)FFT128_Twiddle;

#define UNPACK(i) \
   do { \
      v8 t = X[i]; \
      A[2*i]   = v8_mergel(t, V0.v8); \
      A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
      A[2*i+8] = REDUCE(A[2*i+8]); \
      A[2*i+1] = v8_mergeh(t, V0.v8); \
      A[2*i+9] = v16_mul(A[2*i+1], FFT128_Twiddle[2*i+1].v16); \
      A[2*i+9] = REDUCE(A[2*i+9]); \
   } while(0)


/*
 * This makes it possible to tweak the last butterflies to introduce X^127
 */
#define UNPACK_TWEAK(i,tw) \
   do { \
      v8 t = X[i]; \
      v16 tmp; \
      A[2*i]   = v8_mergel(t, V0.v8); \
      A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
      A[2*i+8] = REDUCE(A[2*i+8]); \
      tmp      = v8_mergeh(t, V0.v8); \
      A[2*i+1] = v16_add(tmp, tw); \
      A[2*i+9] = v16_mul(v16_sub(tmp, tw), FFT128_Twiddle[2*i+1].v16); \
      A[2*i+9] = REDUCE(A[2*i+9]); \
   } while(0)

   UNPACK(0);
   UNPACK(1);
   UNPACK(2);
   if (final)
      UNPACK_TWEAK(3, FinalTweak.v16);
   else
      UNPACK_TWEAK(3, Tweak.v16);

#undef UNPACK
#undef UNPACK_TWEAK

   fft64(a);
   fft64(a+64);
}

#if 0
void fft128_msg(short *a, const unsigned char *x, int final) {

   for (int i=0; i<64; i++)
      a[i] = x[i];

   for (int i=64; i<128; i++)
      a[i] = 0;

   a[127] = 1;
   a[125] = final ? 1 : 0;

   fft128(a);
}
#endif

void fft256_msg(short *a, const unsigned char *x, int final) {

   static const union cv Tweak =
      {{0,0,0,0,0,0,0,1}};
   static const union cv FinalTweak =
      {{0,0,0,0,0,1,0,1}};


   v8 *X = (v8*) x;
   v16 *A = (v16*) a;
   // v16 *Twiddle = (v16*)FFT256_Twiddle;

#define UNPACK(i) \
   do { \
      v8 t = X[i]; \
      A[2*i]    = v8_mergel(t, V0.v8); \
      A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
      A[2*i+16] = REDUCE(A[2*i+16]); \
      A[2*i+1]  = v8_mergeh(t, V0.v8); \
      A[2*i+17] = v16_mul(A[2*i+1], FFT256_Twiddle[2*i+1].v16); \
      A[2*i+17] = REDUCE(A[2*i+17]); \
   } while(0)


/*
 * This makes it possible to tweak the last butterflies to introduce X^127
 */
#define UNPACK_TWEAK(i,tw) \
   do { \
      v8 t = X[i]; \
      v16 tmp; \
      A[2*i]    = v8_mergel(t, V0.v8); \
      A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
      A[2*i+16] = REDUCE(A[2*i+16]); \
      tmp       = v8_mergeh(t, V0.v8); \
      A[2*i+1]  = v16_add(tmp, tw); \
      A[2*i+17] = v16_mul(v16_sub(tmp, tw), FFT256_Twiddle[2*i+1].v16); \
      A[2*i+17] = REDUCE(A[2*i+17]); \
   } while(0)

   UNPACK(0);
   UNPACK(1);
   UNPACK(2);
   UNPACK(3);
   UNPACK(4);
   UNPACK(5);
   UNPACK(6);
   if (final)
      UNPACK_TWEAK(7, FinalTweak.v16);
   else
      UNPACK_TWEAK(7, Tweak.v16);

#undef UNPACK
#undef UNPACK_TWEAK

   fft128(a);
   fft128(a+128);
}


void rounds(u32* state, const unsigned char* msg, short* fft) {

   v32* S = (v32*) state;
   const v32* M = (v32*)msg;
   volatile v16* W = (v16*)fft;

   register v32 S0, S1, S2, S3;
   static const union cv code[] = { CV(185), CV(233) };

   S0 = v32_xor(S[0], v32_bswap(M[0]));
   S1 = v32_xor(S[1], v32_bswap(M[1]));
   S2 = v32_xor(S[2], v32_bswap(M[2]));
   S3 = v32_xor(S[3], v32_bswap(M[3]));

#define S(i) S##i


/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */

#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))

#define F(a,b,c,fun) F_##fun (a,b,c)

/*
 * We split the round function in two halves
 * so as to insert some independent computations in between
 */
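/* The STEP_1/STEP_2 split below is classic software pipelining: the
 * message-expansion work for the next step is independent of the current
 * step's final add, so it can be scheduled into the latency gap. A
 * minimal sketch of the same idea with hypothetical data: */
#if 0
static int pipelined(int n, const int w[], int next[])
{
   int acc = 0;
   for (int i = 0; i < n; i++)
   {
      int t = w[i] * 40503;   /* "STEP_1": start the long-latency part */
      next[i] ^= w[i];        /* independent work overlaps its latency */
      acc += t;               /* "STEP_2": commit the result           */
   }
   return acc;
}
#endif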

#define SUM3_00 1
#define SUM3_01 2
#define SUM3_02 3
#define SUM3_10 2
#define SUM3_11 3
#define SUM3_12 1
#define SUM3_20 3
#define SUM3_21 1
#define SUM3_22 2

#define STEP_1(a,b,c,d,w,fun,r,s,z) \
   do { \
      if (PRINT_SOME) { \
         int j; \
         v32 ww=w, aa=a, bb=b, cc=c, dd=d; \
         u32 *WW = (void*)&ww; \
         u32 *AA = (void*)&aa; \
         u32 *BB = (void*)&bb; \
         u32 *CC = (void*)&cc; \
         u32 *DD = (void*)&dd; \
         for (j=0; j<4; j++) { \
            printf ("%08x/%2i/%2i[%i]: %08x %08x %08x %08x\n", \
                    WW[j], r, s, SUM3_##z, \
                    AA[j], BB[j], CC[j], DD[j]); \
         } \
      } \
      TT = F(a,b,c,fun); \
      a  = v32_rotate(a,r); \
      w  = v32_add(w, d); \
      TT = v32_add(TT, w); \
      TT = v32_rotate(TT,s); \
      d  = v32_shufxor(a,SUM3_##z); \
   } while(0)

#define STEP_2(a,b,c,d,w,fun,r,s) \
   do { \
      d = v32_add(d, TT); \
   } while(0)

#define STEP(a,b,c,d,w,fun,r,s,z) \
   do { \
      register v32 TT; \
      STEP_1(a,b,c,d,w,fun,r,s,z); \
      STEP_2(a,b,c,d,w,fun,r,s); \
   } while(0);


#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
              fun,r,s,t,u,z,r0) \
   do { \
      register v32 W0, W1, W2, W3, TT; \
      W0 = v16_merge##u0(W[h0], W[l0]); \
      W0 = V1632(v16_mul(V3216(W0), code[z].v16)); \
      STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, r0##0); \
      W1 = v16_merge##u1(W[h1], W[l1]); \
      W1 = V1632(v16_mul(V3216(W1), code[z].v16)); \
      STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
      STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, r0##1); \
      W2 = v16_merge##u2(W[h2], W[l2]); \
      W2 = V1632(v16_mul(V3216(W2), code[z].v16)); \
      STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
      STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, r0##2); \
      W3 = v16_merge##u3(W[h3], W[l3]); \
      W3 = V1632(v16_mul(V3216(W3), code[z].v16)); \
      STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
      STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, r0##0); \
      STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
   } while(0)


/*
 * 4 rounds with code 185
 */
   ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0,  3, 23, 17, 27, 0, 0);
   ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1,  3, 23, 17, 27, 0, 1);
   ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22,  7, 0, 2);
   ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22,  7, 0, 0);

/*
 * 4 rounds with code 233
 */
   ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1, 1);
   ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1, 2);
   ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1, 0);
   ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1, 1);


/*
 * 1 round as feed-forward
 */
   STEP(S(0), S(1), S(2), S(3), S[0], 0,  4, 13, 20);
   STEP(S(3), S(0), S(1), S(2), S[1], 0, 13, 10, 21);
   STEP(S(2), S(3), S(0), S(1), S[2], 0, 10, 25, 22);
   STEP(S(1), S(2), S(3), S(0), S[3], 0, 25,  4, 20);

   S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);

#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}


void rounds512(u32* state, const unsigned char* msg, short* fft) {

   v32* S = (v32*) state;
   v32* M = (v32*) msg;
   v16* W = (v16*) fft;

   register v32 S0l, S1l, S2l, S3l;
   register v32 S0h, S1h, S2h, S3h;
   static const union cv code[] = { CV(185), CV(233) };

   S0l = v32_xor(S[0], v32_bswap(M[0]));
   S0h = v32_xor(S[1], v32_bswap(M[1]));
   S1l = v32_xor(S[2], v32_bswap(M[2]));
   S1h = v32_xor(S[3], v32_bswap(M[3]));
   S2l = v32_xor(S[4], v32_bswap(M[4]));
   S2h = v32_xor(S[5], v32_bswap(M[5]));
   S3l = v32_xor(S[6], v32_bswap(M[6]));
   S3h = v32_xor(S[7], v32_bswap(M[7]));

#define S(i) S##i


/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */

#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))

#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)

/*
 * We split the round function in two halves
 * so as to insert some independent computations in between
 */

#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6

#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0

#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1

#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2

#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3

#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4

#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5

#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)

#define PERM_0(d,a) /* XOR 1 */ \
   do { \
      d##l = v32_shufxor(a##l,1); \
      d##h = v32_shufxor(a##h,1); \
   } while(0)

#define PERM_1(d,a) /* XOR 6 */ \
   do { \
      d##l = v32_shufxor(a##h,2); \
      d##h = v32_shufxor(a##l,2); \
   } while(0)

#define PERM_2(d,a) /* XOR 2 */ \
   do { \
      d##l = v32_shufxor(a##l,2); \
      d##h = v32_shufxor(a##h,2); \
   } while(0)

#define PERM_3(d,a) /* XOR 3 */ \
   do { \
      d##l = v32_shufxor(a##l,3); \
      d##h = v32_shufxor(a##h,3); \
   } while(0)

#define PERM_4(d,a) /* XOR 5 */ \
   do { \
      d##l = v32_shufxor(a##h,1); \
      d##h = v32_shufxor(a##l,1); \
   } while(0)

#define PERM_5(d,a) /* XOR 7 */ \
   do { \
      d##l = v32_shufxor(a##h,3); \
      d##h = v32_shufxor(a##l,3); \
   } while(0)

#define PERM_6(d,a) /* XOR 4 */ \
   do { \
      d##l = a##h; \
      d##h = a##l; \
   } while(0)
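/* Reference semantics of the PERM_k macros above: each one applies the
 * "xor k" permutation to the eight 32-bit lanes held in the (l,h) pair.
 * k = 1,2,3 stay within a register via v32_shufxor, and adding 4 to k
 * swaps the l and h halves; e.g. PERM_1 = swap halves + xor 2 = XOR 6. */
#if 0
static void xor_perm(const unsigned src[8], unsigned dst[8], int k)
{
   for (int i = 0; i < 8; i++)
      dst[i] = src[i ^ k];
}
#endif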

#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
   do { \
      if (PRINT_SOME) { \
         int j; \
         v32 ww=w##l, aa=a##l, bb=b##l, cc=c##l, dd=d##l; \
         u32 *WW = (void*)&ww; \
         u32 *AA = (void*)&aa; \
         u32 *BB = (void*)&bb; \
         u32 *CC = (void*)&cc; \
         u32 *DD = (void*)&dd; \
         for (j=0; j<4; j++) { \
            printf ("%08x/%2i/%2i: %08x %08x %08x %08x\n", \
                    WW[j], r, s, \
                    AA[j], BB[j], CC[j], DD[j]); \
         } \
      } \
      TTl  = Fl(a,b,c,fun); \
      TTh  = Fh(a,b,c,fun); \
      a##l = v32_rotate(a##l,r); \
      a##h = v32_rotate(a##h,r); \
      w##l = v32_add(w##l, d##l); \
      w##h = v32_add(w##h, d##h); \
      TTl  = v32_add(TTl, w##l); \
      TTh  = v32_add(TTh, w##h); \
      TTl  = v32_rotate(TTl,s); \
      TTh  = v32_rotate(TTh,s); \
      PERM(z,d,a); \
   } while(0)

#define STEP_1(a,b,c,d,w,fun,r,s,z) \
   STEP_1_(a,b,c,d,w,fun,r,s,z)

#define STEP_2_(a,b,c,d,w,fun,r,s) \
   do { \
      d##l = v32_add(d##l, TTl); \
      d##h = v32_add(d##h, TTh); \
   } while(0)

#define STEP_2(a,b,c,d,w,fun,r,s) \
   STEP_2_(a,b,c,d,w,fun,r,s)

#define STEP(a,b,c,d,w1,w2,fun,r,s,z) \
   do { \
      register v32 TTl, TTh, Wl=w1, Wh=w2; \
      STEP_1(a,b,c,d,W,fun,r,s,z); \
      STEP_2(a,b,c,d,W,fun,r,s); \
   } while(0);


#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)

#define MSG(w,hh,ll,u,z) \
   do { \
      int a = MSG_##u(hh); \
      int b = MSG_##u(ll); \
      w##l = v16_mergel(W[a], W[b]); \
      w##l = V1632(v16_mul(V3216(w##l), code[z].v16)); \
      w##h = v16_mergeh(W[a], W[b]); \
      w##h = V1632(v16_mul(V3216(w##h), code[z].v16)); \
   } while(0)

#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
              fun,r,s,t,u,z) \
   do { \
      register v32 W0l, W1l, W2l, W3l, TTl; \
      register v32 W0h, W1h, W2h, W3h, TTh; \
      MSG(W0,h0,l0,u0,z); \
      STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, 0); \
      MSG(W1,h1,l1,u1,z); \
      STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
      STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, 1); \
      MSG(W2,h2,l2,u2,z); \
      STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
      STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, 2); \
      MSG(W3,h3,l3,u3,z); \
      STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
      STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, 3); \
      STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
   } while(0)


/*
 * 4 rounds with code 185
 */
#define PERM_START 0
   ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0,  3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
   ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1,  3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
   ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22,  7, 0);
#undef PERM_START
#define PERM_START 5
   ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22,  7, 0);
#undef PERM_START

/*
 * 4 rounds with code 233
 */
#define PERM_START 2
   ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1);
#undef PERM_START
#define PERM_START 6
   ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1);
#undef PERM_START
#define PERM_START 3
   ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
   ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1);
#undef PERM_START


/*
 * 1 round as feed-forward
 */
#define PERM_START 4
   STEP(S(0), S(1), S(2), S(3), S[0], S[1], 0,  4, 13, 0);
   STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
   STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
   STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3);
#undef PERM_START

   S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
   S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;

#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}

void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
   if (state->hashbitlen <= 256) {
      union cv Y[16];
      short* y = (short*) Y[0].u16;

#ifdef v16_broadcast
      if (final == 2) {
         fft128_msg_final(y, m);
         rounds(state->A, m, y);
      } else {
         fft128_msg(y, m, final);
         rounds(state->A, m, y);
      }
#else
      fft128_msg(y, m, final);
      rounds(state->A, m, y);
#endif
   } else {
      union cv Y[32];
      short* y = (short*) Y[0].u16;

      fft256_msg(y, m, final);
      rounds512(state->A, m, y);
   }
}

/*
 * Give the FFT output in the regular order for consistency checks
 */
void fft128_natural(fft_t *x, unsigned char *a) {
   union cv Y[16];
   short* y = (short*) Y[0].u16;
   int i;

   fft128_msg(y, a, 0);

   for(i=0; i<64; i++) {
      x[2*i]   = y[i];
      x[2*i+1] = y[i+64];
   }
}

#endif // SSE2

@@ -1,246 +0,0 @@

#ifndef __VECTOR_H__
#define __VECTOR_H__

#include "compat.h"
#include "simd-utils.h"

/*******************************
 * Using GCC vector extensions *
 *******************************/
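/* Quick illustration of the extension this header builds on: with
 * vector_size types the ordinary C operators act element-wise, and GCC
 * lowers them to SIMD instructions where available (e.g. on SSE2 the
 * function below compiles to pmullw + paddw). */
#if 0
typedef short demo_v16 __attribute__ ((vector_size (16)));
static demo_v16 demo_madd(demo_v16 a, demo_v16 b, demo_v16 c)
{
   return a * b + c;   /* eight independent 16-bit lanes */
}
#endif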

//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef char v16qi __attribute__ ((vector_size (16)));
typedef short v8hi __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));
typedef float v4sf __attribute__ ((vector_size (16)));
typedef long long int v2di __attribute__ ((vector_size (16)));

typedef short v4hi __attribute__ ((vector_size (8)));
typedef unsigned char v8qi __attribute__ ((vector_size (8)));

typedef v16qi v8;
typedef v8hi  v16;
typedef v4si  v32;
#define V16_SIZE 8

union cv {
   unsigned short u16[8];
   v16 v16;
};

union cv8 {
   unsigned char u8[16];
   v8 v8;
};

union u32 {
   u32 u[4];
   v32 v;
};

#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x)  ( (v8) (x))
#define V816(x)  ((v16) (x))

#if 0
/* These instructions are shorter than the PAND/POR/... that GCC uses */

#define vec_and(x,y)  ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andps ((v4sf) a, (v4sf) b);})
#define vec_or(x,y)   ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_orps ((v4sf) a, (v4sf) b);})
#define vec_xor(x,y)  ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_xorps ((v4sf) a, (v4sf) b);})
#define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andnps ((v4sf) a, (v4sf) b);})

#define v16_and(x,y)  ((v16) vec_and ((x), (y)))
#define v16_or(x,y)   ((v16) vec_or  ((x), (y)))
#define v16_xor(x,y)  ((v16) vec_xor ((x), (y)))
#define v16_andn(x,y) ((v16) vec_andn((x), (y)))

#define v32_and(x,y)  ((v32) vec_and ((x), (y)))
#define v32_or(x,y)   ((v32) vec_or  ((x), (y)))
#define v32_xor(x,y)  ((v32) vec_xor ((x), (y)))
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif

#if defined(__SSE2__)

#define vec_and(x,y) ((x)&(y))
#define vec_or(x,y)  ((x)|(y))
#define vec_xor(x,y) ((x)^(y))

#define v16_and vec_and
#define v16_or  vec_or
#define v16_xor vec_xor

#define v32_and vec_and
#define v32_or  vec_or
#define v32_xor vec_xor

#define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) x, (v2di) y)
#define v16_andn(x,y) ((v16) vec_andn(x,y))
#define v32_andn(x,y) ((v32) vec_andn(x,y))

#define v32_add(x,y) ((x)+(y))

#define v16_add(x,y) ((x)+(y))
#define v16_sub(x,y) ((x)-(y))
#define v16_mul(x,y) ((x)*(y))
#define v16_neg(x)   (-(x))
#define v16_shift_l  __builtin_ia32_psllwi128
#define v16_shift_r  __builtin_ia32_psrawi128
#define v16_cmp      __builtin_ia32_pcmpgtw128

#define v16_interleavel   __builtin_ia32_punpcklwd128
#define v16_interleaveh   __builtin_ia32_punpckhwd128

#define v16_mergel(a,b)   V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b)   V1632(__builtin_ia32_punpckhwd128(a,b))

#define v8_mergel(a,b)    V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b)    V816(__builtin_ia32_punpckhbw128(a,b))

#define v32_shift_l  __builtin_ia32_pslldi128
#define v32_shift_r  __builtin_ia32_psrldi128

#define v32_rotate(x,n) \
   v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))

#define v32_shuf  __builtin_ia32_pshufd

#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */

#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)

#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))

#define v32_bswap(x) (x)

#define v16_broadcast(x) ({ \
   union u32 u; \
   u32 xx = x; \
   u.u[0] = xx | (xx << 16); \
   V3216(v32_shuf(u.v,0)); })
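/* Usage sketch for the statement-expression macro above: x is packed
 * into both 16-bit halves of a 32-bit word and splatted with a shuffle,
 * so every lane of the result equals x. */
#if 0
static v16 splat_five(void) { return v16_broadcast(5); }  /* {5,5,...,5} */
#endif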

#define CV(x) {{x, x, x, x, x, x, x, x}}

#elif defined(__aarch64__) && defined(__ARM_NEON)

#define vec_and( x, y )  v128_and( x, y )
#define vec_or(x,y)      v128_or( x, y )
#define vec_xor(x,y)     v128_xor( x, y )

#define v16_and v128_and
#define v16_or  v128_or
#define v16_xor v128_xor

#define v32_and v128_and
#define v32_or  v128_or
#define v32_xor v128_xor

#define vec_andn( x,y )  v128_andnot( x, y )
#define v16_andn  vec_andn
#define v32_andn  vec_andn

#define v32_add( x, y )  v128_add32( x, y )

#define v16_add( x, y )  v128_add16( x, y )
#define v16_sub( x, y )  v128_sub16( x, y )
#define v16_mul( x, y )  v128_mul16( x, y )
#define v16_neg(x)       v128_negate16( x )
#define v16_shift_l( x, c )  v128_sl16( x, c )
#define v16_shift_r  v128_sr16
#define v16_cmp      v128_cmpgt16

#define v16_interleavel  v128_unpacklo16
#define v16_interleaveh  v128_unpackhi16

#define v16_mergel(a,b)  V1632(v128_unpacklo16(a,b))
#define v16_mergeh(a,b)  V1632(v128_unpackhi16(a,b))

/* note: the byte merges below are still x86 builtins; this header
   provides no NEON mapping for them */
#define v8_mergel(a,b)   V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b)   V816(__builtin_ia32_punpckhbw128(a,b))

#define v32_shift_l  v128_sl32
#define v32_shift_r  v128_sr32

#define v32_rotate(x,n)  v128_rol32( x, n )

#define v32_shuf  __builtin_ia32_pshufd

#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */

#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)

#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))

#define v32_bswap(x) (x)

#define v16_broadcast(x) ({ \
   union u32 u; \
   u32 xx = x; \
   u.u[0] = xx | (xx << 16); \
   V3216(v32_shuf(u.v,0)); })

#define CV(x) {{x, x, x, x, x, x, x, x}}

#else

#error "I don't know how to vectorize on this architecture."

#endif


/* Twiddle tables */

static const union cv FFT64_Twiddle[] = {
   {{1,    2,    4,    8,   16,   32,   64,  128}},
   {{1,   60,    2,  120,    4,  -17,    8,  -34}},
   {{1,  120,    8,  -68,   64,  -30,   -2,   17}},
   {{1,   46,   60,  -67,    2,   92,  120,  123}},
   {{1,   92,  -17,  -22,   32,  117,  -30,   67}},
   {{1,  -67,  120,  -73,    8,  -22,  -68,  -70}},
   {{1,  123,  -34,  -70,  128,   67,   17,   35}},
};


static const union cv FFT128_Twiddle[] = {
   {{  1, -118,   46,  -31,   60,  116,  -67,  -61}},
   {{  2,   21,   92,  -62,  120,  -25,  123, -122}},
   {{  4,   42,  -73, -124,  -17,  -50,  -11,   13}},
   {{  8,   84,  111,    9,  -34, -100,  -22,   26}},
   {{ 16,  -89,  -35,   18,  -68,   57,  -44,   52}},
   {{ 32,   79,  -70,   36,  121,  114,  -88,  104}},
   {{ 64,  -99,  117,   72,  -15,  -29,   81,  -49}},
   {{128,   59,  -23, -113,  -30,  -58,  -95,  -98}},
};


static const union cv FFT256_Twiddle[] = {
   {{   1,   41, -118,   45,   46,   87,  -31,   14}},
   {{  60, -110,  116, -127,  -67,   80,  -61,   69}},
   {{   2,   82,   21,   90,   92,  -83,  -62,   28}},
   {{ 120,   37,  -25,    3,  123,  -97, -122, -119}},
   {{   4,  -93,   42,  -77,  -73,   91, -124,   56}},
   {{ -17,   74,  -50,    6,  -11,   63,   13,   19}},
   {{   8,   71,   84,  103,  111,  -75,    9,  112}},
   {{ -34, -109, -100,   12,  -22,  126,   26,   38}},
   {{  16, -115,  -89,  -51,  -35,  107,   18,  -33}},
   {{ -68,   39,   57,   24,  -44,   -5,   52,   76}},
   {{  32,   27,   79, -102,  -70,  -43,   36,  -66}},
   {{ 121,   78,  114,   48,  -88,  -10,  104, -105}},
   {{  64,   54,  -99,   53,  117,  -86,   72,  125}},
   {{ -15, -101,  -29,   96,   81,  -20,  -49,   47}},
   {{ 128,  108,   59,  106,  -23,   85, -113,   -7}},
   {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94}}
};


#endif

@@ -6,23 +6,23 @@

#if defined (SKEIN_8WAY)

static __thread skein512_8way_context skein512_8way_ctx
static __thread skein512_8x64_context skein512_8x64_ctx
                __attribute__ ((aligned (64)));

void skeinhash_8way( void *state, const void *input )
{
   uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
   skein512_8way_context ctx_skein;
   memcpy( &ctx_skein, &skein512_8way_ctx, sizeof( ctx_skein ) );
   skein512_8x64_context ctx_skein;
   memcpy( &ctx_skein, &skein512_8x64_ctx, sizeof( ctx_skein ) );
   uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
   sha256_8way_context ctx_sha256;
   sha256_8x32_context ctx_sha256;

   skein512_8way_final16( &ctx_skein, vhash64, input + (64*8) );
   skein512_8x64_final16( &ctx_skein, vhash64, input + (64*8) );
   rintrlv_8x64_8x32( vhash32, vhash64, 512 );

   sha256_8way_init( &ctx_sha256 );
   sha256_8way_update( &ctx_sha256, vhash32, 64 );
   sha256_8way_close( &ctx_sha256, state );
   sha256_8x32_init( &ctx_sha256 );
   sha256_8x32_update( &ctx_sha256, vhash32, 64 );
   sha256_8x32_close( &ctx_sha256, state );
}

int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
@@ -46,7 +46,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
   *noncev = mm512_intrlv_blend_32(
              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
   skein512_8way_prehash64( &skein512_8way_ctx, vdata );
   skein512_8x64_prehash64( &skein512_8x64_ctx, vdata );
   do
   {
      skeinhash_8way( hash, vdata );
@@ -73,14 +73,14 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,

#elif defined (SKEIN_4WAY)

static __thread skein512_4way_context skein512_4way_ctx
static __thread skein512_4x64_context skein512_4x64_ctx
                __attribute__ ((aligned (64)));

void skeinhash_4way( void *state, const void *input )
{
   uint64_t vhash64[8*4] __attribute__ ((aligned (128)));
   skein512_4way_context ctx_skein;
   memcpy( &ctx_skein, &skein512_4way_ctx, sizeof( ctx_skein ) );
   skein512_4x64_context ctx_skein;
   memcpy( &ctx_skein, &skein512_4x64_ctx, sizeof( ctx_skein ) );
#if defined(__SHA__)
   uint32_t hash0[16] __attribute__ ((aligned (64)));
   uint32_t hash1[16] __attribute__ ((aligned (64)));
@@ -88,10 +88,10 @@ void skeinhash_4way( void *state, const void *input )
   uint32_t hash3[16] __attribute__ ((aligned (64)));
#else
   uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
   sha256_4way_context ctx_sha256;
   sha256_4x32_context ctx_sha256;
#endif

   skein512_4way_final16( &ctx_skein, vhash64, input + (64*4) );
   skein512_4x64_final16( &ctx_skein, vhash64, input + (64*4) );

#if defined(__SHA__)

@@ -107,9 +107,9 @@ void skeinhash_4way( void *state, const void *input )
#else

   rintrlv_4x64_4x32( vhash32, vhash64, 512 );
   sha256_4way_init( &ctx_sha256 );
   sha256_4way_update( &ctx_sha256, vhash32, 64 );
   sha256_4way_close( &ctx_sha256, state );
   sha256_4x32_init( &ctx_sha256 );
   sha256_4x32_update( &ctx_sha256, vhash32, 64 );
   sha256_4x32_close( &ctx_sha256, state );

#endif
}
@@ -132,7 +132,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   skein512_4way_prehash64( &skein512_4way_ctx, vdata );
   skein512_4x64_prehash64( &skein512_4x64_ctx, vdata );

   *noncev = mm256_intrlv_blend_32(
              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );

@@ -513,7 +513,7 @@ do { \

#if defined(SIMD512)

void skein256_8way_init( skein256_8way_context *sc )
void skein256_8x64_init( skein256_8x64_context *sc )
{
   sc->h0 = _mm512_set1_epi64( 0xCCD044A12FDB3E13 );
   sc->h1 = _mm512_set1_epi64( 0xE83590301A79A9EB );
@@ -527,7 +527,7 @@ void skein256_8way_init( skein256_8way_context *sc )
   sc->ptr = 0;
}

void skein512_8way_init( skein512_8way_context *sc )
void skein512_8x64_init( skein512_8x64_context *sc )
{
   sc->h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
   sc->h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
@@ -542,7 +542,7 @@ void skein512_8way_init( skein512_8way_context *sc )
}

static void
skein_big_core_8way( skein512_8way_context *sc, const void *data,
skein_big_core_8x64( skein512_8x64_context *sc, const void *data,
                     size_t len )
{
   __m512i *vdata = (__m512i*)data;
@@ -587,7 +587,7 @@ skein_big_core_8way( skein512_8way_context *sc, const void *data,
}

static void
skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
skein_big_close_8x64( skein512_8x64_context *sc, unsigned ub, unsigned n,
                      void *dst, size_t out_len )
{
   __m512i *buf;
@@ -621,7 +621,7 @@ skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
   memcpy_512( dst, buf, out_len >> 3 );
}

void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
void skein512_8x64_full( skein512_8x64_context *sc, void *out, const void *data,
                         size_t len )
{
   __m512i h0, h1, h2, h3, h4, h5, h6, h7;
@@ -698,7 +698,7 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
}

void
skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
skein512_8x64_prehash64( skein512_8x64_context *sc, const void *data )
{
   __m512i *vdata = (__m512i*)data;
   __m512i *buf = sc->buf;
@@ -732,7 +732,7 @@ skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
}

void
skein512_8way_final16( skein512_8way_context *sc, void *output,
skein512_8x64_final16( skein512_8x64_context *sc, void *output,
                       const void *data )
{
   __m512i *in = (__m512i*)data;
@@ -778,34 +778,34 @@ skein512_8way_final16( skein512_8way_context *sc, void *output,


void
skein256_8way_update(void *cc, const void *data, size_t len)
skein256_8x64_update(void *cc, const void *data, size_t len)
{
   skein_big_core_8way(cc, data, len);
   skein_big_core_8x64(cc, data, len);
}

void
skein256_8way_close(void *cc, void *dst)
skein256_8x64_close(void *cc, void *dst)
{
   skein_big_close_8way(cc, 0, 0, dst, 32);
   skein_big_close_8x64(cc, 0, 0, dst, 32);
}

void
skein512_8way_update(void *cc, const void *data, size_t len)
skein512_8x64_update(void *cc, const void *data, size_t len)
{
   skein_big_core_8way(cc, data, len);
   skein_big_core_8x64(cc, data, len);
}

void
skein512_8way_close(void *cc, void *dst)
skein512_8x64_close(void *cc, void *dst)
{
   skein_big_close_8way(cc, 0, 0, dst, 64);
   skein_big_close_8x64(cc, 0, 0, dst, 64);
}

#endif // AVX512

#if defined(__AVX2__)

void skein256_4way_init( skein256_4way_context *sc )
void skein256_4x64_init( skein256_4x64_context *sc )
{
   sc->h0 = _mm256_set1_epi64x( 0xCCD044A12FDB3E13 );
   sc->h1 = _mm256_set1_epi64x( 0xE83590301A79A9EB );
@@ -819,7 +819,7 @@ void skein256_4way_init( skein256_4way_context *sc )
   sc->ptr = 0;
}

void skein512_4way_init( skein512_4way_context *sc )
void skein512_4x64_init( skein512_4x64_context *sc )
{
   sc->h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
   sc->h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
@@ -835,7 +835,7 @@ void skein512_4way_init( skein512_4way_context *sc )

// Do not use for 128 byte data length
static void
skein_big_core_4way( skein512_4way_context *sc, const void *data,
skein_big_core_4x64( skein512_4x64_context *sc, const void *data,
                     size_t len )
{
   __m256i *vdata = (__m256i*)data;
@@ -882,7 +882,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
}

static void
skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
skein_big_close_4x64( skein512_4x64_context *sc, unsigned ub, unsigned n,
                      void *dst, size_t out_len )
{
   __m256i *buf;
@@ -920,7 +920,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
}

void
skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
skein512_4x64_full( skein512_4x64_context *sc, void *out, const void *data,
                    size_t len )
{
   __m256i h0, h1, h2, h3, h4, h5, h6, h7;
@@ -995,7 +995,7 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
}

void
skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
skein512_4x64_prehash64( skein512_4x64_context *sc, const void *data )
{
   __m256i *vdata = (__m256i*)data;
   __m256i *buf = sc->buf;
@@ -1029,7 +1029,7 @@ skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
}

void
skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data )
skein512_4x64_final16( skein512_4x64_context *sc, void *out, const void *data )
{
   __m256i *vdata = (__m256i*)data;
   __m256i *buf = sc->buf;
@@ -1073,29 +1073,29 @@ skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data )

// Broken for 80 bytes, use prehash.
void
skein256_4way_update(void *cc, const void *data, size_t len)
skein256_4x64_update(void *cc, const void *data, size_t len)
{
   skein_big_core_4way(cc, data, len);
   skein_big_core_4x64(cc, data, len);
}

void
skein256_4way_close(void *cc, void *dst)
skein256_4x64_close(void *cc, void *dst)
{
   skein_big_close_4way(cc, 0, 0, dst, 32);
   skein_big_close_4x64(cc, 0, 0, dst, 32);
}


// Broken for 80 & 128 bytes, use prehash or full
void
skein512_4way_update(void *cc, const void *data, size_t len)
skein512_4x64_update(void *cc, const void *data, size_t len)
{
   skein_big_core_4way(cc, data, len);
   skein_big_core_4x64(cc, data, len);
}

void
skein512_4way_close(void *cc, void *dst)
skein512_4x64_close(void *cc, void *dst)
{
   skein_big_close_4way(cc, 0, 0, dst, 64);
   skein_big_close_4x64(cc, 0, 0, dst, 64);
}

#endif // AVX2
@@ -1231,7 +1231,7 @@ void skein512_2x64_init( skein512_2x64_context *sc )
}

static void
skein_big_core_2way( skein512_2x64_context *sc, const void *data,
skein_big_core_2x64( skein512_2x64_context *sc, const void *data,
                     size_t len )
{
   v128u64_t *vdata = (v128u64_t*)data;
@@ -1278,7 +1278,7 @@ skein_big_core_2way( skein512_2x64_context *sc, const void *data,
}

static void
skein_big_close_2way( skein512_2x64_context *sc, unsigned ub, unsigned n,
skein_big_close_2x64( skein512_2x64_context *sc, unsigned ub, unsigned n,
                      void *dst, size_t out_len )
{
   v128u64_t *buf;
@@ -1471,13 +1471,13 @@ skein512_2x64_final16( skein512_2x64_context *sc, void *out, const void *data )
void
skein256_2x64_update(void *cc, const void *data, size_t len)
{
   skein_big_core_2way(cc, data, len);
   skein_big_core_2x64(cc, data, len);
}

void
skein256_2x64_close(void *cc, void *dst)
{
   skein_big_close_2way(cc, 0, 0, dst, 32);
   skein_big_close_2x64(cc, 0, 0, dst, 32);
}


@@ -1485,13 +1485,12 @@ skein256_2x64_close(void *cc, void *dst)
void
skein512_2x64_update(void *cc, const void *data, size_t len)
{
   skein_big_core_2way(cc, data, len);
   skein_big_core_2x64(cc, data, len);
}

void
skein512_2x64_close(void *cc, void *dst)
{
   skein_big_close_2way(cc, 0, 0, dst, 64);
   skein_big_close_2x64(cc, 0, 0, dst, 64);
}



@@ -52,24 +52,36 @@ typedef struct
   __m512i h0, h1, h2, h3, h4, h5, h6, h7;
   size_t ptr;
   uint64_t bcount;
} skein_8way_big_context __attribute__ ((aligned (128)));
} skein_8x64_big_context __attribute__ ((aligned (128)));

typedef skein_8way_big_context skein512_8way_context;
typedef skein_8way_big_context skein256_8way_context;
typedef skein_8x64_big_context skein512_8x64_context;
typedef skein_8x64_big_context skein256_8x64_context;

void skein512_8way_full( skein512_8way_context *sc, void *out,
void skein512_8x64_full( skein512_8x64_context *sc, void *out,
                         const void *data, size_t len );
void skein512_8way_init( skein512_8way_context *sc );
void skein512_8way_update( void *cc, const void *data, size_t len );
void skein512_8way_close( void *cc, void *dst );
void skein512_8x64_init( skein512_8x64_context *sc );
void skein512_8x64_update( void *cc, const void *data, size_t len );
void skein512_8x64_close( void *cc, void *dst );

void skein512_8way_prehash64( skein512_8way_context *sc, const void *data );
void skein512_8way_final16( skein512_8way_context *sc, void *out,
void skein512_8x64_prehash64( skein512_8x64_context *sc, const void *data );
void skein512_8x64_final16( skein512_8x64_context *sc, void *out,
                            const void *data );

void skein256_8way_init( skein256_8way_context *sc );
void skein256_8way_update( void *cc, const void *data, size_t len );
void skein256_8way_close( void *cc, void *dst );
void skein256_8x64_init( skein256_8x64_context *sc );
void skein256_8x64_update( void *cc, const void *data, size_t len );
void skein256_8x64_close( void *cc, void *dst );

#define skein512_8way_context     skein512_8x64_context
#define skein512_8way_full        skein512_8x64_full
#define skein512_8way_init        skein512_8x64_init
#define skein512_8way_update      skein512_8x64_update
#define skein512_8way_close       skein512_8x64_close
#define skein512_8way_prehash64   skein512_8x64_prehash64
#define skein512_8way_final16     skein512_8x64_final16
#define skein256_8way_context     skein256_8x64_context
#define skein256_8way_init        skein256_8x64_init
#define skein256_8way_update      skein256_8x64_update
#define skein256_8way_close       skein256_8x64_close

#endif // AVX512

@@ -81,25 +93,35 @@ typedef struct
   __m256i h0, h1, h2, h3, h4, h5, h6, h7;
   size_t ptr;
   uint64_t bcount;
} skein_4way_big_context __attribute__ ((aligned (128)));
} skein_4x64_big_context __attribute__ ((aligned (128)));

typedef skein_4way_big_context skein512_4way_context;
typedef skein_4way_big_context skein256_4way_context;
typedef skein_4x64_big_context skein512_4x64_context;
typedef skein_4x64_big_context skein256_4x64_context;

void skein512_4way_init( skein512_4way_context *sc );
void skein512_4way_full( skein512_4way_context *sc, void *out,
void skein512_4x64_init( skein512_4x64_context *sc );
void skein512_4x64_full( skein512_4x64_context *sc, void *out,
                         const void *data, size_t len );
void skein512_4way_update( void *cc, const void *data, size_t len );
void skein512_4way_close( void *cc, void *dst );

void skein256_4way_init( skein256_4way_context *sc );
void skein256_4way_update( void *cc, const void *data, size_t len );
void skein256_4way_close( void *cc, void *dst );

void skein512_4way_prehash64( skein512_4way_context *sc, const void *data );
void skein512_4way_final16( skein512_4way_context *sc, void *out,
void skein512_4x64_update( void *cc, const void *data, size_t len );
void skein512_4x64_close( void *cc, void *dst );
void skein512_4x64_prehash64( skein512_4x64_context *sc, const void *data );
void skein512_4x64_final16( skein512_4x64_context *sc, void *out,
                            const void *data );

void skein256_4x64_init( skein256_4x64_context *sc );
void skein256_4x64_update( void *cc, const void *data, size_t len );
void skein256_4x64_close( void *cc, void *dst );

#define skein512_4way_context     skein512_4x64_context
#define skein512_4way_full        skein512_4x64_full
#define skein512_4way_init        skein512_4x64_init
#define skein512_4way_update      skein512_4x64_update
#define skein512_4way_close       skein512_4x64_close
#define skein512_4way_prehash64   skein512_4x64_prehash64
#define skein512_4way_final16     skein512_4x64_final16
#define skein256_4way_context     skein256_4x64_context
#define skein256_4way_init        skein256_4x64_init
#define skein256_4way_update      skein256_4x64_update
#define skein256_4way_close       skein256_4x64_close

#endif

@@ -109,10 +131,10 @@ typedef struct
   v128u64_t h0, h1, h2, h3, h4, h5, h6, h7;
   size_t ptr;
   uint64_t bcount;
} skein_2way_big_context __attribute__ ((aligned (128)));
} skein_2x64_big_context __attribute__ ((aligned (128)));

typedef skein_2way_big_context skein512_2x64_context;
typedef skein_2way_big_context skein256_2x64_context;
typedef skein_2x64_big_context skein512_2x64_context;
typedef skein_2x64_big_context skein256_2x64_context;

void skein512_2x64_init( skein512_2x64_context *sc );
void skein512_2x64_full( skein512_2x64_context *sc, void *out,

@@ -21,17 +21,17 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
   __m512i *noncev = (__m512i*)vdata + 9;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   skein512_8way_context ctx;
   skein512_8x64_context ctx;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   *noncev = mm512_intrlv_blend_32(
              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
   skein512_8way_prehash64( &ctx, vdata );
   skein512_8x64_prehash64( &ctx, vdata );
   do
   {
      skein512_8way_final16( &ctx, hash, vdata + (16*8) );
      skein512_8way_full( &ctx, hash, hash, 64 );
      skein512_8x64_final16( &ctx, hash, vdata + (16*8) );
      skein512_8x64_full( &ctx, hash, hash, 64 );

      for ( int lane = 0; lane < 8; lane++ )
         if ( unlikely( hashq3[ lane ] <= targq3 && !bench ) )
@@ -71,16 +71,16 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
   __m256i *noncev = (__m256i*)vdata + 9;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   skein512_4way_context ctx;
   skein512_4x64_context ctx;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   skein512_4way_prehash64( &ctx, vdata );
   skein512_4x64_prehash64( &ctx, vdata );
   *noncev = mm256_intrlv_blend_32(
              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
      skein512_4way_final16( &ctx, hash, vdata + (16*4) );
      skein512_4way_full( &ctx, hash, hash, 64 );
      skein512_4x64_final16( &ctx, hash, vdata + (16*4) );
      skein512_4x64_full( &ctx, hash, hash, 64 );

      for ( int lane = 0; lane < 4; lane++ )
         if ( hash_q3[ lane ] <= targ_q3 )

@@ -640,24 +640,25 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#if defined(__AVX2__)

   __m256i F0, F1, F2, F3, F4, F5, F6, F7;
   __m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] );
   __m256i *table = (__m256i*)fftTable;
   __m256i tbl = table[ input[0] ];
   __m256i *mul = (__m256i*)multipliers;
   __m256i *out = (__m256i*)output;

   F0 = _mm256_mullo_epi32( mul[0], tbl );
   tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] );
   tbl = table[ input[1] ];
   F1 = _mm256_mullo_epi32( mul[1], tbl );
   tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] );
   tbl = table[ input[2] ];
   F2 = _mm256_mullo_epi32( mul[2], tbl );
   tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] );
   tbl = table[ input[3] ];
   F3 = _mm256_mullo_epi32( mul[3], tbl );
   tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] );
   tbl = table[ input[4] ];
   F4 = _mm256_mullo_epi32( mul[4], tbl );
   tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] );
   tbl = table[ input[5] ];
   F5 = _mm256_mullo_epi32( mul[5], tbl );
   tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] );
   tbl = table[ input[6] ];
   F6 = _mm256_mullo_epi32( mul[6], tbl );
   tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] );
   tbl = table[ input[7] ];
   F7 = _mm256_mullo_epi32( mul[7], tbl );
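/* The two indexing forms above are equivalent: each fftTable row holds 8
 * swift_int32_t values, i.e. exactly one __m256i, so table[ input[i] ]
 * and *(__m256i*)&( fftTable[ input[i] << 3 ] ) name the same 32 bytes;
 * the per-access cast is just folded into one pointer conversion.
 * Sketch, assuming a 32-byte aligned table: */
#if 0
const int     *flat = (const int*)fftTable;
const __m256i *rows = (const __m256i*)fftTable;
/* &flat[i << 3] and &rows[i] are the same address for any row index i */
#endif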

#define ADD_SUB( a, b ) \
@@ -677,9 +678,9 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
   ADD_SUB( F1, F3 );
   ADD_SUB( F4, F6 );
   ADD_SUB( F5, F7 );
   F5 = _mm256_slli_epi32( F5, 2 );
   F6 = _mm256_slli_epi32( F6, 4 );
   F7 = _mm256_slli_epi32( F7, 6 );
   F5 = _mm256_slli_epi32( F5, 2 );
   ADD_SUB( F0, F4 );
   ADD_SUB( F1, F5 );
   ADD_SUB( F2, F6 );

@@ -13,11 +13,7 @@
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -43,11 +39,7 @@ typedef struct {
   hashState_luffa luffa;
   cubehashParam cube;
   sph_shavite512_context shavite;
#if defined(__aarch64__)
   sph_simd512_context simd;
#else
   hashState_sd simd;
#endif
   simd512_context simd;
} c11_ctx_holder;

c11_ctx_holder c11_ctx __attribute__ ((aligned (64)));
@@ -69,11 +61,6 @@ void init_c11_ctx()
   init_luffa( &c11_ctx.luffa, 512 );
   cubehashInit( &c11_ctx.cube, 512, 16, 32 );
   sph_shavite512_init( &c11_ctx.shavite );
#if defined(__aarch64__)
   sph_simd512_init( &c11_ctx.simd );
#else
   init_sd( &c11_ctx.simd, 512 );
#endif
}

void c11_hash( void *output, const void *input )
@@ -112,13 +99,7 @@ void c11_hash( void *output, const void *input )
   sph_shavite512( &ctx.shavite, hash, 64);
   sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
   sph_simd512(&ctx.simd, (const void*) hash, 64);
   sph_simd512_close(&ctx.simd, hash);
#else
   update_final_sd( &ctx.simd, (BitSequence *)hash,
                    (const BitSequence *)hash, 512 );
#endif
   simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
   update_final_echo ( &ctx.echo, (BitSequence *)hash,

@@ -13,17 +13,13 @@
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
|
||||
@@ -37,11 +33,7 @@ typedef struct {
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_context simd;
|
||||
#else
|
||||
hashState_sd simd;
|
||||
#endif
|
||||
simd512_context simd;
|
||||
#ifdef __AES__
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
@@ -62,11 +54,6 @@ void init_tt10_ctx()
|
||||
init_luffa( &tt10_ctx.luffa, 512 );
|
||||
cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &tt10_ctx.shavite );
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_init( &tt10_ctx.simd );
|
||||
#else
|
||||
init_sd( &tt10_ctx.simd, 512 );
|
||||
#endif
|
||||
#ifdef __AES__
|
||||
init_groestl( &tt10_ctx.groestl, 64 );
|
||||
#else
|
||||
@@ -222,27 +209,7 @@ void timetravel10_hash(void *output, const void *input)
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
if ( i == 0 )
|
||||
{
|
||||
memcpy( &ctx.simd, &tt10_mid.simd, sizeof tt10_mid.simd );
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512(&ctx.simd, (const void*) input + midlen, tail );
|
||||
sph_simd512_close(&ctx.simd, hash);
|
||||
#else
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hashB,
|
||||
(const BitSequence *)input + midlen, tail*8 );
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512(&ctx.simd, (const void*) hash, 64);
|
||||
sph_simd512_close(&ctx.simd, hash);
|
||||
#else
|
||||
update_sd( &ctx.simd, (const BitSequence *)hashA, dataLen*8 );
|
||||
final_sd( &ctx.simd, (BitSequence *)hashB );
|
||||
#endif
|
||||
}
|
||||
simd512_ctx( &ctx.simd, hashB, hashA, dataLen );
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -325,15 +292,6 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
|
||||
memcpy( &tt10_mid.shavite, &tt10_ctx.shavite, sizeof(tt10_mid.shavite ) );
|
||||
sph_shavite512( &tt10_mid.shavite, endiandata, 64 );
|
||||
break;
|
||||
case 9:
|
||||
memcpy( &tt10_mid.simd, &tt10_ctx.simd, sizeof(tt10_mid.simd ) );
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512( &tt10_mid.simd, (const void*) endiandata, 64 );
|
||||
sph_simd512_close( &tt10_mid.simd, hash);
|
||||
#else
|
||||
update_sd( &tt10_mid.simd, (const BitSequence *)endiandata, 512 );
|
||||
#endif
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -22,12 +22,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif

#include "algo/simd/simd-hash-2way.h"

typedef struct {
sph_blake512_context blake;
@@ -45,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} x11_ctx_holder;

x11_ctx_holder x11_ctx;
@@ -71,11 +62,6 @@ void init_x11_ctx()
init_luffa( &x11_ctx.luffa, 512 );
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x11_ctx.simd );
#else
init_sd( &x11_ctx.simd, 512 );
#endif
}

void x11_hash( void *state, const void *input )
@@ -118,13 +104,7 @@ void x11_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

@@ -20,11 +20,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"

typedef struct {
@@ -37,11 +33,7 @@ typedef struct {
#endif
hashState_luffa luffa;
cubehashParam cube;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -63,11 +55,6 @@ void init_x11evo_ctx()
#endif
init_luffa( &x11evo_ctx.luffa, 512 );
cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init( &x11evo_ctx.simd );
#else
init_sd( &x11evo_ctx.simd, 512 );
#endif
sph_blake512_init( &x11evo_ctx.blake );
sph_bmw512_init( &x11evo_ctx.bmw );
sph_skein512_init( &x11evo_ctx.skein );
@@ -146,12 +133,7 @@ void x11evo_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, (char*)hash );
break;
case 9:
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
break;
case 10:
#ifdef __AES__

@@ -17,12 +17,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif

#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_gost512_context gost;
} x11gost_ctx_holder;

@@ -75,11 +66,6 @@ void init_x11gost_ctx()
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init(&x11gost_ctx.simd);
#else
init_sd( &x11gost_ctx.simd, 512 );
#endif
}

void x11gost_hash(void *output, const void *input)
@@ -123,13 +109,7 @@ void x11gost_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );

#if defined(__aarch64__)
sph_simd512 (&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

@@ -17,11 +17,7 @@
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -44,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x12_ctx_holder;

@@ -71,11 +63,6 @@ void init_x12_ctx()
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x12_ctx.simd );
#else
init_sd( &x12_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x12_ctx.hamsi );
};

@@ -101,13 +88,7 @@ void x12hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hashB, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
simd512_ctx( &ctx.simd, hash, hashB, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -48,11 +44,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x13_ctx_holder;

@@ -77,11 +69,6 @@ void init_x13_ctx()
init_luffa( &x13_ctx.luffa, 512 );
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x13_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init(&x13_ctx.simd);
#else
init_sd( &x13_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x13_ctx.hamsi );
};

@@ -121,13 +108,7 @@ void x13hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

@@ -15,11 +15,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +43,7 @@ typedef struct {
sph_skein512_context skein;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sm3_ctx_t sm3;
} x13bcd_ctx_holder;
@@ -76,11 +68,6 @@ void init_x13bcd_ctx()
sph_keccak512_init( &x13bcd_ctx.keccak );
cubehashInit( &x13bcd_ctx.cube,512,16,32 );
sph_shavite512_init( &x13bcd_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x13bcd_ctx.simd );
#else
init_sd( &x13bcd_ctx.simd, 512 );
#endif
sm3_init( &x13bcd_ctx.sm3 );
sph_hamsi512_init( &x13bcd_ctx.hamsi );
};
@@ -127,13 +114,7 @@ void x13bcd_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

@@ -17,11 +17,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -46,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
@@ -75,11 +67,6 @@ void init_x13sm3_ctx()
init_luffa( &hsr_ctx.luffa,512 );
cubehashInit( &hsr_ctx.cube,512,16,32 );
sph_shavite512_init( &hsr_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &hsr_ctx.simd );
#else
init_sd( &hsr_ctx.simd,512 );
#endif
sm3_init( &hsr_ctx.sm3 );
sph_hamsi512_init( &hsr_ctx.hamsi );
sph_fugue512_init( &hsr_ctx.fugue );
@@ -123,13 +110,7 @@ void x13sm3_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

//11---echo---
#ifdef __AES__

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -49,11 +45,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
} x14_ctx_holder;
@@ -79,11 +71,6 @@ void init_x14_ctx()
init_luffa( &x14_ctx.luffa,512 );
cubehashInit( &x14_ctx.cube,512,16,32 );
sph_shavite512_init( &x14_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x14_ctx.simd );
#else
init_sd( &x14_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x14_ctx.hamsi );
sph_shabal512_init( &x14_ctx.shabal );
};
@@ -124,13 +111,7 @@ void x14hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

@@ -17,12 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif

#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -52,11 +47,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -83,11 +74,6 @@ void init_x15_ctx()
init_luffa( &x15_ctx.luffa,512 );
cubehashInit( &x15_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x15_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x15_ctx.simd );
#else
init_sd( &x15_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x15_ctx.hamsi );
sph_shabal512_init( &x15_ctx.shabal );
sph_whirlpool_init( &x15_ctx.whirlpool );
@@ -131,13 +117,7 @@ void x15hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

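The same consolidation is applied verbatim across x11, x11evo, x11gost, x12, x13, x13bcd, x13sm3, x14 and x15: the context holder keeps a single simd512_context member in place of the #if defined(__aarch64__) pair, the per-algorithm init call disappears from init_*_ctx() because simd512_ctx() initializes its own state on every invocation, and the hash routine shrinks to one line. Only the surrounding buffer names (hash, hashA/hashB) differ from file to file.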
@@ -189,7 +189,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
v128_bswap32_80( edata, pdata );

static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = swab32(pdata[17]);
uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
{
hex_getAlgoString( (const uint32_t*) (&edata[1]), x16r_hash_order );
@@ -236,7 +236,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
if ( hex_hash( hash32, edata, thr_id ) );
if ( hex_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
be32enc( &pdata[19], nonce );

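Two distinct fixes ride in this hex hunk. swab32() gives way to bswap_32() (the byteswap.h spelling, presumably for wider libc portability), and a stray semicolon after the if condition is removed. The semicolon version compiles cleanly but guards an empty statement, so the validity check below it ran even when hex_hash() reported failure. A minimal demonstration of the pitfall:

/* Demo of the stray-semicolon pitfall fixed above. */
#include <stdio.h>

static int do_hash( void ) { return 0; }   /* pretend hashing failed */

int main(void)
{
    if ( do_hash() );                 /* BUG: ';' ends the if here     */
        printf( "checked anyway\n" ); /* runs unconditionally          */

    if ( do_hash() )                  /* fixed: body is now guarded    */
        printf( "never printed\n" );
    return 0;
}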
@@ -31,18 +31,18 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
{
case JH:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16r_ctx.jh );
jh512_8way_update( &x16r_ctx.jh, vdata, 64 );
jh512_8x64_init( &x16r_ctx.jh );
jh512_8x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
keccak512_8way_init( &x16r_ctx.keccak );
keccak512_8way_update( &x16r_ctx.keccak, vdata, 72 );
keccak512_8x64_init( &x16r_ctx.keccak );
keccak512_8x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16r_ctx.skein );
skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
skein512_8x64_init( &x16r_ctx.skein );
skein512_8x64_update( &x16r_ctx.skein, vdata, 64 );
break;
case LUFFA:
{
@@ -78,8 +78,8 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16r_ctx.hamsi );
hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 );
hamsi512_8x64_init( &x16r_ctx.hamsi );
hamsi512_8x64_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -90,8 +90,8 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
shabal512_8way_init( &x16r_ctx.shabal );
shabal512_8way_update( &x16r_ctx.shabal, vdata2, 64 );
shabal512_8x32_init( &x16r_ctx.shabal );
shabal512_8x32_update( &x16r_ctx.shabal, vdata2, 64 );
rintrlv_8x32_8x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
@@ -146,27 +146,27 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
{
case BLAKE:
if ( i == 0 )
blake512_8way_full( &ctx.blake, vhash, input, size );
blake512_8x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_full( &ctx.blake, vhash, vhash, size );
blake512_8x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
bmw512_8x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
bmw512_8x64_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
bmw512_8x64_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
bmw512_8x64_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -191,43 +191,43 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
break;
case JH:
if ( i == 0 )
jh512_8way_update( &ctx.jh, input + (64<<3), 16 );
jh512_8x64_update( &ctx.jh, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, size );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
if ( i == 0 )
keccak512_8way_update( &ctx.keccak, input + (72<<3), 8 );
keccak512_8x64_update( &ctx.keccak, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, size );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, size );
}
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case SKEIN:
if ( i == 0 )
skein512_8way_update( &ctx.skein, input + (64<<3), 16 );
skein512_8x64_update( &ctx.skein, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, size );
skein512_8x64_init( &ctx.skein );
skein512_8x64_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -333,15 +333,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
hamsi512_8x64_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -388,13 +388,13 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
if ( i == 0 )
shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 );
shabal512_8x32_update( &ctx.shabal, vhash + (16<<3), 16 );
else
{
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, size );
}
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -438,16 +438,16 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
}
break;
case SHA_512:
sha512_8way_init( &ctx.sha512 );
sha512_8x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_8way_update( &ctx.sha512, input, size );
sha512_8x64_update( &ctx.sha512, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
sha512_8way_update( &ctx.sha512, vhash, size );
sha512_8x64_update( &ctx.sha512, vhash, size );
}
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -526,7 +526,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x16r_8way_hash( hash, vdata, thr_id ) );
if ( x16r_8way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -556,17 +556,17 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
{
case JH:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16r_ctx.jh );
jh512_4way_update( &x16r_ctx.jh, vdata, 64 );
jh512_4x64_init( &x16r_ctx.jh );
jh512_4x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
keccak512_4way_init( &x16r_ctx.keccak );
keccak512_4way_update( &x16r_ctx.keccak, vdata, 72 );
keccak512_4x64_init( &x16r_ctx.keccak );
keccak512_4x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
skein512_4x64_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
@@ -599,8 +599,8 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16r_ctx.hamsi );
hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 );
hamsi512_4x64_init( &x16r_ctx.hamsi );
hamsi512_4x64_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -610,8 +610,8 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata2, pdata );
shabal512_4way_init( &x16r_ctx.shabal );
shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 );
shabal512_4x32_init( &x16r_ctx.shabal );
shabal512_4x32_update( &x16r_ctx.shabal, vdata2, 64 );
rintrlv_4x32_4x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
@@ -652,24 +652,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
{
case BLAKE:
if ( i == 0 )
blake512_4way_full( &ctx.blake, vhash, input, size );
blake512_4x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way_full( &ctx.blake, vhash, vhash, size );
blake512_4x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
bmw512_4x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way_update( &ctx.bmw, input, size );
bmw512_4x64_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way_update( &ctx.bmw, vhash, size );
bmw512_4x64_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case GROESTL:
@@ -689,35 +689,35 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
break;
case JH:
if ( i == 0 )
jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
jh512_4x64_update( &ctx.jh, input + (64<<2), 16 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, size );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_close( &ctx.jh, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case KECCAK:
if ( i == 0 )
keccak512_4way_update( &ctx.keccak, input + (72<<2), 8 );
keccak512_4x64_update( &ctx.keccak, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, size );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case SKEIN:
if ( i == 0 )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
skein512_4x64_final16( &ctx.skein, vhash, input + (64*4) );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_full( &ctx.skein, vhash, vhash, size );
skein512_4x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
@@ -809,14 +809,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
hamsi512_4x64_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
@@ -845,13 +845,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
if ( i == 0 )
shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 );
shabal512_4x32_update( &ctx.shabal, vhash + (16<<2), 16 );
else
{
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, size );
}
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
break;
case WHIRLPOOL:
@@ -878,16 +878,16 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
}
break;
case SHA_512:
sha512_4way_init( &ctx.sha512 );
sha512_4x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_4way_update( &ctx.sha512, input, size );
sha512_4x64_update( &ctx.sha512, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, size );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, size );
}
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
}
@@ -952,7 +952,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_4way_hash( hash, vdata, thr_id ) );
if ( x16r_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -1353,7 +1353,7 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) );
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

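The bulk rename running through this file appears to encode lane geometry in the function names: _8x64_ means eight 64-bit lanes (one 512-bit vector word per lane), _8x32_ eight 32-bit lanes, _4x64_ four 64-bit lanes, and so on, where the old _8way_ spelling covered both layouts ambiguously (note shabal, a 32-bit-word algorithm, becomes 8x32 while jh, keccak and skein become 8x64). During such a migration the old spellings could be kept alive with shims; these macros are an assumption for illustration, not taken from the repository:

/* Hypothetical compatibility shims for a rename like this: keep the
   old 8way names resolving to the lane-width names while call sites
   migrate. Illustrative only. */
#define jh512_8way_init       jh512_8x64_init
#define jh512_8way_update     jh512_8x64_update
#define jh512_8way_close      jh512_8x64_close
#define shabal512_8way_init   shabal512_8x32_init   /* 32-bit-lane algo */
#define shabal512_8way_update shabal512_8x32_update
#define shabal512_8way_close  shabal512_8x32_close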
@@ -15,7 +15,6 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -126,19 +125,19 @@ bool register_x21s__algo( algo_gate_t* gate );

union _x16r_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sha512_8x64_context sha512;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -171,8 +170,8 @@ int scanhash_x16r_8way( struct work *, uint32_t ,

union _x16r_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
@@ -182,17 +181,17 @@ union _x16r_4way_context_overlay
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sha512_4x64_context sha512;
} __attribute__ ((aligned (64)));
#define _x16r_4x64_context_overlay _x16r_4way_context_overlay

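The contexts sit in a union because the algorithms in an x16r chain run strictly one at a time: overlaying them costs only the largest member instead of the sum, which matters for per-thread, 64-byte-aligned state. The trailing #define keeps the old _x16r_4way_context_overlay name usable after the 4x64 rename. A toy measurement of the saving:

/* Why a union overlay saves memory (toy context sizes): sequential
   algorithms can share one block of storage. */
#include <stdio.h>

typedef struct { unsigned char s[ 64]; } small_ctx;
typedef struct { unsigned char s[416]; } big_ctx;

union overlay { small_ctx a; big_ctx b; };

int main(void)
{
    printf( "sum of sizes:  %zu\n", sizeof(small_ctx) + sizeof(big_ctx) );
    printf( "union overlay: %zu\n", sizeof(union overlay) );
    return 0;
}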
@@ -20,7 +20,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
v128_bswap32_80( edata, pdata );

static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
@@ -28,7 +28,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
x16r_hash_order, swab32( pdata[17] ), timeHash );
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}

x16r_prehash( edata, pdata, x16r_hash_order );

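Masking ntime with 0xffffff80 clears the low seven bits, so the derived time hash, and with it the x16rt algorithm order, only changes once per 128-second window; caching the masked value in s_ntime means the order is recomputed exactly when the window rolls over. A small check of the windowing:

/* The 0xffffff80 mask groups timestamps into 128-second windows. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t t = 1700000000u;
    for ( int i = 0; i < 3; i++, t += 64 )
        printf( "ntime %u -> window %u\n", t, t & 0xffffff80u );
    return 0;   /* the first two land in the same window */
}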
@@ -14,19 +14,19 @@

union _x16rv2_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cubehashParam cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sha512_8x64_context sha512;
sph_tiger_context tiger;
#if defined(__VAES__)
groestl512_4way_context groestl;
@@ -76,29 +76,29 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
blake512_8x64_init( &ctx.blake );
if ( i == 0 )
blake512_8way_full( &ctx.blake, vhash, input, size );
blake512_8x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_full( &ctx.blake, vhash, vhash, size );
blake512_8x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
bmw512_8x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
bmw512_8x64_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
bmw512_8x64_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
bmw512_8x64_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -123,15 +123,15 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case JH:
if ( i == 0 )
jh512_8way_update( &ctx.jh, input + (64<<3), 16 );
jh512_8x64_update( &ctx.jh, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, size );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -197,23 +197,23 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )

intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case SKEIN:
if ( i == 0 )
skein512_8way_update( &ctx.skein, input + (64<<3), 16 );
skein512_8x64_update( &ctx.skein, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, size );
skein512_8x64_init( &ctx.skein );
skein512_8x64_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -395,16 +395,16 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
hamsi512_8x64_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -451,13 +451,13 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
if ( i == 0 )
shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 );
shabal512_8x32_update( &ctx.shabal, vhash + (16<<3), 16 );
else
{
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, size );
}
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -562,9 +562,9 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )

intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -623,8 +623,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
{
case JH:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16rv2_ctx.jh );
jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
jh512_8x64_init( &x16rv2_ctx.jh );
jh512_8x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
@@ -637,8 +637,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16rv2_ctx.skein );
skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
skein512_8x64_init( &x16rv2_ctx.skein );
skein512_8x64_update( &x16rv2_ctx.skein, vdata, 64 );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -649,8 +649,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
hamsi512_8x64_init( &x16rv2_ctx.hamsi );
hamsi512_8x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -661,8 +661,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
shabal512_8way_init( &x16rv2_ctx.shabal );
shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 );
shabal512_8x32_init( &x16rv2_ctx.shabal );
shabal512_8x32_update( &x16rv2_ctx.shabal, vdata2, 64 );
rintrlv_8x32_8x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
@@ -701,8 +701,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,

union _x16rv2_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
@@ -712,17 +712,17 @@ union _x16rv2_4way_context_overlay
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sha512_4x64_context sha512;
sph_tiger_context tiger;
};
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
@@ -761,24 +761,24 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
{
case BLAKE:
if ( i == 0 )
blake512_4way_full( &ctx.blake, vhash, input, size );
blake512_4x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way_full( &ctx.blake, vhash, vhash, size );
blake512_4x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
bmw512_4x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way_update( &ctx.bmw, input, size );
bmw512_4x64_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way_update( &ctx.bmw, vhash, size );
bmw512_4x64_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case GROESTL:
@@ -798,14 +798,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case JH:
if ( i == 0 )
jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
jh512_4x64_update( &ctx.jh, input + (64<<2), 16 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, size );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_close( &ctx.jh, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case KECCAK:
@@ -842,20 +842,20 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;

intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
skein512_4x64_final16( &ctx.skein, vhash, input + (64*4) );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, size );
skein512_4way_close( &ctx.skein, vhash );
skein512_4x64_init( &ctx.skein );
skein512_4x64_update( &ctx.skein, vhash, size );
skein512_4x64_close( &ctx.skein, vhash );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
@@ -976,14 +976,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
hamsi512_4x64_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
@@ -1012,13 +1012,13 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
if ( i == 0 )
shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 );
shabal512_4x32_update( &ctx.shabal, vhash + (16<<2), 16 );
else
{
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, size );
}
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
break;
case WHIRLPOOL:
@@ -1078,9 +1078,9 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;

intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
@@ -1133,8 +1133,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
{
case JH:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16rv2_ctx.jh );
jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
jh512_4x64_init( &x16rv2_ctx.jh );
jh512_4x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
@@ -1146,7 +1146,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
skein512_4x64_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -1156,8 +1156,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
hamsi512_4x64_init( &x16rv2_ctx.hamsi );
hamsi512_4x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -1167,8 +1167,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata32, pdata );
shabal512_4way_init( &x16rv2_ctx.shabal );
shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
shabal512_4x32_init( &x16rv2_ctx.shabal );
shabal512_4x32_update( &x16rv2_ctx.shabal, vdata32, 64 );
rintrlv_4x32_4x64( vdata, vdata32, 640 );
break;
case WHIRLPOOL:

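x16rv2 interposes Tiger ahead of the Keccak, Luffa and SHA-512 stages. Tiger emits 24 bytes, so before the next 64-bit-lane stage the code zero-fills each lane's remaining words (the hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; loops above) and re-interleaves a full 64-byte block. The padding step in isolation (buffer sizes as in the 512-bit pipeline; the digest bytes are stand-ins):

/* Zero-extending a 24-byte Tiger digest to the 64-byte block the next
   stage expects. */
#include <stdint.h>
#include <string.h>

int main(void)
{
    uint8_t tiger_out[24] = { 1, 2, 3 };        /* pretend digest */
    uint8_t block[64];

    memcpy( block, tiger_out, sizeof tiger_out );
    memset( block + sizeof tiger_out, 0,
            sizeof block - sizeof tiger_out );  /* pad with zeros */
    return block[63];                           /* 0: padding applied */
}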
@@ -168,7 +168,7 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
uint32_t ntime = bswap_32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )

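Every scanhash in the x16r family uses the same thread-local cache: the 16-algorithm order is a pure function of the header time, so it is re-derived only when pdata[17] changes. A sketch of the shape (the order derivation here is a stand-in, not the repository's):

/* Thread-local ntime cache: recompute the hash order only on change. */
#include <stdint.h>
#include <stdio.h>

static _Thread_local uint32_t s_ntime = UINT32_MAX;

static void get_order( uint32_t ntime, char *order )
{ snprintf( order, 17, "%08x%08x", ntime, ~ntime ); /* stand-in */ }

void on_new_work( uint32_t ntime, char *order )
{
    if ( s_ntime != ntime )
    {
        get_order( ntime, order );   /* expensive step, only on change */
        s_ntime = ntime;
    }
}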
@@ -137,7 +137,7 @@ int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x20r_8x64_hash( hash, vdata, thr_id ) );
if ( x20r_8x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -205,7 +205,7 @@ int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_4x64_hash( hash, vdata, thr_id ) );
if ( x20r_4x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -269,7 +269,7 @@ int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_2x64_hash( hash, vdata, thr_id ) );
if ( x20r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

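The repeated semicolon fix in these x20r loops matters because the *_hash() routines return 0 when the work-restart flag interrupts them mid-chain (see the work_restart[thr_id].restart check in sonoa below); with the guard restored, aborted attempts no longer fall through to valid_hash() on an undefined buffer. A minimal sketch of the convention:

/* A hash routine that aborts on restart returns 0; the scan loop only
   validates results when hashing actually completed. */
#include <stdbool.h>
#include <stdio.h>

static bool restart_requested = false;

static int hash_work( unsigned *out )
{
    if ( restart_requested ) return 0;  /* abandoned: output undefined */
    *out = 42;
    return 1;
}

int main(void)
{
    unsigned h;
    if ( hash_work( &h ) )              /* guard, as in the fixed code */
        printf( "valid result: %u\n", h );
    return 0;
}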
@@ -21,10 +21,10 @@ static __thread uint64_t* x21s_8way_matrix;

union _x21s_8way_context_overlay
{
haval256_5_8way_context haval;
haval256_8x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sha256_8way_context sha256;
sha256_8x32_context sha256;
} __attribute__ ((aligned (64)));

typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;
@@ -50,9 +50,9 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhash, 64 );
haval256_8x32_close( &ctx.haval, vhash );

dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
@@ -122,9 +122,9 @@ int x21s_8way_hash( void* output, const void* input, int thrid )

intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhash, 64 );
sha256_8way_close( &ctx.sha256, output );
sha256_8x32_init( &ctx.sha256 );
sha256_8x32_update( &ctx.sha256, vhash, 64 );
sha256_8x32_close( &ctx.sha256, output );

return 1;
}
@@ -202,11 +202,11 @@ static __thread uint64_t* x21s_4way_matrix;

union _x21s_4way_context_overlay
{
haval256_5_4way_context haval;
haval256_4x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if !defined(__SHA__)
sha256_4way_context sha256;
sha256_4x32_context sha256;
#endif
} __attribute__ ((aligned (64)));

@@ -228,9 +228,9 @@ int x21s_4way_hash( void* output, const void* input, int thrid )

intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhash, 64 );
haval256_4x32_close( &ctx.haval, vhash );

dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -279,9 +279,9 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
#else

intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way_init( &ctx.sha256 );
sha256_4way_update( &ctx.sha256, vhash, 64 );
sha256_4way_close( &ctx.sha256, vhash );
sha256_4x32_init( &ctx.sha256 );
sha256_4x32_update( &ctx.sha256, vhash, 64 );
sha256_4x32_close( &ctx.sha256, vhash );
dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 );

#endif

@@ -78,7 +78,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
if ( s_ntime != pdata[17] )
|
||||
{
|
||||
uint32_t ntime = swab32(pdata[17]);
|
||||
uint32_t ntime = bswap_32(pdata[17]);
|
||||
x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
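The scanhash_x21s hunk swaps `swab32` for `bswap_32`; both reverse the byte order of a 32-bit word, so this is a portability rename rather than a behavior change. A self-contained illustration, assuming a glibc-style <byteswap.h>:

#include <byteswap.h>
#include <stdint.h>
#include <stdio.h>

int main( void )
{
   uint32_t ntime = bswap_32( 0x11223344u );  /* bytes reversed   */
   printf( "%08x\n", ntime );                 /* prints 44332211  */
   return 0;
}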

@@ -31,20 +31,20 @@

union _sonoa_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -75,9 +75,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

// 1

blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8x64_full( &ctx.blake, vhash, input, 80 );

bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );

#if defined(__VAES__)

@@ -107,15 +107,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

@@ -189,7 +189,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
if ( work_restart[thr_id].restart ) return 0;
// 2

bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );

#if defined(__VAES__)

@@ -219,15 +219,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

@@ -298,14 +298,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );

if ( work_restart[thr_id].restart ) return 0;
// 3

bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );

#if defined(__VAES__)

@@ -335,17 +335,17 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_init( &ctx.skein );
skein512_8x64_update( &ctx.skein, vhash, 64 );
skein512_8x64_close( &ctx.skein, vhash );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

@@ -416,9 +416,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );

dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -438,7 +438,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );

#if defined(__VAES__)

@@ -468,15 +468,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

@@ -547,9 +547,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );

dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -566,15 +566,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );

rintrlv_8x32_8x64( vhashA, vhash, 512 );

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );

#if defined(__VAES__)

@@ -633,13 +633,13 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
if ( work_restart[thr_id].restart ) return 0;
// 5

bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );

rintrlv_8x64_8x32( vhashA, vhash, 512 );

shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhashA, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhashA, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );

#if defined(__VAES__)

@@ -669,15 +669,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

@@ -748,9 +748,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );

dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -767,9 +767,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );

dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -789,7 +789,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );

#if defined(__VAES__)

@@ -819,15 +819,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

@@ -898,9 +898,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );

dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -917,9 +917,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );

dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -936,9 +936,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );

dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -958,7 +958,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );

#if defined(__VAES__)

@@ -988,15 +988,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

@@ -1067,9 +1067,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );

dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -1086,9 +1086,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );

dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -1105,15 +1105,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );

rintrlv_8x64_8x32( vhashA, vhash, 512 );

haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, state );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, state );

return 1;
}
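sonoa_8way_hash shuttles data between 64-bit-lane and 32-bit-lane layouts with the intrlv_*/dintrlv_*/rintrlv_* helpers, because skein, jh and keccak work on 64-bit words while shabal and haval work on 32-bit words. A minimal sketch of what a 4-lane, 64-bit interleave does, assuming word i of lane j lands at v[4*i + j]; the helper below is illustrative, not the project's implementation:

#include <stdint.h>

/* Gather word i of each of four hash states into four consecutive
   words of the vectorized buffer, so one SIMD op touches all lanes. */
static void intrlv_4x64_sketch( uint64_t *v, const uint64_t *h0,
       const uint64_t *h1, const uint64_t *h2, const uint64_t *h3,
       int words )
{
   for ( int i = 0; i < words; i++ )
   {
      v[ 4*i + 0 ] = h0[ i ];
      v[ 4*i + 1 ] = h1[ i ];
      v[ 4*i + 2 ] = h2[ i ];
      v[ 4*i + 3 ] = h3[ i ];
   }
}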

@@ -1122,8 +1122,8 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )

union _sonoa_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo512_2way_context echo;
@@ -1131,19 +1131,19 @@ union _sonoa_4way_context_overlay
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
};

typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
@@ -1161,11 +1161,11 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

// 1

blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4x64_full( &ctx.blake, vhash, input, 80 );

bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );

#if defined(__VAES__)

@@ -1189,15 +1189,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

@@ -1241,9 +1241,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
if ( work_restart[thr_id].restart ) return 0;
// 2

bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );

#if defined(__VAES__)

@@ -1267,15 +1267,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

@@ -1316,16 +1316,16 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );

if ( work_restart[thr_id].restart ) return 0;
// 3

bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );

#if defined(__VAES__)

@@ -1349,15 +1349,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

@@ -1398,9 +1398,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );

dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

@@ -1413,9 +1413,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
// 4
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );

#if defined(__VAES__)

@@ -1439,15 +1439,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

@@ -1488,9 +1488,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );

dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

@@ -1501,15 +1501,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );

rintrlv_4x32_4x64( vhashB, vhash, 512 );

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhashB, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );

#if defined(__VAES__)

@@ -1545,15 +1545,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
// 5
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );

bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );

rintrlv_4x64_4x32( vhashB, vhash, 512 );

shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhashB, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );

#if defined(__VAES__)

@@ -1580,15 +1580,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

@@ -1629,9 +1629,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );

dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

@@ -1642,9 +1642,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );

dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );

@@ -1658,9 +1658,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );

#if defined(__VAES__)

@@ -1684,15 +1684,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

@@ -1733,9 +1733,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );

dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

@@ -1746,9 +1746,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );

dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );

@@ -1759,9 +1759,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );

dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

@@ -1775,9 +1775,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );

#if defined(__VAES__)

@@ -1801,15 +1801,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

@@ -1850,9 +1850,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );

dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

@@ -1863,9 +1863,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );

dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );

@@ -1876,15 +1876,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )

intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );

rintrlv_4x64_4x32( vhashB, vhash, 512 );

haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashB, 64 );
haval256_4x32_close( &ctx.haval, state );

return 1;
}

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -53,11 +49,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -86,11 +78,6 @@ void init_sonoa_ctx()
init_luffa( &sonoa_ctx.luffa, 512 );
cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &sonoa_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &sonoa_ctx.simd );
#else
init_sd( &sonoa_ctx.simd, 512 );
#endif
sph_hamsi512_init( &sonoa_ctx.hamsi );
sph_shabal512_init( &sonoa_ctx.shabal );
sph_whirlpool_init( &sonoa_ctx.whirlpool );
@@ -134,13 +121,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -189,13 +170,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -249,13 +224,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -318,13 +287,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -410,13 +373,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -483,13 +440,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -527,7 +478,6 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_whirlpool_close(&ctx.whirlpool, hash);

if ( work_restart[thr_id].restart ) return 0;
//

sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
@@ -565,13 +515,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
init_echo( &ctx.echo, 512 );

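The sonoa.c hunks above collapse every per-arch `#if defined(__aarch64__)` SIMD-512 block into one `simd512_ctx` call taking a byte length. A hedged sketch of the kind of wrapper this implies; the union members and the bits-vs-bytes conversion are assumptions, not the actual simd-hash-2way.h code:

/* Assumed shape of a unified one-shot SIMD-512 wrapper; len in bytes. */
void simd512_ctx_sketch( simd512_context *ctx, void *out,
                         const void *in, int len )
{
#if defined(__aarch64__)
   sph_simd512_init( &ctx->sph );              /* scalar reference path */
   sph_simd512( &ctx->sph, in, len );
   sph_simd512_close( &ctx->sph, out );
#else
   init_sd( &ctx->sd, 512 );                   /* SSE-accelerated path  */
   update_final_sd( &ctx->sd, (BitSequence*)out,
                    (const BitSequence*)in, len * 8 );  /* bit count    */
#endif
}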

@@ -31,11 +31,11 @@

union _x17_16way_context_overlay
{
blake512_8way_context blake;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_2buf_context cube;
#if defined(__VAES__)
@@ -48,17 +48,17 @@ union _x17_16way_context_overlay
hashState_echo echo;
#endif
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_16way_context shabal;
shabal512_16x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_16way_context haval;
sha512_8x64_context sha512;
haval256_16x32_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_16way_context_overlay x17_16way_context_overlay;

static __thread __m512i x17_16way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64)));

int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
int thr_id )
@@ -85,13 +85,10 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
uint64_t hash15[8] __attribute__ ((aligned (32)));
x17_16way_context_overlay ctx;

memcpy( &ctx.blake, &blake512_8way_ctx, sizeof (blake512_8way_ctx) );
blake512_8way_final_le( &blake512_8way_ctx, vhashA, nonceA,
memcpy( &ctx.blake, &blake512_8x64_ctx, sizeof (blake512_8x64_ctx) );
blake512_8x64_final_le( &blake512_8x64_ctx, vhashA, nonceA,
x17_16way_midstate );
blake512_8way_final_le( &ctx.blake, vhashB, nonceB,
blake512_8x64_final_le( &ctx.blake, vhashB, nonceB,
x17_16way_midstate );

bmw512_8x64_full( &ctx.bmw, vhashA, vhashA, 64 );
@@ -140,22 +137,22 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,

#endif

skein512_8way_full( &ctx.skein, vhashA, vhashA, 64 );
skein512_8way_full( &ctx.skein, vhashB, vhashB, 64 );
skein512_8x64_full( &ctx.skein, vhashA, vhashA, 64 );
skein512_8x64_full( &ctx.skein, vhashB, vhashB, 64 );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhashA, 64 );
jh512_8way_close( &ctx.jh, vhashA );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhashB, 64 );
jh512_8way_close( &ctx.jh, vhashB );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhashA, 64 );
jh512_8x64_close( &ctx.jh, vhashA );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhashB, 64 );
jh512_8x64_close( &ctx.jh, vhashB );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhashA, 64 );
keccak512_8way_close( &ctx.keccak, vhashA );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhashB, 64 );
keccak512_8way_close( &ctx.keccak, vhashB );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhashA, 64 );
keccak512_8x64_close( &ctx.keccak, vhashA );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhashB, 64 );
keccak512_8x64_close( &ctx.keccak, vhashB );

//
rintrlv_8x64_4x128( vhashC, vhashD, vhashA, 512 );
@@ -310,18 +307,17 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
*/


hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8way_close( &ctx.hamsi, vhashA );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhashA );
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07, vhashA );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashB, 64 );
hamsi512_8way_close( &ctx.hamsi, vhashB );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhashB, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhashB );
dintrlv_8x64_512( hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15, vhashB );


fugue512_full( &ctx.fugue, hash00, hash00, 64 );
fugue512_full( &ctx.fugue, hash01, hash01, 64 );
fugue512_full( &ctx.fugue, hash02, hash02, 64 );
@@ -344,9 +340,9 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );

shabal512_16way_init( &ctx.shabal );
shabal512_16way_update( &ctx.shabal, vhashA, 64 );
shabal512_16way_close( &ctx.shabal, vhashA );
shabal512_16x32_init( &ctx.shabal );
shabal512_16x32_update( &ctx.shabal, vhashA, 64 );
shabal512_16x32_close( &ctx.shabal, vhashA );

dintrlv_16x32_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07,
@@ -375,12 +371,12 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );

sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhashA, 64 );
sha512_8way_close( &ctx.sha512, vhashA );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhashB, 64 );
sha512_8way_close( &ctx.sha512, vhashB );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhashA, 64 );
sha512_8x64_close( &ctx.sha512, vhashA );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhashB, 64 );
sha512_8x64_close( &ctx.sha512, vhashB );

dintrlv_8x64_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07, vhashA );
@@ -391,9 +387,9 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );

haval256_5_16way_init( &ctx.haval );
haval256_5_16way_update( &ctx.haval, vhashA, 64 );
haval256_5_16way_close( &ctx.haval, state );
haval256_16x32_init( &ctx.haval );
haval256_16x32_update( &ctx.haval, vhashA, 64 );
haval256_16x32_close( &ctx.haval, state );

return 1;
}
@@ -425,7 +421,7 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );

mm512_intrlv80_8x64( vdata, edata );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
blake512_8x64_prehash_le( &blake512_8x64_ctx, x17_16way_midstate, vdata );

nonceA = _mm512_add_epi32( casti_m512i( vdata, 9 ),
_mm512_set_epi64( 7, 6, 5, 4, 3, 2, 1, 0 ) );
@@ -456,11 +452,11 @@

union _x17_8way_context_overlay
{
blake512_8way_context blake;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_2buf_context cube;
#if defined(__VAES__)
@@ -473,17 +469,17 @@ union _x17_8way_context_overlay
hashState_echo echo;
#endif
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_8way_context_overlay x17_8way_context_overlay;

static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64)));

int x17_8x64_hash( void *state, const void *input, int thr_id )
{
@@ -500,7 +496,7 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
uint64_t hash7[8] __attribute__ ((aligned (32)));
x17_8way_context_overlay ctx;

blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
blake512_8x64_final_le( &blake512_8x64_ctx, vhash, casti_m512i( input, 9 ),
x17_8way_midstate );

bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
@@ -533,15 +529,15 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )

#endif

skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

@@ -611,9 +607,9 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );

@@ -629,9 +625,9 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );

dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -648,15 +644,15 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );

sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );

rintrlv_8x64_8x32( vhashA, vhash, 512 );

haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, state );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, state );

return 1;
}
@@ -690,7 +686,7 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
blake512_8x64_prehash_le( &blake512_8x64_ctx, x17_8way_midstate, vdata );

do
{
@@ -717,7 +713,7 @@

union _x17_4way_context_overlay
{
blake512_4way_context blake;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
@@ -726,24 +722,24 @@ union _x17_4way_context_overlay
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
};
typedef union _x17_4way_context_overlay x17_4way_context_overlay;

static __thread __m256i x17_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
static __thread blake512_4x64_context blake512_4x64_ctx __attribute__((aligned(64)));

int x17_4x64_hash( void *state, const void *input, int thr_id )
{
@@ -756,11 +752,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
uint64_t hash3[8] __attribute__ ((aligned (32)));
x17_4way_context_overlay ctx;

blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
blake512_4x64_final_le( &blake512_4x64_ctx, vhash, casti_m256i( input, 9 ),
x17_4way_midstate );

// blake512_4way_full( &ctx.blake, vhash, input, 80 );

bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
@@ -789,13 +783,13 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )

skein512_4way_full( &ctx.skein, vhash, vhash, 64 );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

@@ -836,9 +830,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )

#endif

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );

dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

@@ -849,9 +843,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )

intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );

dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );

@@ -862,15 +856,15 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )

intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );

rintrlv_4x64_4x32( vhashB, vhash, 512 );

haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashB, 64 );
haval256_4x32_close( &ctx.haval, state );

return 1;
}
@@ -903,7 +897,7 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,

mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x17_4way_midstate, vdata );
blake512_4x64_prehash_le( &blake512_4x64_ctx, x17_4way_midstate, vdata );

do
{
|
||||
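In the scanhash fragments above, *noncev holds one 32-bit nonce in the low half of each 64-bit lane, and the _mm256_set_epi32 / _mm512_set_epi32 constants (argument order is highest element first) add the lane index to that low half so every lane scans a distinct nonce. A self-contained AVX2 demonstration of the 4-lane case (compile with -mavx2; the 8-lane AVX-512 form is the same idea with _mm512 intrinsics):

   #include <immintrin.h>
   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      uint32_t n = 1000;                           // base nonce
      __m256i noncev = _mm256_set1_epi64x( n );    // nonce in low half of each lane
      noncev = _mm256_add_epi32( noncev, _mm256_set_epi32( 0,3, 0,2, 0,1, 0,0 ) );
      uint64_t lane[4];
      _mm256_storeu_si256( (__m256i*)lane, noncev );
      for ( int i = 0; i < 4; i++ )                // prints 1000, 1001, 1002, 1003
         printf( "lane %d nonce %u\n", i, (uint32_t)lane[i] );
      return 0;
   }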
@@ -6,10 +6,8 @@

#if defined(SIMD512)
#define X17_8WAY 1
// #define X17_16X32 1
#elif defined(__AVX2__) && defined(__AES__)
#define X17_4WAY 1
#define X17_8X32 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X17_2X64 1
#endif

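These gate macros select the widest implementation the build target supports: AVX-512 (SIMD512) gets the 8-lane path, AVX2 with AES-NI the 4-lane path, and plain SSE2 or NEON the 2x64 path. A sketch of how such a gate typically feeds dispatch; the macro wiring below is assumed for illustration, not taken from the source:

   // Hypothetical dispatch built on the gate macros above.
   #if defined(X17_8WAY)
     #define X17_HASH_FN   x17_8x64_hash     // AVX-512: 8 lanes of 64-bit words
   #elif defined(X17_4WAY)
     #define X17_HASH_FN   x17_4x64_hash     // AVX2+AES: 4 lanes
   #elif defined(X17_2X64)
     #define X17_HASH_FN   x17_2x64_hash     // SSE2/NEON: 2 lanes (assumed name)
   #else
     #define X17_HASH_FN   x17_hash          // scalar reference path
   #endif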
@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/fugue/fugue-aesni.h"
@@ -34,7 +30,7 @@
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
//#include "algo/cubehash/sph_cubehash.h"
#include "algo/luffa/sph_luffa.h"

@@ -63,17 +59,9 @@ union _x17_context_overlay
#else
hashState_luffa luffa;
#endif
//#if defined(__aarch64__)
// sph_cubehash512_context cube;
//#else
cubehashParam cube;
//#endif
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -127,26 +115,13 @@ int x17_hash(void *output, const void *input, int thr_id )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
#endif

//#if defined(__aarch64__)
// sph_cubehash512_init(&ctx.cube);
// sph_cubehash512(&ctx.cube, (const void*) hash, 64);
// sph_cubehash512_close(&ctx.cube, hash);
//#else
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
//#endif

sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,

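The hunks above collapse the per-architecture SIMD-hash split (sph_simd512 on aarch64, nist.h's hashState_sd elsewhere) into a single simd512_context / simd512_ctx entry point. A compile-only sketch of the pattern with hypothetical names; the real dispatch lives behind simd-hash-2way.h:

   #include <stddef.h>
   #include <stdint.h>

   typedef struct { uint8_t opaque[256]; } simd512_context_sketch;

   // One call site, one type; the #if lives inside the wrapper instead of
   // being repeated at every caller.
   static void simd512_ctx_sketch( simd512_context_sketch *ctx, void *out,
                                   const void *in, size_t len )
   {
   #if defined(__aarch64__)
      // reference / NEON implementation would run here
      (void)ctx; (void)out; (void)in; (void)len;
   #else
      // SSE/AVX implementation would run here
      (void)ctx; (void)out; (void)in; (void)len;
   #endif
   }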
@@ -31,20 +31,20 @@

union _xevan_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -73,10 +73,10 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
const int dataLen = 128;
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));

blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8x64_full( &ctx.blake, vhash, input, 80 );
memset( &vhash[8<<3], 0, 64<<3 );

bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, dataLen );

#if defined(__VAES__)

@@ -106,15 +106,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )

#endif

skein512_8way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_8x64_full( &ctx.skein, vhash, vhash, dataLen );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, dataLen );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, dataLen );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );

@@ -185,9 +185,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )

#endif

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8x64_close( &ctx.hamsi, vhash );

dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -204,9 +204,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );

shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, dataLen );
shabal512_8x32_close( &ctx.shabal, vhash );

dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -223,23 +223,23 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );

sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, dataLen );
sha512_8x64_close( &ctx.sha512, vhash );

rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );

haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, vhashA );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, dataLen );
haval256_8x32_close( &ctx.haval, vhashA );

rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 );

memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );

blake512_8way_full( &ctx.blake, vhash, vhash, dataLen );
blake512_8x64_full( &ctx.blake, vhash, vhash, dataLen );

bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, dataLen );

#if defined(__VAES__)

@@ -269,15 +269,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )

#endif

skein512_8way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_8x64_full( &ctx.skein, vhash, vhash, dataLen );

jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, dataLen );
jh512_8x64_close( &ctx.jh, vhash );

keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, dataLen );
keccak512_8x64_close( &ctx.keccak, vhash );

rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );

@@ -348,9 +348,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )

#endif

hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8x64_close( &ctx.hamsi, vhash );

dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -367,9 +367,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );

shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, dataLen );
shabal512_8x32_close( &ctx.shabal, vhash );

dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -386,15 +386,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );

sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, dataLen );
sha512_8x64_close( &ctx.sha512, vhash );

rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );

haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, output );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, dataLen );
haval256_8x32_close( &ctx.haval, output );

return 1;
}
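Xevan hashes the 17-stage chain twice with a fixed 128-byte block (dataLen = 128): pass one zero-extends the 64-byte Blake digest of the 80-byte header, and pass two zero-extends Haval's 32-byte digest, which is what the two memset calls above do in interleaved form. A small compile-and-run check of that index arithmetic for the 8-lane layout (assuming vhash is an array of uint64_t):

   #include <assert.h>
   #include <stdint.h>

   int main(void)
   {
      const int lanes = 8;
      // memset( &vhash[8<<3], 0, 64<<3 ): the 64-byte digest fills the first
      // 8 uint64 words of each lane, i.e. 8*8 = 64 interleaved words ...
      assert( (8<<3) == 8 * lanes );
      // ... and the remaining 64 bytes per lane are cleared across all lanes.
      assert( (64<<3) == 64 * lanes );
      // memset( &vhash[4<<3], 0, (128-32)<<3 ): after haval's 32-byte digest
      // (4 words per lane) the other 96 bytes per lane are cleared.
      assert( (4<<3) == 4 * lanes && ((128-32)<<3) == 96 * lanes );
      return 0;
   }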
@@ -403,8 +403,8 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )

union _xevan_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo_2way_context echo;
@@ -412,19 +412,19 @@ union _xevan_4way_context_overlay
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
};
typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;

@@ -440,12 +440,12 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
const int dataLen = 128;
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));

blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4x64_full( &ctx.blake, vhash, input, 80 );
memset( &vhash[8<<2], 0, 64<<2 );

bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, dataLen );
bmw512_4x64_close( &ctx.bmw, vhash );

#if defined(__VAES__)

@@ -469,15 +469,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )

#endif

skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_4x64_full( &ctx.skein, vhash, vhash, dataLen );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, dataLen );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, dataLen );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );

@@ -518,9 +518,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )

#endif

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4x64_close( &ctx.hamsi, vhash );

dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

@@ -532,9 +532,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
// Parallel 4way 32 bit
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, dataLen );
shabal512_4x32_close( &ctx.shabal, vhash );

dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

@@ -546,27 +546,27 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )

intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, dataLen );
sha512_4x64_close( &ctx.sha512, vhash );

rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );

haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, vhashA );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashA, dataLen );
haval256_4x32_close( &ctx.haval, vhashA );

rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 );

memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );

blake512_4way_init( &ctx.blake );
blake512_4way_update( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
blake512_4x64_init( &ctx.blake );
blake512_4x64_update( &ctx.blake, vhash, dataLen );
blake512_4x64_close(&ctx.blake, vhash);

bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, dataLen );
bmw512_4x64_close( &ctx.bmw, vhash );

#if defined(__VAES__)

@@ -590,15 +590,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )

#endif

skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_4x64_full( &ctx.skein, vhash, vhash, dataLen );

jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, dataLen );
jh512_4x64_close( &ctx.jh, vhash );

keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, dataLen );
keccak512_4x64_close( &ctx.keccak, vhash );

rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );

@@ -639,9 +639,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )

#endif

hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4x64_close( &ctx.hamsi, vhash );

dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

@@ -652,9 +652,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )

intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, dataLen );
shabal512_4x32_close( &ctx.shabal, vhash );

dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

@@ -665,15 +665,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )

intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, dataLen );
sha512_4x64_close( &ctx.sha512, vhash );

rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );

haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, output );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashA, dataLen );
haval256_4x32_close( &ctx.haval, output );

return 1;
}

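Note the rintrlv_4x64_2x128 calls above: luffa, cube, shavite and simd are 128-bit-word algorithms run two lanes at a time, so the four 64-bit-interleaved lanes are split into two 2-lane buffers interleaved at 128-bit granularity (vhashA for lanes 0-1, vhashB for lanes 2-3). A plain-C sketch of what that conversion has to do; illustrative only, the real code uses SIMD shuffles:

   #include <stdint.h>

   static void rintrlv_4x64_2x128_sketch( uint64_t *dstA, uint64_t *dstB,
                                          const uint64_t *src, int bit_len )
   {
      for ( int i = 0; i < bit_len/128; i++ )     // per 128-bit word per lane
         for ( int l = 0; l < 2; l++ )            // two lanes per destination
         {
            // 128-bit word i of lane l is the 64-bit word pair (2i, 2i+1);
            // in 4x64 layout, 64-bit word w of lane l sits at src[w*4 + l].
            dstA[ i*4 + l*2 + 0 ] = src[ (2*i)  *4 + l     ];
            dstA[ i*4 + l*2 + 1 ] = src[ (2*i+1)*4 + l     ];
            dstB[ i*4 + l*2 + 0 ] = src[ (2*i)  *4 + l + 2 ];
            dstB[ i*4 + l*2 + 1 ] = src[ (2*i+1)*4 + l + 2 ];
         }
   }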
@@ -17,11 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
@@ -45,11 +41,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -78,11 +70,6 @@ void init_xevan_ctx()
init_luffa( &xevan_ctx.luffa, 512 );
cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &xevan_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &xevan_ctx.simd );
#else
init_sd( &xevan_ctx.simd, 512 );
#endif
sph_hamsi512_init( &xevan_ctx.hamsi );
sph_shabal512_init( &xevan_ctx.shabal );
sph_whirlpool_init( &xevan_ctx.whirlpool );
@@ -137,13 +124,7 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512( &ctx.simd, (const void*) hash, dataLen );
sph_simd512_close( &ctx.simd, hash );
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );

#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
@@ -210,13 +191,14 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
//#if defined(__aarch64__)
// sph_simd512(&ctx.simd, (const void*) hash, 64);
// sph_simd512_close(&ctx.simd, hash);
//#else
// update_final_sd( &ctx.simd, (BitSequence *)hash,
// (const BitSequence *)hash, dataLen*8 );
//#endif

#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
Some files were not shown because too many files have changed in this diff.