v25.6

v25.5
v25.4
2025-09-17 23:44:27 +00:00 · 2025-07-20 19:43:10 -04:00 · 2025-07-09 01:32:38 -04:00 · 2025-06-20 20:31:41 -04:00 · 2025-01-16 12:31:53 -05:00 · 2025-01-12 18:58:21 -05:00
253 changed files with 10654 additions and 12912 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,27 +1,49 @@

-if WANT_JANSSON
-JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
+if HAVE_APPLE
+# On macOS the needed packages come from Homebrew, whose include and library
+# directories are not searched by the jansson test in configure. Ignore the
+# failed test and add them here: /opt/homebrew on ARM64, /usr/local otherwise.
+
+if ARCH_ARM64
+  EXTRA_INCLUDES = -I/opt/homebrew/include
+  EXTRA_LIBS     = -L/opt/homebrew/lib
 else
-JANSSON_INCLUDES=
+  EXTRA_INCLUDES = -I/usr/local/include
+  EXTRA_LIBS     = -L/usr/local/lib
 endif

-EXTRA_DIST	= example-cfg.json nomacro.pl
+else

-SUBDIRS		= compat
+if WANT_JANSSON
+# No system jansson: use the bundled copy (headers in srcdir, library built in builddir).
+  EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
+  EXTRA_LIBS     = -L$(top_builddir)/compat/jansson
+else
+  EXTRA_INCLUDES =
+  EXTRA_LIBS     =
+endif

-ALL_INCLUDES	= @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
+endif

-bin_PROGRAMS	= cpuminer
+EXTRA_DIST = example-cfg.json nomacro.pl

-dist_man_MANS	= cpuminer.1
+SUBDIRS = compat
+
+ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
+
+bin_PROGRAMS = cpuminer
+
+dist_man_MANS = cpuminer.1

 cpuminer_SOURCES = \
+  dummy.cpp \
  cpu-miner.c \
  util.c \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
  malloc-huge.c \
+  simd-utils/simd-constants.c \
  algo/argon2d/argon2d-gate.c \
  algo/argon2d/blake2/blake2b.c \
  algo/argon2d/argon2d/argon2.c \
@@ -113,7 +135,6 @@ cpuminer_SOURCES = \
  algo/lyra2/phi2-4way.c \
  algo/lyra2/phi2.c \
  algo/m7m/m7m.c \
-  algo/m7m/magimath.cpp \
  algo/nist5/nist5-gate.c \
  algo/nist5/nist5-4way.c \
  algo/nist5/nist5.c \
@@ -166,9 +187,6 @@ cpuminer_SOURCES = \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite-hash-2way.c \
  algo/shavite/shavite-hash-4way.c \
-  algo/shavite/shavite.c \
-  algo/simd/nist.c \
-  algo/simd/vector.c \
  algo/simd/sph_simd.c \
  algo/simd/simd-hash-2way.c \
  algo/skein/sph_skein.c \
@@ -250,6 +268,7 @@ cpuminer_SOURCES = \
  algo/x16/x16rt.c \
  algo/x16/x16rt-4way.c \
  algo/x16/hex.c \
+  algo/x16/x20r.c \
  algo/x16/x21s-4way.c \
  algo/x16/x21s.c \
  algo/x16/minotaur.c \
@@ -274,29 +293,29 @@ cpuminer_SOURCES = \
  algo/yespower/yespower-opt.c \
  algo/yespower/yespower-ref.c \
  algo/yespower/yespower-blake2b-ref.c
-  
-disable_flags =
-
-if USE_ASM
-   cpuminer_SOURCES += asm/neoscrypt_asm.S
-else
-   disable_flags += -DNOASM
-endif

 if HAVE_WINDOWS
   cpuminer_SOURCES += compat/winansi.c
 endif

-cpuminer_LDFLAGS	= @LDFLAGS@
-cpuminer_LDADD	= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@  -lgmp
-cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
-cpuminer_CFLAGS   = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
+if USE_ASM
+   disable_flags =
+   cpuminer_SOURCES += asm/neoscrypt_asm.S
+else
+   disable_flags = -DNOASM
+endif

-if HAVE_WINDOWS
-cpuminer_CFLAGS += -Wl,--stack,10485760
+cpuminer_LDFLAGS = @LDFLAGS@
+cpuminer_LDADD	= $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
+cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
+cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
+
+if ARCH_ARM64
+   cpuminer_CFLAGS += -flax-vector-conversions
 endif

 if HAVE_WINDOWS
+
 # use to profile an object
 # gprof_cflags = -pg -g3
 # cpuminer_LDFLAGS += -pg
@@ -310,5 +329,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
 	@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
 	$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<

-
 endif
--- a/README.md
+++ b/README.md
@@ -36,44 +36,28 @@ for compile instructions.
 Requirements
 ------------

-1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
-Intel Core2 and newer and AMD equivalents. Further optimizations are available
-on some algoritms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
-
-32 bit CPUs are not supported.
-Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
-are not supported.
+1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
+x86_64 requires SSE2, aarch64 requires armv8 & NEON.

 Mobile CPUs like laptop computers are not recommended because they aren't
 designed for extreme heat of operating at full load for extended periods of
 time.

-Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
-
-2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
-including Mint and Centos, are known to work and have all dependencies
-in their repositories. Others may work but may require more effort. Older
-versions such as Centos 6 don't work due to missing features. 
-
-Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
-binaries. WindowsXP 64 bit is YMMV.
-
-FreeBSD is not actively tested but should work, YMMV.
-MacOS, OSx and Android are not supported.
+2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
+Android, iOS and alternative OSs like Haiku & ReactOS are not supported.

 3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
-RPC getwork using http:// or https://.
-GBT is YMMV.
+RPC getblocktemplate using http:// or https://.

 Supported Algorithms
 --------------------

                          allium        Garlicoin
                          anime         Animecoin
-                          argon2        Argon2 coin (AR2)
-                          argon2d250    argon2d-crds, Credits (CRDS)
-                          argon2d500    argon2d-dyn,  Dynamic (DYN)
-                          argon2d4096   argon2d-uis, Unitus, (UIS)
+                          argon2d250    
+                          argon2d500
+                          argon2d1000
+                          argon2d4096
                          blake         Blake-256
                          blake2b       Blake2-512
                          blake2s       Blake2-256
@@ -87,7 +71,6 @@ Supported Algorithms
                          groestl       Groestl coin
                          hex           x16r-hex
                          hmq1725       
-                          hodl          Hodlcoin
                          jha           Jackpotcoin
                          keccak        Maxcoin
                          keccakc       Creative coin
@@ -115,9 +98,11 @@ Supported Algorithms
                          scrypt:N      scrypt(N, 1, 1)
                          scryptn2      scrypt(1048576, 1, 1)
                          sha256d       Double SHA-256
+                          sha256dt
                          sha256q       Quad SHA-256
                          sha256t       Triple SHA-256
                          sha3d         Double keccak256 (BSHA3)
+                          sha512256d
                          skein         Skein+Sha (Skeincoin)
                          skein2        Double Skein (Woodcoin)
                          skunk         Signatum (SIGT)
@@ -145,6 +130,7 @@ Supported Algorithms
                          x16rt-veil    veil
                          x16s          
                          x17
+                          x20r
                          x21s
                          x22i
                          x25x
--- a/146
+++ b/146
@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
 Requirements
 ------------

-Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
-supported.
+- An x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
+- Arm CPU supporting AArch64 and NEON.

-64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
-are not supported. FreeBSD YMMV.
+32 bit CPUs are not supported.

-ARM requirements (Beta):
+Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.

-CPU: Armv8 and NEON, SHA2 & AES are optional
-OS: Linux distribution built for AArch64.
-Packages: source code only.
+Mining on mobile devices that meet the requirements is not recommended due to the risk of
+overheating and damaging the battery. Mining has unlimited demand, it will push any device
+to or beyond its limits. There is also a fire risk with overheated lithium batteries.
+
+Beware of apps claiming "mobile only mining". There is no such thing; they aren't miners.
+If a mobile CPU can mine it, any CPU can.

 See wiki for details.

@@ -73,12 +75,136 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v25.6
+
+Added argon2d1000, argon2d16000 algos.
+Target specific AES optimizations improve shavite for ARM64 & x86_64.
+
+v25.5
+
+x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
+x86_64: Faster sha256d, sha256dt for AVX2 & AVX512.
+Other small bug fixes.
+
+v25.4
+
+x86_64: improved handling of vector constants used for byte permutations.
+x86_64: removed hooks for cancelled AVX10-256.
+Minor bug fixes & improvements.
+More code cleanup.
+
+v25.3
+
+#442, #443: Fixed a regression in Makefile.am.
+Removed algo features log display.
+Some code cleanup.
+
+v25.2
+
+ARM: Fixed regression from v25.1 that could cause build fail.
+BSD: FreeBSD is now supported. Other BSDs may also work.
+MacOS: build with installed jansson library instead of compiling the included source code.
+Windows: remove "_WIN32_WINNT=0x0601" which was a downgrade on Win11.
+Changed build.sh shell from bash to sh.
+
+v25.1
+
+MacOS ARM64: m7m algo is now working.
+MacOS ARM64: can now be compiled with GCC.
+MacOS x86_64: is now working compiled with GCC.
+Fixed some minor bugs & removed some obsolete code.
+
+v24.8
+
+ARM: Apple MacOS on M series CPU is now supported compiled from source
+code, see Wiki for details.
+ARM: Fix incorrect compiler version display when using clang. 
+build.sh can now be used to compile all targets, arm_build.sh & build_msys2.sh
+have been removed.
+Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
+continue to be compiled with CPU groups disabled.
+
+v24.7
+
+ARM: compile works for Windows using MSys2 & MingW, see wiki for details.
+
+v24.6
+
+ARM: Fixed scryptn2, x16*, broken in v24.2. 
+ARM: Small improvement to interleaving.
+Eliminated some potential compile errors in code that was dependent on 
+compiler optimisations.
+x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
+
+v24.5
+
+Fix MinGW compile error after MSys2 upgrade to GCC-14.2. 
+#427: GBT: Improved handling of new work.
+Removed shavite3 algo.
+
+v24.4
+
+x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
+x86_64: fixed a bug in alignr macros for SSE2.
+ARM: CPU feature reporting enhancements.
+Some code cleanup.
+
+v24.3
+
+ARM: CPU feature detection and reporting is now working.
+ARM: Verthash is now working.
+ARM: Small speedup for yescrypt, yespower & argon2d.
+Code cleanup.
+
+v24.2
+
+x86_64: Fixed blake2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
+x86_64: Initial support for CPUs with AVX10, needs GCC-14.
+ARM NEON: Various code optimisations.
+
+v24.1
+
+#414: fix bug in merkle error handling.
+#416: change $nproc to $(nproc) in build scripts.
+#420: change some inline function definitions to static inline. 
+#413: Fix formatting error for share result log when using no-color.
+Faster 2 way interleaving.
+Cleanup sha256 architecture targeting.
+
+v23.15
+
+Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
+ARM: Fugue AES optimizations enabled.
+ARM: quark, qubit, x11gost algos optimized with NEON & AES.
+
+v23.14
+
+ARM: Groestl AES optimizations enabled.
+All: Small optimization to Shabal 4way.
+x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
+All: deleted some unused files.
+
+v23.13
+
+Added x20r algo.
+Eliminated redundant hash order calculations for x16r family.
+
+v23.12
+
+Several bug fixes and speed improvements for x16r family for all CPU architectures.
+
+v23.11
+
+This is a release candidate for full AArch64 support, marking the end of the Beta phase.
+Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
+Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
+
 v23.10

 x86_64: Fixed scrypt, scryptn2 algos SSE2. 
-Fixed sha512d256d algo AVX2, SSE2, NEON.
+Fixed sha512256d algo AVX2, SSE2, NEON.
 Fixed a bug in Skein N-way that reduced performance.
-ARM: Skein algo optimized for NEON & SHA2.
+ARM: Skein optimized for NEON, SHA2 & SSE2.
 Skein2 algo 2-way optimized for NEON & SSE2.

 v23.9
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,

 #endif

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 //int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
 //                      uint64_t *hashes_done, struct thr_info *mythr )
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
-   gate->resync_threads          = (void*)&do_nothing;
-   gate->do_this_thread          = (void*)&return_true;
+//   gate->resync_threads          = (void*)&do_nothing;
+//   gate->do_this_thread          = (void*)&return_true;
   gate->longpoll_rpc_call       = (void*)&std_longpoll_rpc_call;
   gate->get_work_data_size      = (void*)&std_get_work_data_size;
   gate->optimizations           = EMPTY_SET;
@@ -295,8 +295,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
  {
    case ALGO_ALLIUM:       rc = register_allium_algo        ( gate ); break;
    case ALGO_ANIME:        rc = register_anime_algo         ( gate ); break;
-    case ALGO_ARGON2D250:   rc = register_argon2d_crds_algo  ( gate ); break;
-    case ALGO_ARGON2D500:   rc = register_argon2d_dyn_algo   ( gate ); break;
+    case ALGO_ARGON2D250:   rc = register_argon2d250_algo    ( gate ); break;
+    case ALGO_ARGON2D500:   rc = register_argon2d500_algo    ( gate ); break;
+    case ALGO_ARGON2D1000:  rc = register_argon2d1000_algo   ( gate ); break;
+    case ALGO_ARGON2D16000: rc = register_argon2d16000_algo  ( gate ); break;
    case ALGO_ARGON2D4096:  rc = register_argon2d4096_algo   ( gate ); break;
    case ALGO_AXIOM:        rc = register_axiom_algo         ( gate ); break;
    case ALGO_BLAKE:        rc = register_blake_algo         ( gate ); break;
@@ -340,7 +342,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_SHA256T:      rc = register_sha256t_algo       ( gate ); break;
    case ALGO_SHA3D:        rc = register_sha3d_algo         ( gate ); break;
    case ALGO_SHA512256D:   rc = register_sha512256d_algo    ( gate ); break;
-    case ALGO_SHAVITE3:     rc = register_shavite_algo       ( gate ); break;
    case ALGO_SKEIN:        rc = register_skein_algo         ( gate ); break;
    case ALGO_SKEIN2:       rc = register_skein2_algo        ( gate ); break;
    case ALGO_SKUNK:        rc = register_skunk_algo         ( gate ); break;
@@ -368,6 +369,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_X16RT_VEIL:   rc = register_x16rt_veil_algo    ( gate ); break;
    case ALGO_X16S:         rc = register_x16s_algo          ( gate ); break;
    case ALGO_X17:          rc = register_x17_algo           ( gate ); break;
+    case ALGO_X20R:         rc = register_x20r_algo          ( gate ); break;
    case ALGO_X21S:         rc = register_x21s_algo          ( gate ); break;
    case ALGO_X22I:         rc = register_x22i_algo          ( gate ); break;
    case ALGO_X25X:         rc = register_x25x_algo          ( gate ); break;
@@ -416,8 +418,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
 const char* const algo_alias_map[][2] =
 {
 //   alias                proper
-  { "argon2d-dyn",       "argon2d500"     },
-  { "argon2d-uis",       "argon2d4096"    },
  { "bcd",               "x13bcd"         },
  { "bitcore",           "timetravel10"   },
  { "bitzeny",           "yescryptr8"     },
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -98,25 +98,27 @@ typedef  uint32_t set_t;
 #define AVX512_OPT       1 <<  6   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
 #define AES_OPT          1 <<  7   // Intel Westmere, AArch64
 #define VAES_OPT         1 <<  8   // Icelake, Zen3
-#define SHA_OPT          1 <<  9   // Zen1, Icelake, AArch64 
-#define SHA512_OPT       1 << 10   // AArch64 
+#define SHA256_OPT       1 <<  9   // Zen1, Icelake, AArch64 
+#define SHA512_OPT       1 << 10   // Intel Arrow Lake, AArch64 
 #define NEON_OPT         1 << 11   // AArch64 
+#define AVX10_256        1 << 12
+#define AVX10_512        1 << 13

 // AVX10 does not have explicit algo features:
 //  AVX10_512 is compatible with AVX512 + VAES
 //  AVX10_256 is compatible with AVX2 + VAES

 // return set containing all elements from sets a & b
-inline set_t set_union ( set_t a, set_t b ) { return a | b; }
+static inline set_t set_union ( set_t a, set_t b ) { return a | b; }

 // return set contained common elements from sets a & b
-inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
+static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }

 // all elements in set a are included in set b
-inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
+static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }

 // no elements in set a are included in set b
-inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
+static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }

 typedef struct
 {
@@ -163,15 +165,18 @@ char* ( *malloc_txs_request )   ( struct work* );
 void ( *set_work_data_endian )  ( struct work* );

 // Diverge mining threads
-bool ( *do_this_thread )        ( int );
+//bool ( *do_this_thread )        ( int );

 // After do_this_thread
-void ( *resync_threads )        ( int, struct work* );
+//void ( *resync_threads )        ( int, struct work* );

 json_t* ( *longpoll_rpc_call )  ( CURL*, int*, char* );

+// Deprecated
 set_t optimizations;
+
 int  ( *get_work_data_size )     ();
+
 int  ntime_index;
 int  nbits_index;
 int  nonce_index;            // use with caution, see warning below
@@ -246,7 +251,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,

 #endif

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 //int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
 //                      uint64_t *hashes_done, struct thr_info *mythr );
@@ -272,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,

 void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
 void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
-// OpenSSL sha256 deprecated
-//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );

 bool std_le_work_decode( struct work *work );
 bool std_be_work_decode( struct work *work );
--- a/algo/argon2d/argon2d-gate.c
+++ b/algo/argon2d/argon2d-gate.c
@@ -6,9 +6,39 @@ static const size_t INPUT_BYTES = 80;  // Lenth of a block header in bytes. Inpu
 static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
 static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS

-// Credits
+// Generic argon2d nonce scan: works with most argon2d variants, hashing through algo_gate.hash.
+int scanhash_argon2d( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(64) edata[20];   // working copy of the 20 word (80 byte) work data
+   uint32_t _ALIGN(64) hash[8];     // 8 words = 256 bit hash result
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const int thr_id = mythr->id;
+   const uint32_t first_nonce = (const uint32_t)pdata[19];   // scan starts at the nonce found in word 19
+   const uint32_t last_nonce = (const uint32_t)max_nonce;
+   uint32_t nonce = first_nonce;
+   const bool bench = opt_benchmark;

-void argon2d_crds_hash( void *output, const void *input )
+   v128_bswap32_80( edata, pdata );   // presumably byte-swaps all 20 words of pdata into edata -- TODO confirm
+   do
+   {
+      edata[19] = nonce;   // candidate nonce goes in word 19
+      algo_gate.hash( hash, edata, thr_id );   // hash function installed by the variant's register function
+      if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )   // nothing is submitted in benchmark mode
+      {
+          pdata[19] = bswap_32( nonce );   // work data takes the nonce byte-swapped relative to edata
+          submit_solution( work, hash, mythr );
+      }
+      nonce++;
+  } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );   // stop at range end or on restart
+
+   pdata[19] = nonce;   // first nonce not yet tested
+   *hashes_done = pdata[19] - first_nonce;   // number of nonces hashed in this call
+   return 0;
+}
+
+void argon2d250_hash( void *output, const void *input )
 {
 	argon2_context context;
 	context.out = (uint8_t *)output;
@@ -34,48 +64,15 @@ void argon2d_crds_hash( void *output, const void *input )
 	argon2_ctx( &context, Argon2_d );
 }

-int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done, struct thr_info *mythr )
+bool register_argon2d250_algo( algo_gate_t* gate )
 {
-   uint32_t _ALIGN(64) edata[20];
-   uint32_t _ALIGN(64) hash[8];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
-   uint32_t nonce = first_nonce;
-
-   swab32_array( edata, pdata, 20 );
-
-   do {
-      be32enc(&edata[19], nonce);
-      argon2d_crds_hash( hash, edata );
-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
-      {
-          pdata[19] = nonce;
-          submit_solution( work, hash, mythr );
-      }
-      nonce++;
-   } while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
-   return 0;
-}
-
-bool register_argon2d_crds_algo( algo_gate_t* gate )
-{
-        gate->scanhash = (void*)&scanhash_argon2d_crds;
-        gate->hash = (void*)&argon2d_crds_hash;
-        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
+        gate->scanhash = (void*)&scanhash_argon2d;
+        gate->hash = (void*)&argon2d250_hash;
        opt_target_factor = 65536.0;
        return true;
 }

-// Dynamic
-
-void argon2d_dyn_hash( void *output, const void *input )
+void argon2d500_hash( void *output, const void *input )
 {
    argon2_context context;
    context.out = (uint8_t *)output;
@@ -101,48 +98,81 @@ void argon2d_dyn_hash( void *output, const void *input )
    argon2_ctx( &context, Argon2_d );
 }

-int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done, struct thr_info *mythr )
+bool register_argon2d500_algo( algo_gate_t* gate )
 {
-   uint32_t _ALIGN(64) edata[20];
-   uint32_t _ALIGN(64) hash[8];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const int thr_id = mythr->id; 
-   const uint32_t first_nonce = (const uint32_t)pdata[19];
-   const uint32_t last_nonce = (const uint32_t)max_nonce;
-   uint32_t nonce = first_nonce;
-   const bool bench = opt_benchmark;
-
-   v128_bswap32_80( edata, pdata );
-   do
-   {
-      edata[19] = nonce;
-      argon2d_dyn_hash( hash, edata );
-      if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
-           && !bench ) )
-      {
-          pdata[19] = bswap_32( nonce );;
-          submit_solution( work, hash, mythr );
-      }
-      nonce++;
-  } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
-
-   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce;
-   return 0;
-}
-
-bool register_argon2d_dyn_algo( algo_gate_t* gate )
-{
-        gate->scanhash = (void*)&scanhash_argon2d_dyn;
-        gate->hash = (void*)&argon2d_dyn_hash;
-        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
+        gate->scanhash = (void*)&scanhash_argon2d;
+        gate->hash = (void*)&argon2d500_hash;
        opt_target_factor = 65536.0;
        return true;
 }

-// Unitus
+void argon2d1000_hash( void *output, const void *input )   // Argon2d of input: m_cost 1000, t_cost 2, 8 lanes
+{
+    argon2_context context;
+    context.out = (uint8_t *)output;
+    context.outlen = (uint32_t)OUTPUT_BYTES;
+    context.pwd = (uint8_t *)input;   // password = input
+    context.pwdlen = (uint32_t)INPUT_BYTES;
+    context.salt = (uint8_t *)input; // salt = input, same buffer as the password
+    context.saltlen = (uint32_t)INPUT_BYTES;
+    context.secret = NULL;
+    context.secretlen = 0;
+    context.ad = NULL;
+    context.adlen = 0;
+    context.allocate_cbk = NULL;
+    context.free_cbk = NULL;
+    context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
+    // main configurable Argon2 hash parameters
+    context.m_cost = 1000;  // Memory in KiB (1000 KiB, just under 1 MiB)
+    context.lanes = 8;     // Degree of Parallelism
+    context.threads = 1;   // Threads
+    context.t_cost = 2;    // Iterations
+    context.version = ARGON2_VERSION_10;
+
+    argon2_ctx( &context, Argon2_d );   // NOTE(review): return code is discarded, failure is not reported
+}
+
+bool register_argon2d1000_algo( algo_gate_t* gate )   // install the argon2d1000 handlers into gate
+{
+        gate->scanhash = (void*)&scanhash_argon2d;   // generic argon2d scan loop
+        gate->hash = (void*)&argon2d1000_hash;       // variant-specific hash
+        opt_target_factor = 65536.0;   // set outside the gate struct; NOTE(review): meaning of 65536 not visible here
+        return true;   // always reports success
+}
+
+void argon2d16000_hash( void *output, const void *input )   // Argon2d of input: m_cost 16000, t_cost 1, 1 lane
+{
+   argon2_context context;
+   context.out = (uint8_t *)output;
+   context.outlen = (uint32_t)OUTPUT_BYTES;
+   context.pwd = (uint8_t *)input;   // password = input
+   context.pwdlen = (uint32_t)INPUT_BYTES;
+   context.salt = (uint8_t *)input; // salt = input, same buffer as the password
+   context.saltlen = (uint32_t)INPUT_BYTES;
+   context.secret = NULL;
+   context.secretlen = 0;
+   context.ad = NULL;
+   context.adlen = 0;
+   context.allocate_cbk = NULL;
+   context.free_cbk = NULL;
+   context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
+   // main configurable Argon2 hash parameters
+   context.m_cost = 16000; // Memory in KiB (16000 KiB = 15.625 MiB, not 16384 KiB)
+   context.lanes = 1;    // Degree of Parallelism
+   context.threads = 1;  // Threads
+   context.t_cost = 1;   // Iterations
+   context.version = ARGON2_VERSION_10;
+
+   argon2_ctx( &context, Argon2_d );   // NOTE(review): return code is discarded, failure is not reported
+}
+
+bool register_argon2d16000_algo( algo_gate_t* gate )   // install the argon2d16000 handlers into gate
+{
+        gate->scanhash = (void*)&scanhash_argon2d;   // generic argon2d scan loop
+        gate->hash = (void*)&argon2d16000_hash;      // variant-specific hash
+        opt_target_factor = 65536.0;   // set outside the gate struct; NOTE(review): meaning of 65536 not visible here
+        return true;   // always reports success
+}

 int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
@@ -154,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = (const uint32_t)max_nonce;
   uint32_t n = first_nonce;
-   const int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;  
   uint32_t t_cost = 1; // 1 iteration
   uint32_t m_cost = 4096; // use 4MB
   uint32_t parallelism = 1; // 1 thread, 2 lanes
@@ -182,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
 bool register_argon2d4096_algo( algo_gate_t* gate )
 {
        gate->scanhash = (void*)&scanhash_argon2d4096;
-        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT |NEON_OPT;
        opt_target_factor = 65536.0;
        return true;
 }
--- a/algo/argon2d/argon2d-gate.h
+++ b/algo/argon2d/argon2d-gate.h
@@ -4,22 +4,27 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-// Credits: version = 0x10, m_cost = 250.
-bool register_argon2d_crds_algo( algo_gate_t* gate );
-
-void argon2d_crds_hash( void *state, const void *input );
-
-int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
+int scanhash_argon2d( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

+// Credits: version = 0x10, m_cost = 250.
+bool register_argon2d250_algo( algo_gate_t* gate );
+
+void argon2d250_hash( void *state, const void *input );
+
 // Dynamic: version = 0x10, m_cost = 500.
-bool register_argon2d_dyn_algo( algo_gate_t* gate );
+bool register_argon2d500_algo( algo_gate_t* gate );

-void argon2d_dyn_hash( void *state, const void *input );
+void argon2d500_hash( void *state, const void *input );

-int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done, struct thr_info *mythr );
+// Zero Dynamics Cash: version = 0x10, m_cost = 1000.
+bool register_argon2d1000_algo( algo_gate_t* gate );

+void argon2d1000_hash( void *state, const void *input );
+
+bool register_argon2d16000_algo( algo_gate_t* gate );   // argon2d16000: ARGON2_VERSION_10, m_cost = 16000.
+
+void argon2d16000_hash( void *state, const void *input );

 // Unitus: version = 0x13, m_cost = 4096.
 bool register_argon2d4096_algo( algo_gate_t* gate );
--- a/algo/argon2d/argon2d/opt.c
+++ b/algo/argon2d/argon2d/opt.c
@@ -35,7 +35,7 @@
 * @pre all block pointers must be valid
 */

-#if defined(__AVX512F__)
+#if defined(SIMD512)

 static inline __m512i blamka( __m512i x, __m512i y )
 {
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
    uint64_t pseudo_rand, ref_index, ref_lane;
    uint32_t prev_offset, curr_offset;
    uint32_t starting_index, i;
-#if defined(__AVX512F__)
+#if defined(SIMD512)
    __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
 #elif defined(__AVX2__)
    __m256i state[ARGON2_HWORDS_IN_BLOCK];
--- a/algo/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2d/blake2/blamka-round-opt.h
@@ -21,7 +21,7 @@
 #include "blake2-impl.h"
 #include "simd-utils.h"

-#if !defined(__AVX512F__)
+#if !defined(SIMD512)

 #if !defined(__AVX2__)

--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -6,15 +6,15 @@

 #if defined (BLAKE_4WAY)

-blake256r14_4way_context blake_4w_ctx;
+blake256r14_4x32_context blake_4w_ctx;

 void blakehash_4way(void *state, const void *input)
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-     blake256r14_4way_context ctx;
+     blake256r14_4x32_context ctx;
     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
-     blake256r14_4way_update( &ctx, input + (64<<2), 16 );
-     blake256r14_4way_close( &ctx, vhash );
+     blake256r14_4x32_update( &ctx, input + (64<<2), 16 );
+     blake256r14_4x32_close( &ctx, vhash );
     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

@@ -35,11 +35,11 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
      HTarget = 0x7f;

   v128_bswap32_intrlv80_4x32( vdata, pdata );
-   blake256r14_4way_init( &blake_4w_ctx );
-   blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
+   blake256r14_4x32_init( &blake_4w_ctx );
+   blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 );

   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      blakehash_4way( hash, vdata );

@@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,

 #if defined(BLAKE_8WAY)

-blake256r14_8way_context blake_8w_ctx;
+blake256r14_8x32_context blake_8w_ctx;

 void blakehash_8way( void *state, const void *input )
 {
     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
-     blake256r14_8way_context ctx;
+     blake256r14_8x32_context ctx;
     memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
-     blake256r14_8way( &ctx, input + (64<<3), 16 );
-     blake256r14_8way_close( &ctx, vhash );
+     blake256r14_8x32( &ctx, input + (64<<3), 16 );
+     blake256r14_8x32_close( &ctx, vhash );
     _dintrlv_8x32( state,     state+ 32, state+ 64, state+ 96,
                    state+128, state+160, state+192, state+224,
                    vhash, 256 );
@@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,

   mm256_bswap32_intrlv80_8x32( vdata, pdata );

-   blake256r14_8way_init( &blake_8w_ctx );
-   blake256r14_8way( &blake_8w_ctx, vdata, 64 );
+   blake256r14_8x32_init( &blake_8w_ctx );
+   blake256r14_8x32( &blake_8w_ctx, vdata, 64 );

   do {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
--- a/algo/blake/blake256-hash.c
+++ b/algo/blake/blake256-hash.c
@@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 		(state)->T1 = T1; \
 	} while (0)

-
-#if defined(__SSSE3__)
-
-#define BLAKE256_4X32_BLOCK_BSWAP32 \
-{ \
-   v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
-                                     0x0405060700010203 ); \
-   M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
-   M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
-   M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
-   M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
-   M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
-   M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
-   M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
-   M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
-   M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
-   M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
-   MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
-   MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
-   MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
-   MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
-   ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
-   MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
-}
-
-#else  // SSE2
-
 #define BLAKE256_4X32_BLOCK_BSWAP32 \
 { \
   M0 = v128_bswap32( buf[0] ); \
@@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
   MF = v128_bswap32( buf[15] ); \
 }

-#endif  // SSSE3 else SSE2
-
 #define COMPRESS32_4X32( rounds ) \
 { \
   v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
      ROUND_S_4X32_3;
   }

-#if defined(__SSSE3__)
-
-   const v128_t shuf_bswap32 =
-                      v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
-
-   H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
-   H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
-   H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
-   H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
-   H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
-   H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
-   H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
-   H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
-
-#else
-
   H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
   H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
   H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
@@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
   H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
   H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
   H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
-
-#endif
 }

 #if defined (__AVX2__)
@@ -1291,24 +1244,22 @@ do { \
   VD = v256_32( T0 ^ 0x299F31D0 ); \
   VE = v256_32( T1 ^ 0x082EFA98 ); \
   VF = v256_32( T1 ^ 0xEC4E6C89 ); \
-   const __m256i shuf_bswap32 = mm256_set2_64( \
-                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
-   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
-   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
-   M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
-   M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
-   M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
-   M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
-   M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
-   M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
-   M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
-   M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
-   MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
-   MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
-   MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
-   MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
-   ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
-   MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
+   M0 = mm256_bswap_32( * buf     ); \
+   M1 = mm256_bswap_32( *(buf+ 1) ); \
+   M2 = mm256_bswap_32( *(buf+ 2) ); \
+   M3 = mm256_bswap_32( *(buf+ 3) ); \
+   M4 = mm256_bswap_32( *(buf+ 4) ); \
+   M5 = mm256_bswap_32( *(buf+ 5) ); \
+   M6 = mm256_bswap_32( *(buf+ 6) ); \
+   M7 = mm256_bswap_32( *(buf+ 7) ); \
+   M8 = mm256_bswap_32( *(buf+ 8) ); \
+   M9 = mm256_bswap_32( *(buf+ 9) ); \
+   MA = mm256_bswap_32( *(buf+10) ); \
+   MB = mm256_bswap_32( *(buf+11) ); \
+   MC = mm256_bswap_32( *(buf+12) ); \
+   MD = mm256_bswap_32( *(buf+13) ); \
+   ME = mm256_bswap_32( *(buf+14) ); \
+   MF = mm256_bswap_32( *(buf+15) ); \
   ROUND_S_8WAY(0); \
   ROUND_S_8WAY(1); \
   ROUND_S_8WAY(2); \
@@ -1401,7 +1352,7 @@ do { \
   H7 = mm256_xor3( VF, V7, H7 ); \
 }

-void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
+void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
                                      void *data )
 {
   __m256i *M = (__m256i*)data;
@@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
                         _mm256_xor_si256( v256_32( CSE ), M[15] ) );
 }

-void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
                     const void *midhash, const void *data, const int rounds )
 {
   __m256i *H = (__m256i*)final_hash;
@@ -1596,22 +1547,19 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
      ROUND256_8WAY_3;
   }

-   const __m256i shuf_bswap32 =
-                  mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
-
-   H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
-   H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
-   H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
-   H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
-   H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
-   H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
-   H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
-   H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
+   H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) );
+   H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) );
+   H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) );
+   H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) );
+   H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) );
+   H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) );
+   H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) );
+   H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) );
 }

 #endif

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 ///////////////////////////////////////
 //
@@ -1933,8 +1881,6 @@ do { \
   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
   __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
-   const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
-                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
   V0 = H0; \
   V1 = H1; \
   V2 = H2; \
@@ -1951,22 +1897,22 @@ do { \
   VD = v512_32( T0 ^ 0x299F31D0 ); \
   VE = v512_32( T1 ^ 0x082EFA98 ); \
   VF = v512_32( T1 ^ 0xEC4E6C89 ); \
-   M0 = _mm512_shuffle_epi8( * buf    , shuf_bswap32 ); \
-   M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
-   M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
-   M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
-   M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
-   M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
-   M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
-   M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
-   M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
-   M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
-   MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
-   MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
-   MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
-   MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
-   ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
-   MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
+   M0 = mm512_bswap_32( * buf     ); \
+   M1 = mm512_bswap_32( *(buf+ 1) ); \
+   M2 = mm512_bswap_32( *(buf+ 2) ); \
+   M3 = mm512_bswap_32( *(buf+ 3) ); \
+   M4 = mm512_bswap_32( *(buf+ 4) ); \
+   M5 = mm512_bswap_32( *(buf+ 5) ); \
+   M6 = mm512_bswap_32( *(buf+ 6) ); \
+   M7 = mm512_bswap_32( *(buf+ 7) ); \
+   M8 = mm512_bswap_32( *(buf+ 8) ); \
+   M9 = mm512_bswap_32( *(buf+ 9) ); \
+   MA = mm512_bswap_32( *(buf+10) ); \
+   MB = mm512_bswap_32( *(buf+11) ); \
+   MC = mm512_bswap_32( *(buf+12) ); \
+   MD = mm512_bswap_32( *(buf+13) ); \
+   ME = mm512_bswap_32( *(buf+14) ); \
+   MF = mm512_bswap_32( *(buf+15) ); \
   ROUND_S_16WAY(0); \
   ROUND_S_16WAY(1); \
   ROUND_S_16WAY(2); \
@@ -2063,7 +2009,7 @@ do { \
 // is constant for every nonce and only needs to be run once per job. The
 // second part is run for each nonce using the precalculated midstate and the
 // hash from the first block.
-void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
+void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
                                       void *data )
 {
   __m512i *M = (__m512i*)data;
@@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
 }

-// Dfault is 14 rounds, blakecoin & vanilla are 8.
+// Default is 14 rounds, blakecoin & vanilla are 8.
-void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
                     const void *midhash, const void *data, const int rounds )
 {
   __m512i *H = (__m512i*)final_hash;
@@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
   }

   // Byte swap final hash
-   const __m512i shuf_bswap32 =  mm512_bcast_m128( v128_set64( 
-                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
-   H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
-   H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
-   H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
-   H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
-   H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
-   H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
-   H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
-   H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
+   H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) );
+   H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) );
+   H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) );
+   H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) );
+   H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) );
+   H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) );
+   H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) );
+   H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) );
 }

 #endif

 // Blake-256 4 way

-static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
-
 static void
 blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
-                   const uint32_t *salt, int rounds )
+                   int rounds )
 {
   casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
   casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
@@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,

 // Blake-256 8 way

-static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };

 static void
-blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
-                   const uint32_t *salt, int rounds )
+blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv,
+                   int rounds )
 {
   casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
   casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
@@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
 }

 static void
-blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
+blake32_8way( blake256_8x32_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   __m256i *buf;
@@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
 }

 static void
-blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
+blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
   __m256i buf[16];
@@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
 }

 static void
-blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
+blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   __m256i *buf;
@@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
 }

 static void
-blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
+blake32_8way_close_le( blake256_8x32_context *sc, unsigned ub, unsigned n,
                       void *dst, size_t out_size_w32 )
 {
   __m256i buf[16];
@@ -2617,13 +2558,13 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
 #endif


-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 //Blake-256 16 way AVX512

 static void
-blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
-                   const uint32_t *salt, int rounds )
+blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv,
+                    int rounds )
 {
   casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
   casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
@@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
 }

 static void
-blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
+blake32_16way( blake256_16x32_context *sc, const void *data, size_t len )
 {
   __m512i *vdata = (__m512i*)data;
   __m512i *buf;
@@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
   sc->ptr = ptr;
 }
 static void
-blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
+blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
   __m512i buf[16];
@@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
 }

 static void
-blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
+blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len )
 {
   __m512i *vdata = (__m512i*)data;
   __m512i *buf;
@@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
 }

 static void
-blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
+blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
   __m512i buf[16];
@@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
 }

 void
-blake256_16way_init(void *cc)
+blake256_16x32_init(void *cc)
 {
-   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
+   blake32_16way_init( cc, IV256, 14 );
 }

 void
-blake256_16way_update(void *cc, const void *data, size_t len)
+blake256_16x32_update(void *cc, const void *data, size_t len)
 {
        blake32_16way(cc, data, len);
 }

 void
-blake256_16way_close(void *cc, void *dst)
+blake256_16x32_close(void *cc, void *dst)
 {
        blake32_16way_close(cc, 0, 0, dst, 8);
 }

 void
-blake256_16way_update_le(void *cc, const void *data, size_t len)
+blake256_16x32_update_le(void *cc, const void *data, size_t len)
 {
   blake32_16way_le(cc, data, len);
 }

 void
-blake256_16way_close_le(void *cc, void *dst)
+blake256_16x32_close_le(void *cc, void *dst)
 {
    blake32_16way_close_le(cc, 0, 0, dst, 8);
 }

 void blake256r14_16way_init(void *cc)
 {
-   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
+   blake32_16way_init( cc, IV256, 14 );
 }

 void
-blake256r14_16way_update(void *cc, const void *data, size_t len)
+blake256r14_16x32_update(void *cc, const void *data, size_t len)
 {
   blake32_16way(cc, data, len);
 }

 void
-blake256r14_16way_close(void *cc, void *dst)
+blake256r14_16x32_close(void *cc, void *dst)
 {
   blake32_16way_close(cc, 0, 0, dst, 8);
 }

 void blake256r8_16way_init(void *cc)
 {
-   blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
+   blake32_16way_init( cc, IV256, 8 );
 }

 void
-blake256r8_16way_update(void *cc, const void *data, size_t len)
+blake256r8_16x32_update(void *cc, const void *data, size_t len)
 {
   blake32_16way(cc, data, len);
 }

 void
-blake256r8_16way_close(void *cc, void *dst)
+blake256r8_16x32_close(void *cc, void *dst)
 {
   blake32_16way_close(cc, 0, 0, dst, 8);
 }
@@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst)
 void
 blake256_4x32_init(void *ctx)
 {
-   blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
+   blake32_4x32_init( ctx, IV256, 14 );
 }

 void
@@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst)
 // Blake-256 8 way

 void
-blake256_8way_init(void *cc)
+blake256_8x32_init(void *cc)
 {
-   blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
+   blake32_8way_init( cc, IV256, 14 );
 }

 void
-blake256_8way_update(void *cc, const void *data, size_t len)
+blake256_8x32_update(void *cc, const void *data, size_t len)
 {
        blake32_8way(cc, data, len);
 }

 void
-blake256_8way_close(void *cc, void *dst)
+blake256_8x32_close(void *cc, void *dst)
 {
        blake32_8way_close(cc, 0, 0, dst, 8);
 }

 void
-blake256_8way_update_le(void *cc, const void *data, size_t len)
+blake256_8x32_update_le(void *cc, const void *data, size_t len)
 {
        blake32_8way_le(cc, data, len);
 }

 void
-blake256_8way_close_le(void *cc, void *dst)
+blake256_8x32_close_le(void *cc, void *dst)
 {
        blake32_8way_close_le(cc, 0, 0, dst, 8);
 }
@@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst)
 // 14 rounds Blake, Decred
 void blake256r14_4x32_init(void *cc)
 {
-   blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
+   blake32_4x32_init( cc, IV256, 14 );
 }

 void
@@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst)

 #if defined(__AVX2__)

-void blake256r14_8way_init(void *cc)
+void blake256r14_8x32_init(void *cc)
 {
-   blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
+   blake32_8way_init( cc, IV256, 14 );
 }

 void
-blake256r14_8way_update(void *cc, const void *data, size_t len)
+blake256r14_8x32_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }

 void
-blake256r14_8way_close(void *cc, void *dst)
+blake256r14_8x32_close(void *cc, void *dst)
 {
   blake32_8way_close(cc, 0, 0, dst, 8);
 }
@@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst)
 // 8 rounds Blakecoin, Vanilla
 void blake256r8_4x32_init(void *cc)
 {
-   blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
+   blake32_4x32_init( cc, IV256, 8 );
 }

 void
@@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst)

 #if defined (__AVX2__)

-void blake256r8_8way_init(void *cc)
+void blake256r8_8x32_init(void *cc)
 {
-   blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
+   blake32_8way_init( cc, IV256, 8 );
 }

 void
-blake256r8_8way_update(void *cc, const void *data, size_t len)
+blake256r8_8x32_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }

 void
-blake256r8_8way_close(void *cc, void *dst)
+blake256r8_8x32_close(void *cc, void *dst)
 {
   blake32_8way_close(cc, 0, 0, dst, 8);
 }
--- a/algo/blake/blake256-hash.h
+++ b/algo/blake/blake256-hash.h
@@ -29,13 +29,6 @@ typedef struct

 void blake256_transform_le( uint32_t *H, const uint32_t *buf,
                            const uint32_t T0, const uint32_t T1, int rounds );
-/*
-void blake256_init( blake256_context *sc );
-void blake256_update( blake256_context *sc, const void *data, size_t len );
-void blake256_close( blake256_context *sc, void *dst );
-void blake256_full( blake256_context *sc, void *dst, const void *data,
-                    size_t len );
-*/

 //////////////////////////////////
 //
@@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context;
 void blake256_4x32_init(void *ctx);
 void blake256_4x32_update(void *ctx, const void *data, size_t len);
 void blake256_4x32_close(void *ctx, void *dst);
+void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
+                                      void *data );
+void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
+                     const void *midhash, const void *data, const int rounds );

 // 14 rounds
 typedef blake_4x32_small_context blake256r14_4x32_context;
@@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc);
 void blake256r8_4x32_update(void *cc, const void *data, size_t len);
 void blake256r8_4x32_close(void *cc, void *dst);

-void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
-                                      void *data );
-void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
-                     const void *midhash, const void *data, const int rounds );
-
-#define blake_4way_small_context         blake256_4x32_context
-#define blake256_4way_context            blake256_4x32_context
-#define blake256_4way_init               blake256_4x32_init
-#define blake256_4way_update             blake256_4x32_update
-#define blake256_4way_close              blake256_4x32_close
-#define blake256_4way_update_le          blake256_4x32_update_le
-#define blake256_4way_close_le           blake256_4x32_close_le
-#define blake256_4way_round0_prehash_le  blake256_4x32_round0_prehash_le
-#define blake256_4way_final_rounds_le    blake256_4x32_final_rounds_le
-#define blake256r14_4way_context         blake256r14_4x32_context
-#define blake256r14_4way_init            blake256r14_4x32_init
-#define blake256r14_4way_update          blake256r14_4x32_update
-#define blake256r14_4way_close           blake256r14_4x32_close
-#define blake256r8_4way_context          blake256r14_4x32_context
-#define blake256r8_4way_init             blake256r14_4x32_init
-#define blake256r8_4way_update           blake256r14_4x32_update
-#define blake256r8_4way_close            blake256r14_4x32_close
-
 #ifdef __AVX2__

 //////////////////////////////
@@ -107,47 +81,30 @@ typedef struct
 } blake_8way_small_context;

 // Default 14 rounds
-typedef blake_8way_small_context blake256_8way_context;
-void blake256_8way_init(void *cc);
-void blake256_8way_update(void *cc, const void *data, size_t len);
-void blake256_8way_close(void *cc, void *dst);
-void blake256_8way_update_le(void *cc, const void *data, size_t len);
-void blake256_8way_close_le(void *cc, void *dst);
-void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
+typedef blake_8way_small_context blake256_8x32_context;
+void blake256_8x32_init(void *cc);
+void blake256_8x32_update(void *cc, const void *data, size_t len);
+void blake256_8x32_close(void *cc, void *dst);
+void blake256_8x32_update_le(void *cc, const void *data, size_t len);
+void blake256_8x32_close_le(void *cc, void *dst);
+void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
                                      void *data );
-void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
                    const void *midhash, const void *data, const int rounds );

 // 14 rounds, blake, decred
-typedef blake_8way_small_context blake256r14_8way_context;
-void blake256r14_8way_init(void *cc);
-void blake256r14_8way_update(void *cc, const void *data, size_t len);
-void blake256r14_8way_close(void *cc, void *dst);
+typedef blake_8way_small_context blake256r14_8x32_context;
+void blake256r14_8x32_init(void *cc);
+void blake256r14_8x32_update(void *cc, const void *data, size_t len);
+void blake256r14_8x32_close(void *cc, void *dst);

 // 8 rounds, blakecoin, vanilla
-typedef blake_8way_small_context blake256r8_8way_context;
-void blake256r8_8way_init(void *cc);
-void blake256r8_8way_update(void *cc, const void *data, size_t len);
-void blake256r8_8way_close(void *cc, void *dst);
+typedef blake_8way_small_context blake256r8_8x32_context;
+void blake256r8_8x32_init(void *cc);
+void blake256r8_8x32_update(void *cc, const void *data, size_t len);
+void blake256r8_8x32_close(void *cc, void *dst);

-#define blake_8x32_small_context      blake256_8way_context
-#define blake_8x32_init               blake256_8way_init
-#define blake_8x32_update             blake256_8way_update
-#define blake_8x32_close              blake256_8way_close
-#define blake_8x32_update_le          blake256_8way_update_le
-#define blake_8x32_close_le           blake256_8way_close_le
-#define blake_8x32_round0_prehash_le  blake256_8way_round0_prehash
-#define blake_8x32_final_rounds_le    blake256_8way_final_rounds_le
-#define blake256r14_8x32_context      blake256r14_8way_context
-#define blake256r14_8x32_init         blake256r14_8way_init
-#define blake256r14_8x32_update       blake256r14_8way_update
-#define blake256r14_8x32_close        blake256r14_8way_close
-#define blake256r8_8x32_context       blake256r14_8way_context
-#define blake256r8_8x32_init          blake256r14_8way_init
-#define blake256r8_8x32_update        blake256r14_8way_update
-#define blake256r8_8x32_close         blake256r14_8way_close
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 ///////////////////////////////////
 //
@@ -163,46 +120,29 @@ typedef struct
 } blake_16way_small_context __attribute__ ((aligned (128)));

 // Default 14 rounds
-typedef blake_16way_small_context blake256_16way_context;
-void blake256_16way_init(void *cc);
-void blake256_16way_update(void *cc, const void *data, size_t len);
-void blake256_16way_close(void *cc, void *dst);
+typedef blake_16way_small_context blake256_16x32_context;
+void blake256_16x32_init(void *cc);
+void blake256_16x32_update(void *cc, const void *data, size_t len);
+void blake256_16x32_close(void *cc, void *dst);
 // Expects data in little endian order, no byte swap needed
-void blake256_16way_update_le(void *cc, const void *data, size_t len);
-void blake256_16way_close_le(void *cc, void *dst);
-void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
+void blake256_16x32_update_le(void *cc, const void *data, size_t len);
+void blake256_16x32_close_le(void *cc, void *dst);
+void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
                                       void *data );
-void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
                     const void *midhash, const void *data, const int rounds );

 // 14 rounds, blake, decred
-typedef blake_16way_small_context blake256r14_16way_context;
-void blake256r14_16way_init(void *cc);
-void blake256r14_16way_update(void *cc, const void *data, size_t len);
-void blake256r14_16way_close(void *cc, void *dst);
+typedef blake_16way_small_context blake256r14_16x32_context;
+void blake256r14_16x32_init(void *cc);
+void blake256r14_16x32_update(void *cc, const void *data, size_t len);
+void blake256r14_16x32_close(void *cc, void *dst);

 // 8 rounds, blakecoin, vanilla
-typedef blake_16way_small_context blake256r8_16way_context;
-void blake256r8_16way_init(void *cc);
-void blake256r8_16way_update(void *cc, const void *data, size_t len);
-void blake256r8_16way_close(void *cc, void *dst);
-
-#define blake_16x32_small_context      blake256_16way_context
-#define blake_16x32_init               blake256_16way_init
-#define blake_16x32_update             blake256_16way_update
-#define blake_16x32_close              blake256_16way_close
-#define blake_16x32_update_le          blake256_16way_update_le
-#define blake_16x32_close_le           blake256_16way_close_le
-#define blake_16x32_round0_prehash_le  blake256_16way_round0_prehash
-#define blake_16x32_final_rounds_le    blake256_16way_final_rounds_le
-#define blake256r14_16x32_context      blake256r14_16way_context
-#define blake256r14_16x32_init         blake256r14_16way_init
-#define blake256r14_16x32_update       blake256r14_16way_update
-#define blake256r14_16x32_close        blake256r14_16way_close
-#define blake256r8_16x32_context       blake256r8_16way_context
-#define blake256r8_16x32_init          blake256r8_16way_init
-#define blake256r8_16x32_update        blake256r8_16way_update
-#define blake256r8_16x32_close         blake256r8_16way_close
+typedef blake_16way_small_context blake256r8_16x32_context;
+void blake256r8_16x32_init(void *cc);
+void blake256r8_16x32_update(void *cc, const void *data, size_t len);
+void blake256r8_16x32_close(void *cc, void *dst);

 #endif  // AVX512
 #endif  // AVX2
--- a/algo/blake/blake2b-hash.c
+++ b/algo/blake/blake2b-hash.c
@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
 #define Mx_(n)      Mx__(n)
 #define Mx__(n)     M ## n

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 #define B2B8W_G(a, b, c, d, x, y) \
 { \
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
 }

-static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
+static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
 {  
   __m512i v[16], m[16];

@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
   ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
 }

-int blake2b_8way_init( blake2b_8way_ctx *ctx )
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
 {
   size_t i;

@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
 }


-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                          size_t inlen )
 {
   __m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
         ctx->t[0] += ctx->c;
         if ( ctx->t[0] < ctx->c )
            ctx->t[1]++;
-         blake2b_8way_compress( ctx, 0 );
+         blake2b_8x64_compress( ctx, 0 );
         ctx->c = 0;
      }
      ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
   }
 }

-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
 {
   size_t c;
   c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
      ctx->c += 8;
   }

-   blake2b_8way_compress( ctx, 1 );           // final block flag = 1
+   blake2b_8x64_compress( ctx, 1 );           // final block flag = 1

   casti_m512i( out, 0 ) = ctx->h[0];
   casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
 };
 */

-static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
+static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
 {
 	__m256i v[16], m[16];

@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
   ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
 }

-int blake2b_4way_init( blake2b_4way_ctx *ctx ) 
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx ) 
 {
 	size_t i;

@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
 	return 0;
 }

-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                          size_t inlen ) 
 {
   __m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
 			ctx->t[0] += ctx->c;
 			if ( ctx->t[0] < ctx->c )
 				ctx->t[1]++;
-			blake2b_4way_compress( ctx, 0 );
+			blake2b_4x64_compress( ctx, 0 );
 			ctx->c = 0;
 		}
      ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
   }
 }

-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
 {
 	size_t c;
   c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
      ctx->c += 8;
   }

-   blake2b_4way_compress( ctx, 1 );           // final block flag = 1
+   blake2b_4x64_compress( ctx, 1 );           // final block flag = 1

   casti_m256i( out, 0 ) = ctx->h[0];
   casti_m256i( out, 1 ) = ctx->h[1];
--- a/algo/blake/blake2b-hash.h
+++ b/algo/blake/blake2b-hash.h
@@ -1,6 +1,6 @@
 #pragma once
-#ifndef __BLAKE2B_HASH_4WAY_H__
-#define __BLAKE2B_HASH_4WAY_H__
+#ifndef BLAKE2B_HASH_4WAY_H__
+#define BLAKE2B_HASH_4WAY_H__

 #include "simd-utils.h"
 #include <stddef.h>
@@ -14,8 +14,7 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif

-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 typedef struct ALIGN( 64 ) {
   __m512i b[16]; // input buffer
@@ -23,12 +22,12 @@ typedef struct ALIGN( 64 ) {
   uint64_t t[2];  // total number of bytes
   size_t c;       // pointer for b[]
   size_t outlen;  // digest size
-} blake2b_8way_ctx;
+} blake2b_8x64_ctx;

-int blake2b_8way_init( blake2b_8way_ctx *ctx );
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                          size_t inlen );
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );

 #endif

@@ -41,12 +40,12 @@ typedef struct ALIGN( 64 ) {
 	uint64_t t[2];  // total number of bytes
 	size_t c;       // pointer for b[]
 	size_t outlen;  // digest size
-} blake2b_4way_ctx;
+} blake2b_4x64_ctx;

-int blake2b_4way_init( blake2b_4way_ctx *ctx );
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                          size_t inlen );
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );

 #endif

--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include "blake2b-hash.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define BLAKE2B_8WAY
 #elif defined(__AVX2__)
  #define BLAKE2B_4WAY
@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
   uint32_t hash[8*8] __attribute__ ((aligned (128)));;
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
+   blake2b_8x64_ctx ctx __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[49]);   // 3*16+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );

-      blake2b_8way_init( &ctx );
-      blake2b_8way_update( &ctx, vdata, 80 );
-      blake2b_8way_final( &ctx, hash );
+      blake2b_8x64_init( &ctx );
+      blake2b_8x64_update( &ctx, vdata, 80 );
+      blake2b_8x64_final( &ctx, hash );

      for ( int lane = 0; lane < 8; lane++ )
      if ( hash7[ lane<<1 ] <= Htarg )
@@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
 // Function not used, code inlined.
 void blake2b_4way_hash(void *output, const void *input)
 {
-    blake2b_4way_ctx ctx;
-    blake2b_4way_init( &ctx );
-    blake2b_4way_update( &ctx, input, 80 );
-    blake2b_4way_final( &ctx, output );
+    blake2b_4x64_ctx ctx;
+    blake2b_4x64_init( &ctx );
+    blake2b_4x64_update( &ctx, input, 80 );
+    blake2b_4x64_final( &ctx, output );
 }

 int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
@@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
 	uint32_t hash[8*4] __attribute__ ((aligned (64)));;
   uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
+   blake2b_4x64_ctx ctx __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
@@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

-      blake2b_4way_init( &ctx ); 
-      blake2b_4way_update( &ctx, vdata, 80 );
-      blake2b_4way_final( &ctx, hash );
+      blake2b_4x64_init( &ctx ); 
+      blake2b_4x64_update( &ctx, vdata, 80 );
+      blake2b_4x64_final( &ctx, hash );

      for ( int lane = 0; lane < 4; lane++ )
      if ( hash7[ lane<<1 ] <= Htarg )
--- a/algo/blake/blake2s-hash.c
+++ b/algo/blake/blake2s-hash.c
@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,

 #endif // __AVX2__

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 // Blake2s-256 16 way

--- a/algo/blake/blake2s-hash.h
+++ b/algo/blake/blake2s-hash.h
@@ -11,8 +11,8 @@
 * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 //#pragma once
-#ifndef __BLAKE2S_HASH_4WAY_H__
-#define __BLAKE2S_HASH_4WAY_H__ 1
+#ifndef BLAKE2S_HASH_4WAY_H__
+#define BLAKE2S_HASH_4WAY_H__ 1

 #if defined(__SSE2__) || defined(__ARM_NEON)

@@ -29,20 +29,20 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif

-   typedef struct __blake2s_nway_param
-   {
-      uint8_t  digest_length; // 1
-      uint8_t  key_length;    // 2
-      uint8_t  fanout;        // 3
-      uint8_t  depth;         // 4
-      uint32_t leaf_length;   // 8
-      uint8_t  node_offset[6];// 14
-      uint8_t  node_depth;    // 15
-      uint8_t  inner_length;  // 16
-      // uint8_t  reserved[0];
-      uint8_t  salt[8]; // 24
-      uint8_t  personal[8];  // 32
-   } blake2s_nway_param;
+typedef struct __blake2s_nway_param
+{
+   uint8_t  digest_length; // 1
+   uint8_t  key_length;    // 2
+   uint8_t  fanout;        // 3
+   uint8_t  depth;         // 4
+   uint32_t leaf_length;   // 8
+   uint8_t  node_offset[6];// 14
+   uint8_t  node_depth;    // 15
+   uint8_t  inner_length;  // 16
+   // uint8_t  reserved[0];
+   uint8_t  salt[8]; // 24
+   uint8_t  personal[8];  // 32
+} blake2s_nway_param;

 typedef struct ALIGN( 64 ) __blake2s_4way_state
 {
@@ -61,13 +61,18 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
 int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
                              const void *input, uint64_t inlen );

+#define blake2s_4x32_state       blake2s_4way_state
+#define blake2s_4x32_init        blake2s_4way_init
+#define blake2s_4x32_update      blake2s_4way_update
+#define blake2s_4x32_final       blake2s_4way_final
+#define blake2s_4x32_full_blocks blake2s_4way_full_blocks

 #if defined(__AVX2__)

 typedef struct ALIGN( 64 ) __blake2s_8way_state
 {
   __m256i h[8];
-   uint8_t  buf[ 32 * 8 ];
+   uint8_t  buf[ 64 * 8 ];
   uint32_t t[2];
   uint32_t f[2];
   size_t   buflen;
@@ -81,14 +86,20 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
 int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
                              const void *input, uint64_t inlen );

+#define blake2s_8x32_state       blake2s_8way_state
+#define blake2s_8x32_init        blake2s_8way_init
+#define blake2s_8x32_update      blake2s_8way_update
+#define blake2s_8x32_final       blake2s_8way_final
+#define blake2s_8x32_full_blocks blake2s_8way_full_blocks
+
 #endif

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 typedef struct ALIGN( 64 ) __blake2s_16way_state
 {
   __m512i h[8];
-   uint8_t  buf[ 32 * 16 ];
+   uint8_t  buf[ 64 * 16 ];
   uint32_t t[2];
   uint32_t f[2];
   size_t   buflen;
@@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
                         uint64_t inlen );
 int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );

+#define blake2s_16x32_state       blake2s_16way_state
+#define blake2s_16x32_init        blake2s_16way_init
+#define blake2s_16x32_update      blake2s_16way_update
+#define blake2s_16x32_final       blake2s_16way_final
+
 #endif

 #if 0
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -3,7 +3,7 @@
 #include <string.h>
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define BLAKE2S_16WAY
 #elif defined(__AVX2__)
  #define BLAKE2S_8WAY
--- a/algo/blake/blake512-hash.c
+++ b/algo/blake/blake512-hash.c
@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
   Va = v128_add64( Va, v128_add64( Vb, \
                            v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
                                        CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
+   Vd = v128_ror64xor( Vd, Va, 32 ); \
   Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
+   Vb = v128_ror64xor( Vb, Vc, 25 ); \
 \
   Va = v128_add64( Va, v128_add64( Vb, \
                            v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
                                        CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
+   Vd = v128_ror64xor( Vd, Va, 16 ); \
   Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
+   Vb = v128_ror64xor( Vb, Vc, 11 ); \
 }

 #define BLAKE512_ROUND( R ) \
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,

 #if defined(__AVX2__)

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 ////////////////////////////////////
 //
@@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
  VD = v512_64( CB5 ^ T0 ); \
  VE = v512_64( CB6 ^ T1 ); \
  VF = v512_64( CB7 ^ T1 ); \
-  const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \
-                                   0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-  M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
-  M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
-  M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
-  M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
-  M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
-  M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
-  M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
-  M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
-  M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
-  M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
-  MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
-  MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
-  MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
-  MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
-  ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
-  MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+  M0 = mm512_bswap_64( *(buf+ 0) ); \
+  M1 = mm512_bswap_64( *(buf+ 1) ); \
+  M2 = mm512_bswap_64( *(buf+ 2) ); \
+  M3 = mm512_bswap_64( *(buf+ 3) ); \
+  M4 = mm512_bswap_64( *(buf+ 4) ); \
+  M5 = mm512_bswap_64( *(buf+ 5) ); \
+  M6 = mm512_bswap_64( *(buf+ 6) ); \
+  M7 = mm512_bswap_64( *(buf+ 7) ); \
+  M8 = mm512_bswap_64( *(buf+ 8) ); \
+  M9 = mm512_bswap_64( *(buf+ 9) ); \
+  MA = mm512_bswap_64( *(buf+10) ); \
+  MB = mm512_bswap_64( *(buf+11) ); \
+  MC = mm512_bswap_64( *(buf+12) ); \
+  MD = mm512_bswap_64( *(buf+13) ); \
+  ME = mm512_bswap_64( *(buf+14) ); \
+  MF = mm512_bswap_64( *(buf+15) ); \
  ROUND_B_8WAY(0); \
  ROUND_B_8WAY(1); \
  ROUND_B_8WAY(2); \
@@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
  H7 = mm512_xor3( VF, V7, H7 ); \
 }

-void blake512_8way_compress( blake_8way_big_context *sc )
+void blake512_8x64_compress( blake_8x64_big_context *sc )
 { 
  __m512i M0, M1, M2, M3, M4, M5, M6, M7;
  __m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc )
  VE = v512_64( CB6 ^ sc->T1 );
  VF = v512_64( CB7 ^ sc->T1 );

-  const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( 
-                                   0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
-
-  M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
-  M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
-  M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
-  M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
-  M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
-  M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
-  M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
-  M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
-  M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
-  M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
-  MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
-  MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
-  MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
-  MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
-  ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
-  MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+  M0 = mm512_bswap_64( sc->buf[ 0] );
+  M1 = mm512_bswap_64( sc->buf[ 1] );
+  M2 = mm512_bswap_64( sc->buf[ 2] );
+  M3 = mm512_bswap_64( sc->buf[ 3] );
+  M4 = mm512_bswap_64( sc->buf[ 4] );
+  M5 = mm512_bswap_64( sc->buf[ 5] );
+  M6 = mm512_bswap_64( sc->buf[ 6] );
+  M7 = mm512_bswap_64( sc->buf[ 7] );
+  M8 = mm512_bswap_64( sc->buf[ 8] );
+  M9 = mm512_bswap_64( sc->buf[ 9] );
+  MA = mm512_bswap_64( sc->buf[10] );
+  MB = mm512_bswap_64( sc->buf[11] );
+  MC = mm512_bswap_64( sc->buf[12] );
+  MD = mm512_bswap_64( sc->buf[13] );
+  ME = mm512_bswap_64( sc->buf[14] );
+  MF = mm512_bswap_64( sc->buf[15] );

  ROUND_B_8WAY(0);
  ROUND_B_8WAY(1);
@@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
 }

 // won't be used after prehash implemented
-void blake512_8way_compress_le( blake_8x64_big_context *sc )
+void blake512_8x64_compress_le( blake_8x64_big_context *sc )
 {
  __m512i M0, M1, M2, M3, M4, M5, M6, M7;
  __m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
   {
      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
            sc->T1 = sc->T1 + 1;
-      blake512_8way_compress( sc );
+      blake512_8x64_compress( sc );
      sc->ptr = 0;
   }

@@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
       sc->T1 = sc->T1 + 1;

-   blake512_8way_compress( sc );
+   blake512_8x64_compress( sc );
   
   mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }
@@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
   {
      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
            sc->T1 = sc->T1 + 1;
-      blake512_8way_compress_le( sc );
+      blake512_8x64_compress_le( sc );
      sc->ptr = 0;
   }

@@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
       sc->T1 = sc->T1 + 1;

-   blake512_8way_compress_le( sc );
+   blake512_8x64_compress_le( sc );

   mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }
@@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst)
  VD = v256_64( CB5 ^ T0 ); \
  VE = v256_64( CB6 ^ T1 ); \
  VF = v256_64( CB7 ^ T1 ); \
-  const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \
-                             0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
-  M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
-  M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
-  M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
-  M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
-  M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
-  M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
-  M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
-  M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
-  M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
-  MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
-  MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
-  MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
-  MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
-  ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
-  MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+  M0 = mm256_bswap_64( *(buf+ 0) ); \
+  M1 = mm256_bswap_64( *(buf+ 1) ); \
+  M2 = mm256_bswap_64( *(buf+ 2) ); \
+  M3 = mm256_bswap_64( *(buf+ 3) ); \
+  M4 = mm256_bswap_64( *(buf+ 4) ); \
+  M5 = mm256_bswap_64( *(buf+ 5) ); \
+  M6 = mm256_bswap_64( *(buf+ 6) ); \
+  M7 = mm256_bswap_64( *(buf+ 7) ); \
+  M8 = mm256_bswap_64( *(buf+ 8) ); \
+  M9 = mm256_bswap_64( *(buf+ 9) ); \
+  MA = mm256_bswap_64( *(buf+10) ); \
+  MB = mm256_bswap_64( *(buf+11) ); \
+  MC = mm256_bswap_64( *(buf+12) ); \
+  MD = mm256_bswap_64( *(buf+13) ); \
+  ME = mm256_bswap_64( *(buf+14) ); \
+  MF = mm256_bswap_64( *(buf+15) ); \
  ROUND_B_4WAY(0); \
  ROUND_B_4WAY(1); \
  ROUND_B_4WAY(2); \
@@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst)
 }


-void blake512_4way_compress( blake_4x64_big_context *sc )
+void blake512_4x64_compress( blake_4x64_big_context *sc )
 {
  __m256i M0, M1, M2, M3, M4, M5, M6, M7;
  __m256i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
  VD = v256_64( CB5 ^ sc->T0 );
  VE = v256_64( CB6 ^ sc->T1 );
  VF = v256_64( CB7 ^ sc->T1 );
-  const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64(
-                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

-  M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
-  M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
-  M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
-  M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
-  M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
-  M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
-  M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
-  M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
-  M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
-  M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
-  MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
-  MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
-  MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
-  MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
-  ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
-  MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+  M0 = mm256_bswap_64( sc->buf[ 0] );
+  M1 = mm256_bswap_64( sc->buf[ 1] );
+  M2 = mm256_bswap_64( sc->buf[ 2] );
+  M3 = mm256_bswap_64( sc->buf[ 3] );
+  M4 = mm256_bswap_64( sc->buf[ 4] );
+  M5 = mm256_bswap_64( sc->buf[ 5] );
+  M6 = mm256_bswap_64( sc->buf[ 6] );
+  M7 = mm256_bswap_64( sc->buf[ 7] );
+  M8 = mm256_bswap_64( sc->buf[ 8] );
+  M9 = mm256_bswap_64( sc->buf[ 9] );
+  MA = mm256_bswap_64( sc->buf[10] );
+  MB = mm256_bswap_64( sc->buf[11] );
+  MC = mm256_bswap_64( sc->buf[12] );
+  MD = mm256_bswap_64( sc->buf[13] );
+  ME = mm256_bswap_64( sc->buf[14] );
+  MF = mm256_bswap_64( sc->buf[15] );

  ROUND_B_4WAY(0);
  ROUND_B_4WAY(1);
@@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
  sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
 }

-void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
+void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate,
                               const void *data )
 {
   __m256i V0, V1, V2, V3, V4, V5, V6, V7;
@@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
   midstate[15] = VF;
 }

-void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
+void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash,
                             const __m256i nonce, const __m256i *midstate )
 {
   __m256i M0, M1, M2, M3, M4, M5, M6, M7;
@@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
 }


-void blake512_4x64_init( blake_4x64_big_context *sc )
+void blake512_4x64_init( blake512_4x64_context *sc )
 {
   casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
   casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
@@ -1798,7 +1789,7 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
 }

 // init, update & close
-void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
+void blake512_4x64_full( blake512_4x64_context *sc, void * dst,
                         const void *data, size_t len )
 {

@@ -1824,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
   {
      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
         sc->T1 =  sc->T1 + 1;
-      blake512_4way_compress( sc );
+      blake512_4x64_compress( sc );
      sc->ptr = 0;
   }

@@ -1859,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
       sc->T1 = sc->T1 + 1;

-   blake512_4way_compress( sc );
+   blake512_4x64_compress( sc );

   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }
@@ -1887,13 +1878,13 @@ blake512_4x64_close(void *cc, void *dst)
 #define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
 { \
   a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
-   d = v128_ror64( v128_xor( d, a ), 32 ); \
+   d = v128_ror64xor( d, a, 32 ); \
   c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 25 ); \
+   b = v128_ror64xor( b, c, 25 ); \
   a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
-   d = v128_ror64( v128_xor( d, a ), 16 ); \
+   d = v128_ror64xor( d, a, 16 ); \
   c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 11 ); \
+   b = v128_ror64xor( b, c, 11 ); \
 }

 #define ROUND_B_2X64(r) \
@@ -1934,29 +1925,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
  VE = v128_64( CB6 ^ sc->T1 );
  VF = v128_64( CB7 ^ sc->T1 );

-#if defined(__SSSE3__)
-
-  const v128u64_t shuf_bswap64 = v128_set64(
-                                 0x08090a0b0c0d0e0f, 0x0001020304050607 );
-  M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 );
-  M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 );
-  M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 );
-  M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 );
-  M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 );
-  M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 );
-  M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 );
-  M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 );
-  M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 );
-  M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 );
-  MA = v128_shuffle8( sc->buf[10], shuf_bswap64 );
-  MB = v128_shuffle8( sc->buf[11], shuf_bswap64 );
-  MC = v128_shuffle8( sc->buf[12], shuf_bswap64 );
-  MD = v128_shuffle8( sc->buf[13], shuf_bswap64 );
-  ME = v128_shuffle8( sc->buf[14], shuf_bswap64 );
-  MF = v128_shuffle8( sc->buf[15], shuf_bswap64 );
-
-#else  // SSE2 & NEON
-
  M0 = v128_bswap64( sc->buf[ 0] );
  M1 = v128_bswap64( sc->buf[ 1] );
  M2 = v128_bswap64( sc->buf[ 2] );
@@ -1974,8 +1942,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
  ME = v128_bswap64( sc->buf[14] );
  MF = v128_bswap64( sc->buf[15] );
  
-#endif
-
  ROUND_B_2X64(0);
  ROUND_B_2X64(1);
  ROUND_B_2X64(2);
@@ -2054,9 +2020,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
   // G4 skip nonce
   V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
                                          V0 );
-   VF = v128_ror64( v128_xor( VF, V0 ), 32 );
+   VF = v128_ror64xor( VF, V0, 32 );
   VA = v128_add64( VA, VF );
-   V5 = v128_ror64( v128_xor( V5, VA ), 25 );
+   V5 = v128_ror64xor( V5, VA, 25 );
   V0 = v128_add64( V0, V5 );

   GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2137,9 +2103,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,

   // finish round 0, with the nonce now available 
   V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
-   VF = v128_ror64( v128_xor( VF, V0 ), 16 );
+   VF = v128_ror64xor( VF, V0, 16 );
   VA = v128_add64( VA, VF );
-   V5 = v128_ror64( v128_xor( V5, VA ), 11 );
+   V5 = v128_ror64xor( V5, VA, 11 );

   // Round 1
   // G0
@@ -2147,34 +2113,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,

   // G1
   V1 = v128_add64( V1, V5 );
-   VD = v128_ror64( v128_xor( VD, V1 ), 32 );
+   VD = v128_ror64xor( VD, V1, 32 );
   V9 = v128_add64( V9, VD );
-   V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
+   V5 = v128_ror64xor( V5, V9, 25 );
   V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
                                              V5 ) );
-   VD = v128_ror64( v128_xor( VD, V1 ), 16 );
+   VD = v128_ror64xor( VD, V1, 16 );
   V9 = v128_add64( V9, VD );
-   V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
+   V5 = v128_ror64xor( V5, V9, 11 );

   // G2
   V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
-   VE = v128_ror64( v128_xor( VE, V2 ), 32 );
+   VE = v128_ror64xor( VE, V2, 32 );
   VA = v128_add64( VA, VE );
-   V6 = v128_ror64( v128_xor( V6, VA ), 25 );
+   V6 = v128_ror64xor( V6, VA, 25 );
   V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
-   VE = v128_ror64( v128_xor( VE, V2 ), 16 );
+   VE = v128_ror64xor( VE, V2, 16 );
   VA = v128_add64( VA, VE );
-   V6 = v128_ror64( v128_xor( V6, VA ), 11 );
+   V6 = v128_ror64xor( V6, VA, 11 );

   // G3
-   VF = v128_ror64( v128_xor( VF, V3 ), 32 );
+   VF = v128_ror64xor( VF, V3, 32 );
   VB = v128_add64( VB, VF );
-   V7 = v128_ror64( v128_xor( V7, VB ), 25 );
+   V7 = v128_ror64xor( V7, VB, 25 );
   V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
                                              V7 ) );
-   VF = v128_ror64( v128_xor( VF, V3 ), 16 );
+   VF = v128_ror64xor( VF, V3, 16 );
   VB = v128_add64( VB, VF );
-   V7 = v128_ror64( v128_xor( V7, VB ), 11 );
+   V7 = v128_ror64xor( V7, VB, 11 );

   // G4, G5, G6, G7
   GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
--- a/algo/blake/blake512-hash.h
+++ b/algo/blake/blake512-hash.h
@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
 #define blake512_4way_prehash_le  blake512_4x64_prehash_le
 #define blake512_4way_final_le    blake512_4x64_final_le

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 ////////////////////////////
 //
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-      blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
+      blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash,
                                      block_buf, rounds );
      for ( int lane = 0; lane < 16; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
   block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );

   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-      blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
+      blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
                                     block_buf, rounds );
      for ( int lane = 0; lane < 8; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -148,16 +148,16 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
   
 #elif defined (BLAKECOIN_4WAY)

-blake256r8_4way_context blakecoin_4w_ctx;
+blake256r8_4x32_context blakecoin_4w_ctx;

 void blakecoin_4way_hash(void *state, const void *input)
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-     blake256r8_4way_context ctx;
+     blake256r8_4x32_context ctx;

     memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
-     blake256r8_4way_update( &ctx, input + (64<<2), 16 );
-     blake256r8_4way_close( &ctx, vhash );
+     blake256r8_4x32_update( &ctx, input + (64<<2), 16 );
+     blake256r8_4x32_close( &ctx, vhash );

     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }
@@ -178,11 +178,11 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
      HTarget = 0x7f;

   v128_bswap32_intrlv80_4x32( vdata, pdata );
-   blake256r8_4way_init( &blakecoin_4w_ctx );
-   blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
+   blake256r8_4x32_init( &blakecoin_4w_ctx );
+   blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 );

   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_4way_hash( hash, vdata );

--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define BLAKECOIN_16WAY
 #elif defined(__AVX2__)
  #define BLAKECOIN_8WAY
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input )
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-     blake512_4way_context ctx;
+     blake512_4x64_context ctx;

+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, input, 80 );
+     blake512_4x64_close( &ctx, vhash );

-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, input, 80 );
-     blake512_4way_close( &ctx, vhash );
+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, vhash, 64 );
+     blake512_4x64_close( &ctx, vhash );

-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, vhash, 64 );
-     blake512_4way_close( &ctx, vhash );
+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, vhash, 64 );
+     blake512_4x64_close( &ctx, vhash );

-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, vhash, 64 );
-     blake512_4way_close( &ctx, vhash );
+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, vhash, 64 );
+     blake512_4x64_close( &ctx, vhash );

-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, vhash, 64 );
-     blake512_4way_close( &ctx, vhash );
-
-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, vhash, 64 );
-     blake512_4way_close( &ctx, vhash );
+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, vhash, 64 );
+     blake512_4x64_close( &ctx, vhash );

     memcpy( output,    hash0, 32 );
     memcpy( output+32, hash1, 32 );
--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
 	v[14] = S->f[0] ^ blake2s_IV[6];
 	v[15] = S->f[1] ^ blake2s_IV[7];

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)

   v128_t *V = (v128_t*)v;

@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
   V[3] = v128_swap64( V[3] ); \
   V[2] = v128_shufll32( V[2] )

-   BLAKE2S_ROUND(0);
-   BLAKE2S_ROUND(1);
-   BLAKE2S_ROUND(2);
-   BLAKE2S_ROUND(3);
-   BLAKE2S_ROUND(4);
-   BLAKE2S_ROUND(5);
-   BLAKE2S_ROUND(6);
-   BLAKE2S_ROUND(7);
-   BLAKE2S_ROUND(8);
-   BLAKE2S_ROUND(9);
-   
-#undef BLAKE2S_ROUND
-
 #else

 #define G(r,i,a,b,c,d) \
@@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
 		b = SPH_ROTR32(b ^ c, 7); \
 	} while(0)

-#define ROUND(r)  \
+#define BLAKE2S_ROUND(r)  \
 	do { \
 		G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
 		G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
 		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
 	} while(0)

-   ROUND( 0 );
-	ROUND( 1 );
-	ROUND( 2 );
-	ROUND( 3 );
-	ROUND( 4 );
-	ROUND( 5 );
-	ROUND( 6 );
-	ROUND( 7 );
-	ROUND( 8 );
-	ROUND( 9 );
-
 #endif

+   BLAKE2S_ROUND(0);
+   BLAKE2S_ROUND(1);
+   BLAKE2S_ROUND(2);
+   BLAKE2S_ROUND(3);
+   BLAKE2S_ROUND(4);
+   BLAKE2S_ROUND(5);
+   BLAKE2S_ROUND(6);
+   BLAKE2S_ROUND(7);
+   BLAKE2S_ROUND(8);
+   BLAKE2S_ROUND(9);
+   
+
 	for( size_t i = 0; i < 8; ++i )
 		S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];

 #undef G
-#undef ROUND
+#undef BLAKE2S_ROUND
 	return 0;
 }

--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -101,15 +101,15 @@
 { \
   Va = v128_add64( Va, v128_add64( Vb, \
                 v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
+   Vd = v128_ror64xor( Vd, Va, 32 ); \
   Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
+   Vb = v128_ror64xor( Vb, Vc, 24 ); \
 \
   Va = v128_add64( Va, v128_add64( Vb, \
                 v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
+   Vd = v128_ror64xor( Vd, Va, 16 ); \
   Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
+   Vb = v128_ror64xor( Vb, Vc, 63 ); \
 }

 #define BLAKE2B_ROUND( R ) \
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -39,16 +39,14 @@
 #include <stddef.h>
 #include "simd-utils.h"

-#define SPH_SIZE_bmw256   256
-
-#define SPH_SIZE_bmw512   512
-
 // BMW-256 4 way 32

+#if defined(__SSE2__) || defined(__ARM_NEON)
+
 typedef struct
 {
-   v128_t buf[64];
-   v128_t H[16];
+   v128u32_t buf[64];
+   v128u32_t H[16];
   size_t ptr;
   uint32_t bit_count;  // assume bit_count fits in 32 bits
 } bmw_4way_small_context;
@@ -58,13 +56,19 @@ typedef bmw_4way_small_context bmw256_4way_context;
 void bmw256_4way_init( bmw256_4way_context *ctx );

 void bmw256_4way_update(void *cc, const void *data, size_t len);
-#define bmw256_4way bmw256_4way_update

 void bmw256_4way_close(void *cc, void *dst);

 void bmw256_4way_addbits_and_close(
        void *cc, unsigned ub, unsigned n, void *dst);

+#define bmw256_4x32_context bmw256_4way_context
+#define bmw256_4x32_init    bmw256_4way_init
+#define bmw256_4x32_update  bmw256_4way_update
+#define bmw256_4x32_close   bmw256_4way_close
+
+#endif
+
 #if defined(__AVX2__)

 // BMW-256 8 way 32
@@ -85,9 +89,14 @@ void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
 #define bmw256_8way bmw256_8way_update
 void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );

+#define bmw256_8x32_context bmw256_8way_context
+#define bmw256_8x32_init    bmw256_8way_init
+#define bmw256_8x32_update  bmw256_8way_update
+#define bmw256_8x32_close   bmw256_8way_close
+
 #endif

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 // BMW-256 16 way 32

@@ -106,6 +115,11 @@ void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
                          size_t len );
 void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );

+#define bmw256_16x32_context bmw256_16way_context
+#define bmw256_16x32_init    bmw256_16way_init
+#define bmw256_16x32_update  bmw256_16way_update
+#define bmw256_16x32_close   bmw256_16way_close
+
 #endif

 // BMW-512 2 way 64
@@ -157,7 +171,7 @@ void bmw512_4way_addbits_and_close(

 #endif  // __AVX2__

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 // BMW-512 64 bit 8 way
 typedef struct
--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -45,7 +45,7 @@ extern "C"{

 #define LPAR   (

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)

 // BMW-256 4 way 32
 /*
@@ -284,9 +284,9 @@ static const uint32_t IV256[] = {
                     v128_xor( M[13], H[13] ) ) )


-void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
+void compress_small( const v128u32_t *M, const v128u32_t H[16], v128u32_t dH[16] )
 {
-   v128u64_t qt[32], xl, xh; \
+   v128u32_t qt[32], xl, xh; \

   qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
   qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
@@ -428,49 +428,25 @@ static const uint32_t final_s[16][4] =
   { 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
   { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
 };
-/*
-static const v128u64_t final_s[16] =
-{
-   { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
-   { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
-   { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
-   { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
-   { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
-   { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
-   { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
-   { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
-   { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
-   { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
-   { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
-   { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
-   { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
-   { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
-   { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
-   { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
-};
-*/
+
 void bmw256_4way_init( bmw256_4way_context *ctx )
 {
-   ctx->H[ 0] = v128_64( 0x4041424340414243 );
-   ctx->H[ 1] = v128_64( 0x4445464744454647 );
-   ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
-   ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
-   ctx->H[ 4] = v128_64( 0x5051525350515253 );
-   ctx->H[ 5] = v128_64( 0x5455565754555657 );
-   ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
-   ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
-   ctx->H[ 8] = v128_64( 0x6061626360616263 );
-   ctx->H[ 9] = v128_64( 0x6465666764656667 );
-   ctx->H[10] = v128_64( 0x68696A6B68696A6B );
-   ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
-   ctx->H[12] = v128_64( 0x7071727370717273 );
-   ctx->H[13] = v128_64( 0x7475767774757677 );
-   ctx->H[14] = v128_64( 0x78797A7B78797A7B );
-   ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
-
-
-//   for ( int i = 0; i < 16; i++ )
-//      sc->H[i] = v128_32( iv[i] );
+   ctx->H[ 0] = v128_32( 0x40414243 );
+   ctx->H[ 1] = v128_32( 0x44454647 );
+   ctx->H[ 2] = v128_32( 0x48494A4B );
+   ctx->H[ 3] = v128_32( 0x4C4D4E4F );
+   ctx->H[ 4] = v128_32( 0x50515253 );
+   ctx->H[ 5] = v128_32( 0x54555657 );
+   ctx->H[ 6] = v128_32( 0x58595A5B );
+   ctx->H[ 7] = v128_32( 0x5C5D5E5F );
+   ctx->H[ 8] = v128_32( 0x60616263 );
+   ctx->H[ 9] = v128_32( 0x64656667 );
+   ctx->H[10] = v128_32( 0x68696A6B );
+   ctx->H[11] = v128_32( 0x6C6D6E6F );
+   ctx->H[12] = v128_32( 0x70717273 );
+   ctx->H[13] = v128_32( 0x74757677 );
+   ctx->H[14] = v128_32( 0x78797A7B );
+   ctx->H[15] = v128_32( 0x7C7D7E7F );
   ctx->ptr = 0;
   ctx->bit_count = 0;
 }
@@ -478,10 +454,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
 static void
 bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
 {
-   v128u64_t *vdata = (v128u64_t*)data;
-   v128u64_t *buf;
-   v128u64_t htmp[16];
-   v128u64_t *h1, *h2;
+   v128u32_t *vdata = (v128u32_t*)data;
+   v128u32_t *buf;
+   v128u32_t htmp[16];
+   v128u32_t *h1, *h2;
   size_t ptr;
   const int buf_size = 64;  // bytes of one lane, compatible with len

@@ -503,7 +479,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
      ptr += clen;
      if ( ptr == buf_size )
      {
-         v128u64_t *ht;
+         v128u32_t *ht;
         compress_small( buf, h1, h2 );
         ht = h1;
         h1 = h2;
@@ -521,14 +497,14 @@ static void
 bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
 	void *dst, size_t out_size_w32)
 {
-   v128u64_t *buf;
-   v128u64_t h1[16], h2[16], *h;
+   v128u32_t *buf;
+   v128u32_t h1[16], h2[16], *h;
   size_t ptr, u, v;
   const int buf_size = 64;  // bytes of one lane, compatible with len

   buf = sc->buf;
   ptr = sc->ptr;
-   buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
+   buf[ ptr>>2 ] = v128_32( 0x00000080 );
   ptr += 4;
   h = sc->H;

@@ -548,7 +524,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
   for ( u = 0; u < 16; u ++ )
      buf[u] = h2[u];

-   compress_small( buf, (v128u64_t*)final_s, h1 );
+   compress_small( buf, (v128u32_t*)final_s, h1 );

   for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
      casti_v128( dst, u ) = h1[v];
@@ -1057,7 +1033,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )

 #endif // __AVX2__

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 // BMW-256 16 way 32

--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -2,12 +2,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
-//#include "sph_keccak.h"
 #include "bmw-hash-4way.h"

 #if defined(BMW512_8WAY)

-void bmw512hash_8way(void *state, const void *input)
+void bmw512hash_8way( void *state, const void *input )
 {
    bmw512_8way_context ctx;
    bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
-   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   __m512i  *noncev = (__m512i*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;
+   const int thr_id = mythr->id;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
          {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
   
 #elif defined(BMW512_4WAY)

-//#ifdef BMW512_4WAY
-
-void bmw512hash_4way(void *state, const void *input)
+void bmw512hash_4way( void *state, const void *input )
 {
    bmw512_4way_context ctx;
    bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce -  4;
-   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   const uint32_t last_nonce = max_nonce - 4;
+   __m256i  *noncev = (__m256i*)vdata + 9; 
   const uint32_t Htarg = ptarget[7];
-    int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;  

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
          {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
   return 0;
 }

+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input )
+{
+    bmw512_2x64_context ctx;
+    bmw512_2x64_init( &ctx );
+    bmw512_2x64_update( &ctx, input, 80 );
+    bmw512_2x64_close( &ctx, state );
+}
+
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
+   uint32_t hash[16*2] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[13]);   // 3*4+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 2;
+   v128_t *noncev = (v128_t*)vdata + 9;  
+   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id; 
+
+   v128_bswap32_intrlv80_2x64( vdata, pdata );
+   do {
+      *noncev = v128_intrlv_blend_32( v128_bswap32(
+                                      v128_set32( n+1, 0, n, 0 ) ), *noncev );
+
+      bmw512hash_2x64( hash, vdata );
+
+      for ( int lane = 0; lane < 2; lane++ )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
+      {
+          extr_lane_2x64( lane_hash, hash, lane, 256 );
+          if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
+          {
+              pdata[19] = n + lane;
+              submit_solution( work, lane_hash, mythr );
+          }
+      }
+      n += 2;
+
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 #endif
--- a/algo/bmw/bmw512-gate.c
+++ b/algo/bmw/bmw512-gate.c
@@ -2,7 +2,7 @@

 bool register_bmw512_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 256.0;
 #if defined (BMW512_8WAY)
  gate->scanhash  = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
 #elif defined (BMW512_4WAY)
  gate->scanhash  = (void*)&scanhash_bmw512_4way;
  gate->hash      = (void*)&bmw512hash_4way;
+#elif defined (BMW512_2WAY)
+  gate->scanhash  = (void*)&scanhash_bmw512_2x64;
+  gate->hash      = (void*)&bmw512hash_2x64;
 #else
  gate->scanhash        = (void*)&scanhash_bmw512;
  gate->hash            = (void*)&bmw512hash;
--- a/algo/bmw/bmw512-gate.h
+++ b/algo/bmw/bmw512-gate.h
@@ -4,23 +4,31 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define BMW512_8WAY 1
 #elif defined(__AVX2__)
  #define BMW512_4WAY 1
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+  #define BMW512_2WAY 1
 #endif

 #if defined(BMW512_8WAY)

 void bmw512hash_8way( void *state, const void *input );
 int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
+                          uint64_t *hashes_done, struct thr_info *mythr );

 #elif defined(BMW512_4WAY)

 void bmw512hash_4way( void *state, const void *input );
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
+                          uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input );
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );

 #else

--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #endif  // __AVX2__

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 // BMW-512 8 WAY

--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 // 4 way 128 is handy to avoid reinterleaving in many algos.
 // If reinterleaving is necessary it may be more efficient to use
--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -6,7 +6,7 @@

 #if defined(__AVX2__)

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 struct _cube_4way_context
 {
--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
    int r;
    const int rounds = sp->rounds;

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

    register __m512i x0, x1;

@@ -39,7 +39,7 @@ static void transform( cubehashParam *sp )

 #elif defined(__AVX2__)

-    register __m256i x0, x1, x2, x3, y0, y1;
+    register __m256i x0, x1, x2, x3, t0;

    x0 = _mm256_load_si256( (__m256i*)sp->x     );
    x1 = _mm256_load_si256( (__m256i*)sp->x + 1 );   
@@ -50,10 +50,10 @@ static void transform( cubehashParam *sp )
    { 
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
-        y0 = mm256_rol_32( x1, 7 );
-        y1 = mm256_rol_32( x0, 7 );
-        x0 = _mm256_xor_si256( y0, x2 );
-        x1 = _mm256_xor_si256( y1, x3 );
+        t0 = mm256_rol_32( x1, 7 );
+        x1 = mm256_rol_32( x0, 7 );
+        x0 = _mm256_xor_si256( t0, x2 );
+        x1 = _mm256_xor_si256( x1, x3 );
        x2 = mm256_swap128_64( x2 );
        x3 = mm256_swap128_64( x3 );
        x2 = _mm256_add_epi32( x0, x2 );
@@ -75,7 +75,7 @@ static void transform( cubehashParam *sp )

 #else   // AVX, SSE2, NEON

-    v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+    v128_t x0, x1, x2, x3, x4, x5, x6, x7, t0, t1;

    x0 = casti_v128( sp->x, 0 );
    x1 = casti_v128( sp->x, 1 );
@@ -92,16 +92,12 @@ static void transform( cubehashParam *sp )
       x5 = v128_add32( x1, x5 );
       x6 = v128_add32( x2, x6 );
       x7 = v128_add32( x3, x7 );
-       y0 = x2;
-       y1 = x3;
-       y2 = x0;
-       y3 = x1;
-       x0 = v128_rol32( y0, 7 );
-       x1 = v128_rol32( y1, 7 );
-       x2 = v128_rol32( y2, 7 );
-       x3 = v128_rol32( y3, 7 );
-       x0 = v128_xor( x0, x4 );
-       x1 = v128_xor( x1, x5 );
+       t0 = v128_rol32( x2, 7 );
+       t1 = v128_rol32( x3, 7 );
+       x2 = v128_rol32( x0, 7 );
+       x3 = v128_rol32( x1, 7 );
+       x0 = v128_xor( t0, x4 );
+       x1 = v128_xor( t1, x5 );
       x2 = v128_xor( x2, x6 );
       x3 = v128_xor( x3, x7 );
       x4 = v128_swap64( x4 );
@@ -112,19 +108,15 @@ static void transform( cubehashParam *sp )
       x5 = v128_add32( x1, x5 );
       x6 = v128_add32( x2, x6 );
       x7 = v128_add32( x3, x7 );
-       y0 = x1;
-       y1 = x0;
-       y2 = x3;
-       y3 = x2;
-       x0 = v128_rol32( y0, 11 );
-       x1 = v128_rol32( y1, 11 );
-       x2 = v128_rol32( y2, 11 );
-       x3 = v128_rol32( y3, 11 );
-	    x0 = v128_xor( x0, x4 );
-	    x1 = v128_xor( x1, x5 );
-	    x2 = v128_xor( x2, x6 );
-	    x3 = v128_xor( x3, x7 );
-	    x4 = v128_swap64_32( x4 );
+       t0 = v128_rol32( x1, 11 );
+       x1 = v128_rol32( x0, 11 );
+       t1 = v128_rol32( x3, 11 );
+       x3 = v128_rol32( x2, 11 );
+       x0 = v128_xor( t0, x4 );
+       x1 = v128_xor( x1, x5 );
+       x2 = v128_xor( t1, x6 );
+       x3 = v128_xor( x3, x7 );
+       x4 = v128_swap64_32( x4 );
 	    x5 = v128_swap64_32( x5 );
 	    x6 = v128_swap64_32( x6 );
 	    x7 = v128_swap64_32( x7 );
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc

 }

-
-
-HashReturn init_echo(hashState_echo *ctx, int nHashSize)
+HashReturn init_echo( hashState_echo *ctx, int nHashSize )
 {
 	int i, j;

@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
 	return SUCCESS;
 }

-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
+HashReturn update_echo( hashState_echo *state, const void *data,
+                        uint32_t databitlen )
 {
 	unsigned int uByteLength, uBlockCount, uRemainingBytes;

@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
 	return SUCCESS;
 }

-HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
+HashReturn final_echo( hashState_echo *state, void *hashval)
 {
 	v128_t remainingbits;

@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
 	return SUCCESS;
 }

-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
-                              const BitSequence *data, DataLength databitlen )
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
+                              const void *data, uint32_t databitlen )
 {
   unsigned int uByteLength, uBlockCount, uRemainingBytes;

@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
   return SUCCESS;
 }

-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
-            int nHashSize, const BitSequence *data, DataLength datalen )
+HashReturn echo_full( hashState_echo *state, void *hashval,
+            int nHashSize, const void *data, uint32_t datalen )
 {
   int i, j;

@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
        {
           // Fill the buffer
           memcpy( state->buffer + state->uBufferBytes,
-                   (void*)data, state->uBlockLength - state->uBufferBytes );
+                   data, state->uBlockLength - state->uBufferBytes );

           // Process buffer
           Compress( state, state->buffer, 1 );
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
        }

        if( uRemainingBytes > 0 )
-        memcpy(state->buffer, (void*)data, uRemainingBytes);
+        memcpy(state->buffer, data, uRemainingBytes);

        state->uBufferBytes = uRemainingBytes;
   }
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
 }


-
+#if 0
 HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
 {
 	HashReturn hRet;
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit

 	return SUCCESS;
 }
+#endif

 #endif
--- a/algo/echo/aes_ni/hash_api.h
+++ b/algo/echo/aes_ni/hash_api.h
@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);

 HashReturn reinit_echo(hashState_echo *state);

-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
+HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);

-HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
+HashReturn final_echo(hashState_echo *state, void *hashval);

-HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);

-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
-                              const BitSequence *data, DataLength databitlen );
-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
-            int nHashSize, const BitSequence *data, DataLength databitlen );
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
+                              const void *data, uint32_t databitlen );
+HashReturn echo_full( hashState_echo *state, void *hashval,
+            int nHashSize, const void *data, uint32_t databitlen );

 #endif // HASH_API_H

--- a/algo/echo/echo-hash-4way.c
+++ b/algo/echo/echo-hash-4way.c
@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
 };
 */

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 #define ECHO_SUBBYTES4(state, j) \
   state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
--- a/algo/echo/echo-hash-4way.h
+++ b/algo/echo/echo-hash-4way.h
@@ -5,7 +5,7 @@

 #include "simd-utils.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 typedef struct
 {
--- a/algo/echo/sph_echo.c
+++ b/algo/echo/sph_echo.c
@@ -36,7 +36,6 @@

 #include "sph_echo.h"

-#if !defined(__AES__)

 #ifdef __cplusplus
 extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }
 #endif 
-#endif  // !AES
--- a/algo/echo/sph_echo.h
+++ b/algo/echo/sph_echo.h
@@ -36,8 +36,6 @@
 #ifndef SPH_ECHO_H__
 #define SPH_ECHO_H__

-#if !defined(__AES__)
-
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
 #ifdef __cplusplus
 }
 #endif
-#endif // !AES
 #endif
--- a/algo/fugue/fugue-aesni.c
+++ b/algo/fugue/fugue-aesni.c
@@ -15,237 +15,176 @@
 *
 */

-#if defined(__AES__)
-
-#include <x86intrin.h>
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )

 #include <memory.h>
 #include "fugue-aesni.h"

+static const v128u64_t _supermix1a	__attribute__ ((aligned (16))) =
+   { 0x0202010807020100, 0x0a05000f06010c0b };

-MYALIGN const unsigned long long _supermix1a[]	= {0x0202010807020100, 0x0a05000f06010c0b};
-MYALIGN const unsigned long long _supermix1b[]	= {0x0b0d080703060504, 0x0e0a090c050e0f0a};
-MYALIGN const unsigned long long _supermix1c[]	= {0x0402060c070d0003, 0x090a060580808080};
-MYALIGN const unsigned long long _supermix1d[]	= {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
-MYALIGN const unsigned long long _supermix2a[]	= {0x07020d0880808080, 0x0b06010c050e0f0a};
-MYALIGN const unsigned long long _supermix4a[]	= {0x000f0a050c0b0601, 0x0302020404030e09};
-MYALIGN const unsigned long long _supermix4b[]	= {0x07020d08080e0d0d, 0x07070908050e0f0a};
-MYALIGN const unsigned long long _supermix4c[]	= {0x0706050403020000, 0x0302000007060504};
-MYALIGN const unsigned long long _supermix7a[]	= {0x010c0b060d080702, 0x0904030e03000104};
-MYALIGN const unsigned long long _supermix7b[]	= {0x8080808080808080, 0x0504070605040f06};
-//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
-//MYALIGN const unsigned char _shift_one_mask[]   = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
-//MYALIGN const unsigned char _shift_four_mask[]  = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
-//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
-//MYALIGN const unsigned char _aes_shift_rows[]   = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
-MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
-MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
+static const v128u64_t _supermix1b	__attribute__ ((aligned (16))) =
+   { 0x0b0d080703060504, 0x0e0a090c050e0f0a };

+static const v128u64_t _supermix1c	__attribute__ ((aligned (16))) =
+   { 0x0402060c070d0003, 0x090a060580808080 };

-MYALIGN const unsigned int _IV512[] = {		
-	0x00000000, 0x00000000,	0x7ea50788, 0x00000000,
+static const v128u64_t _supermix1d	__attribute__ ((aligned (16))) =
+   { 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
+
+static const v128u64_t _supermix2a	__attribute__ ((aligned (16))) =
+   { 0x07020d0880808080, 0x0b06010c050e0f0a };
+
+static const v128u64_t _supermix4a	__attribute__ ((aligned (16))) =
+   { 0x000f0a050c0b0601, 0x0302020404030e09 };
+
+static const v128u64_t _supermix4b	__attribute__ ((aligned (16))) =
+   { 0x07020d08080e0d0d, 0x07070908050e0f0a };
+
+static const v128u64_t _supermix4c	__attribute__ ((aligned (16))) =
+   { 0x0706050403020000, 0x0302000007060504 };
+
+static const v128u64_t _supermix7a	__attribute__ ((aligned (16))) =
+   { 0x010c0b060d080702, 0x0904030e03000104 };
+
+static const v128u64_t _supermix7b	__attribute__ ((aligned (16))) =
+   { 0x8080808080808080, 0x0504070605040f06 };
+
+static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
+   { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
+
+static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
+   { 0x000000001b1b0000, 0x0000000000000000 };
+
+static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
+   { 0x000000002d361b00, 0x0000000000000000 };
+
+static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
+   { 0x0303030303030303, 0x0303030303030303 };
+
+static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
+ {	0x00000000, 0x00000000,	0x7ea50788, 0x00000000,
 	0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
 	0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
 	0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
 	0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
-	0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
+	0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
+ };

-#if defined(__SSE4_1__)
+#if defined(__ARM_NEON)  // NEON path: 32-bit lane shuffles are expressed as vqtbl1q_u8 byte-table lookups
 
-#define PACK_S0(s0, s1, t1)\
-   s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
+#define mask_1000(v)         v128_put32( v, 0, 3 )  // NOTE(review): presumably writes 0 into 32-bit lane 3, matching v128_mask32( v, 8 ) on x86 - confirm
 
-#define UNPACK_S0(s0, s1, t1)\
-   s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
-   s0 = mm128_mask_32( s0, 8 )
+static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =  // byte-index table: result lanes 0..3 take source lanes 1,2,3,3
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
 
-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = s1;\
-   t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1);
+static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =  // result lanes 0..3 take source lanes 3,3,0,3
+   { 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
 
-#else   // SSE2
+static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =  // result lanes 0..3 take source lanes 3,0,3,3
+   { 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
 
-#define PACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
-   s0 = _mm_xor_si128(s0, t1);
+static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =  // result lanes 0..3 take source lanes 1,2,3,0
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
 
-#define UNPACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
-   s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
-   s0 = mm128_mask_32( s0, 8 )
+#define shuffle_3303(v)      vqtbl1q_u8( v, MASK_3303 )  // same lane selection as _mm_shuffle_epi32( v, 0xf3 )
+#define shuffle_0321(v)      vqtbl1q_u8( v, MASK_0321 )  // same lane selection as _mm_shuffle_epi32( v, 0x39 )
 
-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = _mm_shuffle_epi32(s1, 0xf9);\
-   t2 = _mm_shuffle_epi32(s2, 0xcf);\
-   t1 = _mm_xor_si128(t1, t2);\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1)
+#define CMIX( s1, s2, r1, r2, t1, t2 ) /* t1 = shuffled s1 ^ shuffled s2, xored into both r1 and r2; same lane selections as the removed SSE2 CMIX (0xf9 / 0xcf) */ \
+   t1 = vqtbl1q_u8( s1, MASK_3321 ); \
+   t2 = vqtbl1q_u8( s2, MASK_3033 ); \
+   t1 = v128_xor( t1, t2 ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );
+
+#elif defined(__SSE4_1__)  // x86 path: shuffles are selected by immediates
+
+#define mask_1000(v)         v128_mask32( v, 8 )  // NOTE(review): replaces mm128_mask_32( v, 8 ); presumably clears 32-bit lane 3 - confirm
+
+#define shuffle_3303(v)      _mm_shuffle_epi32( v, 0xf3 )  // result lanes 0..3 take source lanes 3,0,3,3
+#define shuffle_0321(v)      _mm_shuffle_epi32( v, 0x39 )  // result lanes 0..3 take source lanes 1,2,3,0
+
+#define CMIX( s1, s2, r1, r2, t1, t2 ) /* one two-source shuffle of s1 and s2, xored into both r1 and r2; t2 is not used in this variant */ \
+   t1 = s1; \
+   t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );
 
 #endif

-#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-	s10 = _mm_xor_si128(s10, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-	t1 = _mm_slli_si128(t1, 8);\
-	s8 = _mm_xor_si128(s8, t1);\
-	t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
-	s0 = _mm_xor_si128(s0, t1)
+#define PACK_S0( s0, s1, t1 ) /* t1 is accepted but not used */ \
+ s0 = v128_movlane32( s0, 3, s1, 0 ) /* NOTE(review): presumably lane 3 of s0 <- lane 0 of s1, as the removed _mm_insert_ps( .., 0x30 ) form did - confirm argument order */
 
-
-#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-	s16 = _mm_xor_si128(s16, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-	t1 = _mm_slli_si128(t1, 8);\
-	s8 = _mm_xor_si128(s8, t1);\
-	t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
-	s0 = _mm_xor_si128(s0, t1);\
-	t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
-	s4 = _mm_xor_si128(s4, t1)
+#define UNPACK_S0( s0, s1, t1 ) /* reverse of PACK_S0; t1 is accepted but not used */ \
+   s1 = v128_movlane32( s1, 0, s0, 3 ); /* presumably lane 0 of s1 <- lane 3 of s0 */ \
+   s0 = mask_1000( s0 ) /* then drop the lane that was handed back to s1 */
 
 #define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-	s22 = _mm_xor_si128(s22, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-	t1 = _mm_slli_si128(t1, 8);\
-	s8 = _mm_xor_si128(s8, t1);\
-	t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
-	s0 = _mm_xor_si128(s0, t1);\
-	t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
-	s4 = _mm_xor_si128(s4, t1);\
-	t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
-	s7 = _mm_xor_si128(s7, t1)
+	t1 = shuffle_3303( s0 ); /* only t1 is used as scratch here; t2 and t3 are untouched */ \
+	s22 = v128_xor(s22, t1);\
+	t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); /* one 32-bit message word read from msg; replaces the _mm_load_ss form */ \
+	s0 = v128_movlane32( s0, 0, t1, 0 ); /* replaces _mm_move_ss: the message word goes into the low lane of s0 */ \
+	t1 = v128_alignr64( t1, v128_zero, 1 ); /* replaces _mm_slli_si128( t1, 8 ) */ \
+	s8 = v128_xor(s8, t1);\
+	t1 = shuffle_3303( s24 ); \
+	s0 = v128_xor(s0, t1);\
+	t1 = shuffle_3303( s27 ); \
+	s4 = v128_xor(s4, t1);\
+	t1 = shuffle_3303( s30 ); \
+	s7 = v128_xor(s7, t1)

-#define PRESUPERMIX(t0, t1, t2, t3, t4)\
-   t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
-   t4 = _mm_add_epi8(t3, t3);\
-   t1 = _mm_srli_epi16(t0, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
-
-/*
-#define PRESUPERMIX(x, t1, s1, s2, t2)\
-	s1 = x;\
-	s2 = _mm_add_epi8(x, x);\
-	t2 = _mm_add_epi8(s2, s2);\
-	t1 = _mm_srli_epi16(x, 6);\
-	t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-	s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-	x  = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
-*/
-
-#define SUBSTITUTE(r0, _t2 )\
-	_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
-	_t2 = _mm_aesenclast_si128( _t2, v128_zero )
+#define SUBSTITUTE( r0, _t2 ) /* _t2 = r0 permuted by _inv_shift_rows, then one keyless AES last round; r0 itself is not modified */ \
+	_t2 = v128_shuffle8( r0, _inv_shift_rows ); /* NOTE(review): presumably pre-undoes the ShiftRows step of the AES round that follows - confirm */ \
+	_t2 = v128_aesenclast_nokey( _t2 ) /* replaces _mm_aesenclast_si128( _t2, v128_zero ) */

 #define SUPERMIX(t0, t1, t2, t3, t4)\
   t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
-   t4 = _mm_add_epi8(t3, t3);\
-   t1 = _mm_srli_epi16(t0, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
-   t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
-   t2 = v128_xor3(t2, t3, t0 );\
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
+   t3 = v128_add8( t0, t0 ); /* every byte doubled (2x), carry out of the byte discarded */ \
+   t4 = v128_add8( t3, t3 ); /* every byte doubled again (4x) */ \
+   t1 = v128_sr16( t0, 6 ); \
+   t1 = v128_and( t1, _lsbmask2 ); /* t1 = top two bits of every input byte, a value 0..3 */ \
+   t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); /* t1 indexes _mul4mask for the 4x correction byte (0x00, 0x1b, 0x36 or 0x2d) */ \
+   t4 = v128_shuffle8( t2, _supermix1b ); \
+   t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); /* 2x correction byte: 0x1b when the input byte had its top bit set, else 0 */ \
+   t1 = v128_shuffle8( t4, _supermix1c ); \
+   t4 = v128_xor( t4, t1 ); \
+   t1 = v128_shuffle8( t4, _supermix1d ); \
+   t4 = v128_xor( t4, t1 ); \
+   t1 = v128_shuffle8( t2, _supermix1a ); \
+   t2 = v128_xor3( t2, t3, t0 ); /* 1x ^ 2x ^ 4x of every input byte */ \
+   t2 = v128_shuffle8( t2, _supermix7a ); \
   t4 = v128_xor3( t4, t1, t2 ); \
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
-   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
-   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
+   t2 = v128_shuffle8( t2, _supermix7b ); \
+   t3 = v128_shuffle8( t3, _supermix2a ); \
+   t1 = v128_shuffle8( t0, _supermix4a ); \
+   t0 = v128_shuffle8( t0, _supermix4b ); \
   t4 = v128_xor3( t4, t2, t1 ); \
-   t0 = _mm_xor_si128(t0, t3);\
-   t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
-
-/*
-#define SUPERMIX(t0, t1, t2, t3, t4)\
-	PRESUPERMIX(t0, t1, t2, t3, t4);\
-	POSTSUPERMIX(t0, t1, t2, t3, t4)
-*/
-
-#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
-	t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
-	t4 = t1;\
-	t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t2 = v128_xor3(t2, t3, t0 );\
-	t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
-	t4 = _mm_xor_si128(t4, t2);\
-	t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
-	t4 = _mm_xor_si128(t4, t2);\
-	t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
-	t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
-	t0 = _mm_xor_si128(t0, t3);\
-	t4 = _mm_xor_si128(t4, t0);\
-	t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
-	t4 = _mm_xor_si128(t4, t0)
-
-#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
-	CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
-	PACK_S0(r1c, r1a, _t0);\
-	SUBSTITUTE(r1c, _t2 );\
-	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
-	r2c = _mm_xor_si128(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r2d = _mm_xor_si128(r2d, _t0);\
-	UNPACK_S0(r1c, r1a, _t3);\
-	SUBSTITUTE(r2c, _t2 );\
-	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
-	r3c = _mm_xor_si128(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r3d = _mm_xor_si128(r3d, _t0);\
-	UNPACK_S0(r2c, r2a, _t3);\
-	SUBSTITUTE(r3c, _t2 );\
-	SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-	UNPACK_S0(r3c, r3a, _t3)
+   t0 = v128_xor( t0, t3 ); \
+   t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) ); /* t4 holds the result; t0..t3 are clobbered as scratch */

 #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
 	CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
 	PACK_S0(r1c, r1a, _t0);\
 	SUBSTITUTE( r1c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
-	r2c = _mm_xor_si128(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r2d = _mm_xor_si128(r2d, _t0);\
+	_t0 = shuffle_0321( r1c ); \
+	r2c = v128_xor(r2c, _t0);\
+   _t0 = mask_1000( _t0 ); \
+	r2d = v128_xor(r2d, _t0);\
 	UNPACK_S0(r1c, r1a, _t3);\
 	SUBSTITUTE(r2c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
-	r3c = _mm_xor_si128(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r3d = _mm_xor_si128(r3d, _t0);\
+	_t0 = shuffle_0321( r2c ); \
+	r3c = v128_xor(r3c, _t0);\
+   _t0 = mask_1000( _t0 ); \
+	r3d = v128_xor(r3d, _t0);\
 	UNPACK_S0(r2c, r2a, _t3);\
 	SUBSTITUTE( r3c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-	_t0 = _mm_shuffle_epi32(r3c, 0x39);\
-	r4c = _mm_xor_si128(r4c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r4d = _mm_xor_si128(r4d, _t0);\
+	_t0 = shuffle_0321( r3c ); \
+	r4c = v128_xor(r4c, _t0);\
+   _t0 = mask_1000( _t0 ); \
+	r4d = v128_xor(r4d, _t0);\
 	UNPACK_S0(r3c, r3a, _t3);\
 	SUBSTITUTE( r4c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
 	block[1] = col[(base + a + 1) % s];\
 	block[2] = col[(base + a + 2) % s];\
 	block[3] = col[(base + a + 3) % s];\
-	x = _mm_load_si128((__m128i*)block)
+	x = v128_load( (v128_t*)block )

 #define STORECOLUMN(x, s)\
-	_mm_store_si128((__m128i*)block, x);\
+	v128_store((v128_t*)block, x );\
 	col[(base + 0) % s] = block[0];\
 	col[(base + 1) % s] = block[1];\
 	col[(base + 2) % s] = block[2];\
 	col[(base + 3) % s] = block[3]

-void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
+void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
+                  unsigned int uBlockCount )
 {
-   __m128i _t0, _t1, _t2, _t3;
+   v128_t _t0, _t1, _t2, _t3;

   switch(ctx->base)
   {
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
      pmsg += 4;
      uBlockCount--;
   }
-
 }

-void Final512(hashState_fugue *ctx, BitSequence *hashval)
+void Final512( hashState_fugue *ctx, uint8_t *hashval )  // runs the closing rounds on ctx->state and writes 4 vectors (64 bytes) to hashval
 {
   unsigned int block[4] __attribute__ ((aligned (32)));
   unsigned int col[36] __attribute__ ((aligned (16)));
 	unsigned int i, base;
-	__m128i r0, _t0, _t1, _t2, _t3;
+	v128_t r0, _t0, _t1, _t2, _t3;  // r0: working column group; _t0.._t3: scratch for SUBSTITUTE / SUPERMIX
 
-	for(i = 0; i < 12; i++)
+	for( i = 0; i < 12; i++ )  // flatten the state: words 0..2 of each of the 12 vectors -> col[0..35]
 	{
-		_mm_store_si128((__m128i*)block, ctx->state[i]);
+		v128_store( (v128_t*)block, ctx->state[i] );  // block[3] is stored as well but not copied to col[]
 
 		col[3 * i + 0] = block[0];
 		col[3 * i + 1] = block[1];
 		col[3 * i + 2] = block[2];
 	}
 
-	base = (36 - (12 * ctx->base)) % 36;
+	base = ( 36 - (12 * ctx->base) ) % 36;  // NOTE(review): presumably turns the rotation index kept by Compress512 into a column offset - confirm
 
-	for(i = 0; i < 32; i++)
+	for( i = 0; i < 32; i++ )  // 32 rounds of ROR3 + CMIX + SMIX
 	{
 		// ROR3
 		base = (base + 33) % 36;
 
 		// CMIX
-		col[(base +  0) % 36] ^= col[(base + 4) % 36];
-		col[(base +  1) % 36] ^= col[(base + 5) % 36];
-		col[(base +  2) % 36] ^= col[(base + 6) % 36];
-		col[(base +  18) % 36] ^= col[(base + 4) % 36];
-		col[(base +  19) % 36] ^= col[(base + 5) % 36];
-		col[(base +  20) % 36] ^= col[(base + 6) % 36];
+		col[ (base +  0) % 36 ] ^= col[ (base + 4) % 36 ];
+		col[ (base +  1) % 36 ] ^= col[ (base + 5) % 36 ];
+		col[ (base +  2) % 36 ] ^= col[ (base + 6) % 36 ];
+		col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
+		col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
+		col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
 
 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );  // SMIX = load four columns at base, SUBSTITUTE + SUPERMIX them, store them back
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );
 	}
 
-	for(i = 0; i < 13; i++)
+	for( i = 0; i < 13; i++ )  // 13 iterations, each made of four xor / rotate / SMIX steps
 	{
 		// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base +  9) % 36] ^= col[(base + 0) % 36];
-		col[(base + 18) % 36] ^= col[(base + 0) % 36];
-		col[(base + 27) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];  // the "+=" in the step comments is implemented as xor
+		col[ (base +  9) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
 
 		// ROR9
 		base = (base + 27) % 36;
 
 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );
 
 		// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base + 10) % 36] ^= col[(base + 0) % 36];
-		col[(base + 18) % 36] ^= col[(base + 0) % 36];
-		col[(base + 27) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
 
 		// ROR9
 		base = (base + 27) % 36;
 
 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );
 
 		// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base + 10) % 36] ^= col[(base + 0) % 36];
-		col[(base + 19) % 36] ^= col[(base + 0) % 36];
-		col[(base + 27) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
 
 		// ROR9
 		base = (base + 27) % 36;
 
 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );
 
 		// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base + 10) % 36] ^= col[(base + 0) % 36];
-		col[(base + 19) % 36] ^= col[(base + 0) % 36];
-		col[(base + 28) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
 
 		// ROR8
 		base = (base + 28) % 36;
 
 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );
 	}
 
 	// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
-	col[(base +  4) % 36] ^= col[(base + 0) % 36];
-	col[(base +  9) % 36] ^= col[(base + 0) % 36];
-	col[(base + 18) % 36] ^= col[(base + 0) % 36];
-	col[(base + 27) % 36] ^= col[(base + 0) % 36];
+	col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+	col[ (base +  9) % 36 ] ^= col[ (base + 0) % 36 ];
+	col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
+	col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
 
 	// Transform to the standard basis and store output; S1 || S2 || S3 || S4
-	LOADCOLUMN(r0, 36, 1);
-	_mm_store_si128((__m128i*)hashval, r0);
+	LOADCOLUMN( r0, 36, 1 );
+	v128_store( (v128_t*)hashval, r0 );  // NOTE(review): if v128_store is an aligned store, hashval must be 16-byte aligned - confirm
 
 	// Transform to the standard basis and store output; S9 || S10 || S11 || S12
-	LOADCOLUMN(r0, 36, 9);
-	_mm_store_si128((__m128i*)hashval + 1, r0);
+	LOADCOLUMN( r0, 36, 9 );
+	v128_store( (v128_t*)hashval + 1, r0 );  // "+ 1" is applied after the cast, so it advances by one 16-byte vector
 
 	// Transform to the standard basis and store output; S18 || S19 || S20 || S21
-	LOADCOLUMN(r0, 36, 18);
-	_mm_store_si128((__m128i*)hashval + 2, r0);
+	LOADCOLUMN( r0, 36, 18 );
+	v128_store( (v128_t*)hashval + 2, r0 );
 
 	// Transform to the standard basis and store output; S27 || S28 || S29 || S30
-	LOADCOLUMN(r0, 36, 27);
-	_mm_store_si128((__m128i*)hashval + 3, r0);
+	LOADCOLUMN( r0, 36, 27 );
+	v128_store( (v128_t*)hashval + 3, r0 );
 }

-HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
+int fugue512_Init( hashState_fugue *ctx, int nHashSize )
 {
 	int i;
 	ctx->processed_bits = 0;
@@ -487,18 +426,18 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
 	for(i = 0; i < 6; i++)
 		ctx->state[i] = v128_zero;

-	ctx->state[6]  = _mm_load_si128((__m128i*)_IV512 + 0);
-	ctx->state[7]  = _mm_load_si128((__m128i*)_IV512 + 1);
-	ctx->state[8]  = _mm_load_si128((__m128i*)_IV512 + 2);
-	ctx->state[9]  = _mm_load_si128((__m128i*)_IV512 + 3);
-	ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
-	ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
+	ctx->state[6]  = casti_v128( _IV512, 0 );
+	ctx->state[7]  = casti_v128( _IV512, 1 );
+	ctx->state[8]  = casti_v128( _IV512, 2 );
+	ctx->state[9]  = casti_v128( _IV512, 3 );
+	ctx->state[10] = casti_v128( _IV512, 4 );
+	ctx->state[11] = casti_v128( _IV512, 5 );

-	return SUCCESS;
+	return 0;
 }

-
-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
+int fugue512_Update( hashState_fugue *state, const void *data,
+                            uint64_t databitlen )
 {
 	unsigned int uByteLength, uBlockCount, uRemainingBytes;

@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
 		if(state->uBufferBytes != 0)
 		{
 			// Fill the buffer
-			memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
+			memcpy( state->buffer + state->uBufferBytes, (void*)data,
+                 state->uBlockLength - state->uBufferBytes );

 			// Process the buffer
 			Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
 		state->uBufferBytes += uByteLength;
 	}

-	return SUCCESS;
+	return 0;
 }

-HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
+int fugue512_Final( hashState_fugue *state, void *hashval )
 {
 	unsigned int i;
-	BitSequence lengthbuf[8] __attribute__((aligned(64)));
+	uint8_t lengthbuf[8] __attribute__((aligned(64)));

 	// Update message bit count
 	state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
 	// Finalization
 	Final512(state, hashval);

-	return SUCCESS;
+	return 0;
 }


-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,  // one-shot helper: Init, Update, Final on a single buffer
+                   uint64_t databitlen )  // NOTE: despite the name this is used as a byte count - it is multiplied by 8 below
 {
-	fugue512_Init(hs, 512);
-	fugue512_Update(hs, data, databitlen*8);
-	fugue512_Final(hs, hashval);
-	return SUCCESS;
+	fugue512_Init( hs, 512 );  // resets hs, so any earlier content of the context is discarded
+	fugue512_Update( hs, data, databitlen*8 );  // converted to the bit length fugue512_Update takes
+	fugue512_Final( hs, hashval );
+	return 0;  // always 0; the results of the three calls above are not checked
 }

 #endif  // AES
--- a/algo/fugue/fugue-aesni.h
+++ b/algo/fugue/fugue-aesni.h
@@ -14,37 +14,31 @@
 #ifndef FUGUE_HASH_API_H
 #define FUGUE_HASH_API_H

-#if defined(__AES__) 
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )

-#if !defined(__SSE4_1__)
-#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
-#endif
-
-#include "compat/sha3_common.h"
 #include "simd-utils.h"

-
 typedef struct
 {
-	__m128i			state[12];
+	v128_t			state[12];  // hash state as 12 vectors; fugue512_Init zeroes 0..5 and loads 6..11 from _IV512
 	unsigned int	base;
-
 	unsigned int	uHashSize;
 	unsigned int	uBlockLength;
 	unsigned int	uBufferBytes;
-	DataLength		processed_bits;
-	BitSequence		buffer[4];
+	uint64_t 		processed_bits;  // message length in bits; fugue512_Final adds uBufferBytes * 8 to it
+	uint8_t  		buffer[4];  // pending input bytes; fugue512_Update fills it up to uBlockLength, then compresses it as one block
 
 } hashState_fugue __attribute__ ((aligned (64)));


 // These functions are deprecated, use the lower case macro aliases that use
 // the standard interface. This will be cleaned up at a later date.
-HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
+int fugue512_Init( hashState_fugue *state, int hashbitlen );

-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
+int fugue512_Update( hashState_fugue *state, const void *data,
+                     uint64_t databitlen );

-HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
+int fugue512_Final( hashState_fugue *state, void *hashval );

 #define fugue512_init( state ) \
   fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
   fugue512_Final


-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
+                   uint64_t databitlen);

 #endif // AES
 #endif // HASH_API_H
--- a/algo/gost/sph_gost.c
+++ b/algo/gost/sph_gost.c
@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)

 static void AddXor512(const void *a,const void *b,void *c)
 {
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
                                           casti_m512i( b, 0 ) );
 #elif defined(__AVX2__)
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
                                           casti_m256i( b, 0 ) );
   casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
                                           casti_m256i( b, 1 ) );
-#elif defined(__SSE2__)
-   casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
-                                        casti_m128i( b, 0 ) );
-   casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
-                                        casti_m128i( b, 1 ) );
-   casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
-                                        casti_m128i( b, 2 ) );
-   casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
-                                        casti_m128i( b, 3 ) );
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+   casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
+                                  casti_v128( b, 0 ) );
+   casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
+                                  casti_v128( b, 1 ) );
+   casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
+                                  casti_v128( b, 2 ) );
+   casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
+                                  casti_v128( b, 3 ) );
 #else
   const unsigned long long *A=a, *B=b;
 	unsigned long long *C=c;
--- a/algo/groestl/aes_ni/groestl-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl-intr-aes.h
@@ -60,54 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };

 #if defined(__ARM_NEON)
 
-// No fast shuffle on NEON
-//static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };  
-static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =  // byte-index table: result lanes 0..3 take source lanes 0,2,1,3
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
 
-#define gr_shuffle32( v )      v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
-
-/*
-#define TRANSP_MASK \
-     0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
-#define SUBSH_MASK0 \
-     0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
-#define SUBSH_MASK1 \
-     0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
-#define SUBSH_MASK2 \
-     0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
-#define SUBSH_MASK3 \
-     0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
-#define SUBSH_MASK4  \
-     0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
-#define SUBSH_MASK5 \
-     0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
-#define SUBSH_MASK6 \
-     0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
-#define SUBSH_MASK7 \
-     0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
-
-//#define gr_shuffle8( v, c )    v128_shullfev8( v, c )
-
-
-#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
-                        c07, c06, c05, c04, c03, c02, c01, c00 ) \
-  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
-  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
-  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
-  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
-    v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
-       11, v, c11 ), 10, v, c10 ),  9, v, c09 ),  8, v, c08 ), \
-        7, v, c07 ),  6, v, c06 ),  5, v, c05 ),  4, v, c04 ), \
-        3, v, c03 ),  2, v, c02 ),  1, v, c01 ),  0, v, c00 )
-*/
+#define gr_shuffle32(v)       vqtbl1q_u8( v, gr_mask )  // swaps the two middle 32-bit lanes with one table lookup; replaces the reverse-and-blend form
 
 #else
 
-#define gr_shuffle32( v )       _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32(v)       _mm_shuffle_epi32( v, 0xd8 )  // 0xd8: result lanes 0..3 take source lanes 0,2,1,3
 
 #endif

-
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -140,7 +103,7 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
   K. Matusiewicz, 2011/05/29 */

-#if defined(__AVX512VL__)
+#if defined(VL256)

 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
@@ -334,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
 */
 #define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* SubBytes */\
-  b0 = v128_xor(b0, b0);\
-  a0 = v128_aesenclast(a0, b0);\
-  a1 = v128_aesenclast(a1, b0);\
-  a2 = v128_aesenclast(a2, b0);\
-  a3 = v128_aesenclast(a3, b0);\
-  a4 = v128_aesenclast(a4, b0);\
-  a5 = v128_aesenclast(a5, b0);\
-  a6 = v128_aesenclast(a6, b0);\
-  a7 = v128_aesenclast(a7, b0);\
+  a0 = v128_aesenclast_nokey( a0 ); /* keyless variant: b0 is no longer zeroed here to serve as an all-zero round key */ \
+  a1 = v128_aesenclast_nokey( a1 ); \
+  a2 = v128_aesenclast_nokey( a2 ); \
+  a3 = v128_aesenclast_nokey( a3 ); \
+  a4 = v128_aesenclast_nokey( a4 ); \
+  a5 = v128_aesenclast_nokey( a5 ); \
+  a6 = v128_aesenclast_nokey( a6 ); \
+  a7 = v128_aesenclast_nokey( a7 ); \
  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
+  MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); /* same arguments as before, only respaced */ \
 }

 #define ROUNDS_P(){\
@@ -362,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
    xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
    xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
-    /* SubBytes + MixBytes */\
+     /* SubBytes + MixBytes */\
    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7 ); \
-    \
    /* AddRoundConstant P1024 */\
    xmm0 = v128_xor( xmm0, \
             casti_v128( round_const_p, round_counter+1 ) ); \
@@ -467,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
  t1 = v128_unpackhi16(t1, i3);\
  i2 = v128_unpacklo16(i2, i3);\
  i0 = v128_unpacklo16(i0, i1);\
-\
  /* shuffle with immediate */\
  t0 = gr_shuffle32( t0 ); \
  t1 = gr_shuffle32( t1 ); \
@@ -477,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
  i2 = gr_shuffle32( i2 ); \
  i4 = gr_shuffle32( i4 ); \
  i6 = gr_shuffle32( i6 ); \
-\
  /* continue with unpack */\
  t4 = i0;\
  i0 = v128_unpacklo32(i0, i2);\
@@ -584,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
  /* transpose done */\
 }/**/

-
+#if 0
+// INIT is not used anywhere, so it is compiled out.
 void INIT( v128_t* chaining )
 {
  static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -613,6 +573,7 @@ void INIT( v128_t* chaining )
  chaining[6] = xmm14;
  chaining[7] = xmm15;
 }
+#endif

 void TF1024( v128_t* chaining, const v128_t* message )
 {
--- a/algo/groestl/aes_ni/groestl256-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl256-intr-aes.h
@@ -1,3 +1,6 @@
+#if !defined GROESTL256_INTR_AES_H__
+#define GROESTL256_INTR_AES_H__
+
 /* groestl-intr-aes.h     Aug 2011
 *
 * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };

 #if defined(__ARM_NEON)

-// No fast shuffle on NEON
-static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };

-#define gr_shuffle32( v )       v128_shufflev32( v, vmask_d8 )
+#define gr_shuffle32(v)       vqtbl1q_u8( v, gr_mask ) 

 #else

-#define gr_shuffle32( v )       _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32(v)       _mm_shuffle_epi32( v, 0xd8 )

 #endif

-
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -93,7 +95,7 @@ static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };
   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
   K. Matusiewicz, 2011/05/29 */

-#if defined(__AVX512VL__)
+#if defined(VL256)

 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
  chaining[3] = xmm11;
 }

-
+#endif
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
   const int hash_offset = SIZE512 - hashlen_m128i;
   uint64_t blocks = len / SIZE512;
   v128_t* in = (v128_t*)input;
-
+   
   // digest any full blocks, process directly from input 
   for ( i = 0; i < blocks; i++ )
      TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );
+
   OF1024( ctx->chaining );

   // store hash result in output 
--- a/algo/groestl/aes_ni/hash-groestl.h
+++ b/algo/groestl/aes_ni/hash-groestl.h
@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
 int update_and_final_groestl( hashState_groestl*,  void*, const void*, int );
 int groestl512( hashState_groestl*,  void*, const void*, uint64_t );
 #define groestl512_full   groestl512
+#define groestl512_ctx    groestl512


 #endif /* __hash_h */
--- a/algo/groestl/groestl-gate.h
+++ b/algo/groestl/groestl-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
  #define GROESTL_4WAY_VAES 1
 #endif

--- a/algo/groestl/groestl256-hash-4way.c
+++ b/algo/groestl/groestl256-hash-4way.c
@@ -17,7 +17,7 @@

 #if defined(__AVX2__) && defined(__VAES__)

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)


 int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
--- a/algo/groestl/groestl256-hash-4way.h
+++ b/algo/groestl/groestl256-hash-4way.h
@@ -43,7 +43,7 @@

 #define SIZE256 (SIZE_512/16)

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 typedef struct {
  __attribute__ ((aligned (128))) __m512i chaining[SIZE256];
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
   { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
 };

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                     0x1d1519111c141810, 0x1f171b131e161a12,
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =

 #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
+  b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
  a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
  a1 = _mm256_xor_si256( a1, b1 );\
  a2 = _mm256_xor_si256( a2, b1 );\
--- a/algo/groestl/groestl512-hash-4way.c
+++ b/algo/groestl/groestl512-hash-4way.c
@@ -17,7 +17,7 @@

 #if defined(__AVX2__) && defined(__VAES__)

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
 {
--- a/algo/groestl/groestl512-hash-4way.h
+++ b/algo/groestl/groestl512-hash-4way.h
@@ -33,7 +33,7 @@

 #define SIZE512 (SIZE_1024/16)

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 typedef struct {
  __attribute__ ((aligned (128))) __m512i chaining[SIZE512];
--- a/algo/groestl/groestl512-intr-4way.h
+++ b/algo/groestl/groestl512-intr-4way.h
@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
   { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
 };

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                     0x1d1519111c141810, 0x1f171b131e161a12,
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
  { \
    /* AddRoundConstant P1024 */\
    xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
-             casti_m128i( round_const_p, round_counter ) ) ); \
+             casti_v128u32( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK0 ); \
    xmm9  = _mm512_shuffle_epi8( xmm9,  SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    \
     /* AddRoundConstant P1024 */\
    xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
-             casti_m128i( round_const_p, round_counter+1 ) ) ); \
+             casti_v128u32( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
    xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
    xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
-                 casti_m128i( round_const_q, round_counter ) ) ); \
+                 casti_v128u32( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK1 );\
    xmm9  = _mm512_shuffle_epi8( xmm9,  SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
    xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
    xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
-             casti_m128i( round_const_q, round_counter+1 ) ) ); \
+             casti_v128u32( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
  { \
    /* AddRoundConstant P1024 */\
    xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
-             casti_m128i( round_const_p, round_counter ) ) ); \
+             casti_v128u32( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8  = _mm256_shuffle_epi8( xmm8,  SUBSH_MASK0_2WAY ); \
    xmm9  = _mm256_shuffle_epi8( xmm9,  SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    \
     /* AddRoundConstant P1024 */\
    xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
-             casti_m128i( round_const_p, round_counter+1 ) ) ); \
+             casti_v128u32( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
    xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
    xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
    xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
-                 casti_m128i( round_const_q, round_counter ) ) ); \
+                 casti_v128u32( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8  = _mm256_shuffle_epi8( xmm8,  SUBSH_MASK1_2WAY );\
    xmm9  = _mm256_shuffle_epi8( xmm9,  SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
    xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
    xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
-             casti_m128i( round_const_q, round_counter+1 ) ) ); \
+             casti_v128u32( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
    xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -17,7 +17,7 @@ typedef struct {
 #else
   hashState_groestl       groestl;
 #endif
-   sha256_8way_context     sha;
+   sha256_8x32_context     sha;
 } myrgr_8way_ctx_holder;

 myrgr_8way_ctx_holder myrgr_8way_ctx;
@@ -29,7 +29,7 @@ void init_myrgr_8way_ctx()
 #else
     init_groestl( &myrgr_8way_ctx.groestl, 64 );
 #endif
-     sha256_8way_init( &myrgr_8way_ctx.sha );
+     sha256_8x32_init( &myrgr_8way_ctx.sha );
 }

 void myriad_8way_hash( void *output, const void *input )
@@ -96,8 +96,8 @@ void myriad_8way_hash( void *output, const void *input )
     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
                       hash6, hash7 );
     
-     sha256_8way_update( &ctx.sha, vhash, 64 );
-     sha256_8way_close( &ctx.sha, output );
+     sha256_8x32_update( &ctx.sha, vhash, 64 );
+     sha256_8x32_close( &ctx.sha, output );
 }

 int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
@@ -156,7 +156,7 @@ int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,

 typedef struct {
    hashState_groestl       groestl;
-    sha256_4way_context     sha;
+    sha256_4x32_context     sha;
 } myrgr_4way_ctx_holder;

 myrgr_4way_ctx_holder myrgr_4way_ctx;
@@ -164,7 +164,7 @@ myrgr_4way_ctx_holder myrgr_4way_ctx;
 void init_myrgr_4way_ctx()
 {
     init_groestl (&myrgr_4way_ctx.groestl, 64 );
-     sha256_4way_init( &myrgr_4way_ctx.sha );
+     sha256_4x32_init( &myrgr_4way_ctx.sha );
 }

 void myriad_4way_hash( void *output, const void *input )
@@ -189,8 +189,8 @@ void myriad_4way_hash( void *output, const void *input )

     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

-     sha256_4way_update( &ctx.sha, vhash, 64 );
-     sha256_4way_close( &ctx.sha, output );
+     sha256_4x32_update( &ctx.sha, vhash, 64 );
+     sha256_4x32_close( &ctx.sha, output );
 }

 int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,

   v128_bswap32_intrlv80_4x32( vdata, pdata );
   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );

      myriad_4way_hash( hash, vdata );
      pdata[19] = n;
--- a/algo/groestl/myrgr-gate.c
+++ b/algo/groestl/myrgr-gate.c
@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
  init_myrgr_ctx();
  gate->scanhash  = (void*)&scanhash_myriad;
  gate->hash      = (void*)&myriad_hash;
-  gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
+  gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
 #endif
  return true;
 };
--- a/algo/groestl/myrgr-gate.h
+++ b/algo/groestl/myrgr-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
  #define MYRGR_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
  #define MYRGR_4WAY 1
--- a/algo/groestl/sph_groestl.c
+++ b/algo/groestl/sph_groestl.c
@@ -35,8 +35,6 @@

 #include "sph_groestl.h"

-#if !defined(__AES__)
-
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }

-#endif  // !AES
 #endif
--- a/algo/groestl/sph_groestl.h
+++ b/algo/groestl/sph_groestl.h
@@ -42,7 +42,6 @@ extern "C"{
 #include <stddef.h>
 #include "compat/sph_types.h"

-#if !defined(__AES__)   
 /**
 * Output size (in bits) for Groestl-224.
 */
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
 }
 #endif

-#endif  // !AES
 #endif
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -382,12 +382,12 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
 #define S1F   MF


-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 // Hamsi 8 way AVX512 

 // Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
-// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
+// timing. However, when hashing X13 on an i9-9940x, using cmplt with zero
 // had a 3% faster overall hashrate than than using movepi. 

 #define INPUT_BIG8 \
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
  tb = mm512_xoror( b, d, a ); \
  a = _mm512_xor_si512( a, c ); \
  b = mm512_xoror( td, tb, a ); \
-  td = mm512_xorand( a, td, tb ); \
+  d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
  a = c; \
-  c = mm512_xor3( tb, b, td ); \
-  d = mm512_not( td ); \
+  c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
 }

-
 /*
 #define SBOX8( a, b, c, d ) \
 do { \
@@ -1061,7 +1059,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
   WRITE_STATE_BIG8( sc );
 }

-void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
+void hamsi_8way_big_final( hamsi512_8x64_context *sc, __m512i *buf )
 {
   __m512i m0, m1, m2, m3, m4, m5, m6, m7;

@@ -1073,7 +1071,7 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
   WRITE_STATE_BIG8( sc );
 }

-void hamsi512_8way_init( hamsi_8way_big_context *sc )
+void hamsi512_8x64_init( hamsi512_8x64_context *sc )
 {
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;
@@ -1089,7 +1087,7 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
   sc->h[7] = v512_64( iv[7] );
   }

-void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
+void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
                           size_t len )
 {
   __m512i *vdata = (__m512i*)data;
@@ -1101,7 +1099,7 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
   sc->partial_len = len;
 }

-void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
+void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst )
 {
   __m512i pad[1];
   uint32_t ch, cl;
@@ -1122,7 +1120,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )

 // Hamsi 4 way AVX2

-#if defined(__AVX512VL__)
+#if defined(VL256)

 #define INPUT_BIG \
 do { \
@@ -1155,11 +1153,99 @@ do { \
  b = mm256_xoror( td, tb, a ); \
  d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
  a = c; \
-  c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
+  c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
 }

 #else

+#define INPUT_BIG_sub( db_i ) \
+{ /* One expansion step; zero, tp and m0..m7 come from the enclosing INPUT_BIG. */ \
+     const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); /* lane mask: all ones where 0 > db_i */ \
+     m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); /* m0 ^= dm & v256_64( tp[0] ) */ \
+     m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
+     m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
+     m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
+     m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
+     m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
+     m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
+     m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
+     tp += 8; /* step past the 8 table words just used */ \
+}
+
+#define INPUT_BIG \
+{ /* Expand *buf into m0..m7 with 64 explicitly unrolled steps, one per bit of each 64-bit lane. */ \
+  const __m256i db = *buf; \
+  const __m256i zero = m256_zero; \
+  const uint64_t *tp = (const uint64_t*)T512; /* T512 read as 64-bit words, 8 per step */ \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); /* bit 0 shifted up to the sign position */ \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
+  INPUT_BIG_sub( db ); /* bit 63 is already in the sign position */ \
+}
+
+#if 0
+// Disabled loop form of INPUT_BIG; it depended on the compiler unrolling the loop.
 #define INPUT_BIG \
 do { \
  __m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
     tp += 8; \
  } \
 } while (0)
+#endif

 // v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
 #define SBOX( a, b, c, d ) \
@@ -1219,7 +1306,7 @@ do { \
 do { \
   a = mm256_rol_32( a, 13 ); \
   c = mm256_rol_32( c,  3 ); \
-   b = mm256_xor3( a, b, c ); \
+   b = mm256_xor3( b, a, c ); \
   d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
   b = mm256_rol_32( b, 1 ); \
   d = mm256_rol_32( d, 7 ); \
@@ -1501,7 +1588,7 @@ do { /* order is important */ \
   sc->h[14] = CE; \
   sc->h[15] = CF;

-#if defined(__AVX512VL__)
+#if defined(VL256)

 #define INPUT_8X32 \
 { \
@@ -1857,7 +1944,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst,

 ////////////

-void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
+void hamsi_big( hamsi512_4x64_context *sc, __m256i *buf, size_t num )
 {
   DECL_STATE_BIG
   uint32_t tmp;
@@ -1881,7 +1968,7 @@ void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
   WRITE_STATE_BIG( sc );
 }

-void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
+void hamsi_big_final( hamsi512_4x64_context *sc, __m256i *buf )
 {
   __m256i m0, m1, m2, m3, m4, m5, m6, m7;
   DECL_STATE_BIG
@@ -1892,7 +1979,7 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
   WRITE_STATE_BIG( sc );
 }

-void hamsi512_4way_init( hamsi_4way_big_context *sc )
+void hamsi512_4x64_init( hamsi512_4x64_context *sc )
 {
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;
@@ -1907,7 +1994,7 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
   sc->h[7] = v256_64( iv[7] );
 }

-void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
+void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
      size_t len )
 {
   __m256i *vdata = (__m256i*)data;
@@ -1919,7 +2006,7 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
   sc->partial_len = len;
 }

-void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
+void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst )
 {
   __m256i pad[1];
   uint32_t ch, cl;
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
   sc->h[6] = c6; \
   sc->h[7] = c7;

+#define INPUT_2x64_sub( db_i ) \
+{ /* One expansion step; zero, tp and m0..m7 come from the enclosing INPUT_2x64. */ \
+     const v128u64_t dm = v128_cmpgt64( zero, db_i ); /* presumably a lane mask for 0 > db_i - confirm in simd-utils */ \
+     m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); /* m0 = xor( m0, and( dm, v128_64( tp[0] ) ) ) */ \
+     m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
+     m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
+     m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
+     m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
+     m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
+     m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
+     m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
+     tp += 8; /* step past the 8 table words just used */ \
+}
+
+#define INPUT_2x64 \
+{ /* Expand *buf into m0..m7 with 64 explicitly unrolled steps, shift counts 63 down to 0. */ \
+  const v128u64_t db = *buf; \
+  const v128u64_t zero = v128_zero; \
+  const uint64_t *tp = (const uint64_t*)T512; /* T512 read as 64-bit words, 8 per step */ \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
+  INPUT_2x64_sub( v128_sl64( db,63 ) ); /* presumably a 64-bit lane left shift, so bit 0 lands in the top bit - confirm */ \
+  INPUT_2x64_sub( v128_sl64( db,62 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,61 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,60 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,59 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,58 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,57 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,56 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,55 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,54 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,53 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,52 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,51 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,50 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,49 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,48 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,47 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,46 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,45 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,44 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,43 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,42 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,41 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,40 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,39 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,38 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,37 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,36 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,35 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,34 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,33 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,32 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,31 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,30 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,29 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,28 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,27 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,26 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,25 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,24 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,23 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,22 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,21 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,20 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,19 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,18 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,17 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,16 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,15 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,14 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,13 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,12 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,11 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,10 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
+  INPUT_2x64_sub( db ); /* final step uses db unshifted */ \
+}
+
+#if 0
+// Disabled loop form of INPUT_2x64; it depended on the compiler unrolling the loop.
 #define INPUT_2x64 \
 { \
  v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
     tp += 8; \
  } \
 }
+#endif

 // v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
 #define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
 { \
   a = v128_rol32( a, 13 ); \
   c = v128_rol32( c,  3 ); \
-   b = v128_xor3( a, b, c ); \
+   b = v128_xor3( c, a, b ); \
   d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
   b = v128_rol32( b, 1 ); \
   d = v128_rol32( d, 7 ); \
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -72,17 +72,17 @@ typedef struct
   size_t partial_len;
   uint32_t count_high, count_low;
 } hamsi_4way_big_context;
-typedef hamsi_4way_big_context hamsi512_4way_context;
+typedef hamsi_4way_big_context hamsi512_4x64_context;

-void hamsi512_4way_init( hamsi512_4way_context *sc );
-void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
+void hamsi512_4x64_init( hamsi512_4x64_context *sc );
+void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
      size_t len );
-void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
+void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst );

-#define hamsi512_4x64_context   hamsi512_4way_context
-#define hamsi512_4x64_init      hamsi512_4way_init
-#define hamsi512_4x64_update    hamsi512_4way_update
-#define hamsi512_4x64_close     hamsi512_4way_close
+#define hamsi512_4way_context   hamsi512_4x64_context
+#define hamsi512_4way_init      hamsi512_4x64_init
+#define hamsi512_4way_update    hamsi512_4x64_update
+#define hamsi512_4way_close     hamsi512_4x64_close

 // Hamsi-512 8x32

@@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,

 #endif

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 // Hamsi-512 8x64

@@ -115,17 +115,17 @@ typedef struct
   size_t partial_len;
   uint32_t count_high, count_low;
 } hamsi_8way_big_context;
-typedef hamsi_8way_big_context hamsi512_8way_context;
+typedef hamsi_8way_big_context hamsi512_8x64_context;

-void hamsi512_8way_init( hamsi512_8way_context *sc );
-void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
+void hamsi512_8x64_init( hamsi512_8x64_context *sc );
+void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
                           size_t len );
-void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
+void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst );

-#define hamsi512_8x64_context   hamsi512_8way_context
-#define hamsi512_8x64_init      hamsi512_8way_init
-#define hamsi512_8x64_update    hamsi512_8way_update
-#define hamsi512_8x64_close     hamsi512_8way_close
+#define hamsi512_8way_context   hamsi512_8x64_context
+#define hamsi512_8way_init      hamsi512_8x64_init
+#define hamsi512_8way_update    hamsi512_8x64_update
+#define hamsi512_8way_close     hamsi512_8x64_close

 // Hamsi-512 16x32

--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -53,7 +53,7 @@ extern "C"{
 #define SPH_SMALL_FOOTPRINT_HAVAL   1
 //#endif

-#if defined(__AVX512VL__)
+#if defined(VL256)

 // ( ~( a ^ b ) ) & c
 #define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \

 // Haval-256 8 way 32 bit avx2

-#if defined (__AVX512VL__)
+#if defined (VL256)

 // ( ~( a ^ b ) ) & c
 #define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \

 #endif // AVX2

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512) 

 // ( ~( a ^ b ) ) & c
 #define mm512_andnotxor( a, b, c ) \
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -82,12 +82,15 @@ typedef struct {
 typedef haval_4way_context haval256_5_4way_context;

 void haval256_5_4way_init( void *cc );
-
 void haval256_5_4way_update( void *cc, const void *data, size_t len );
 //#define haval256_5_4way haval256_5_4way_update
-
 void haval256_5_4way_close( void *cc, void *dst );

+#define haval256_4x32_context    haval256_5_4way_context
+#define haval256_4x32_init       haval256_5_4way_init
+#define haval256_4x32_update     haval256_5_4way_update
+#define haval256_4x32_close      haval256_5_4way_close
+
 #if defined(__AVX2__)

 typedef struct {
@@ -100,14 +103,17 @@ typedef struct {
 typedef haval_8way_context haval256_5_8way_context;

 void haval256_5_8way_init( void *cc );
-
 void haval256_5_8way_update( void *cc, const void *data, size_t len );
-
 void haval256_5_8way_close( void *cc, void *dst );

+#define haval256_8x32_context    haval256_5_8way_context
+#define haval256_8x32_init       haval256_5_8way_init
+#define haval256_8x32_update     haval256_5_8way_update
+#define haval256_8x32_close      haval256_5_8way_close
+
 #endif // AVX2

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 typedef struct {
   __m512i buf[32];
@@ -119,11 +125,14 @@ typedef struct {
 typedef haval_16way_context haval256_5_16way_context;

 void haval256_5_16way_init( void *cc );
-
 void haval256_5_16way_update( void *cc, const void *data, size_t len );
-
 void haval256_5_16way_close( void *cc, void *dst );

+#define haval256_16x32_context    haval256_5_16way_context
+#define haval256_16x32_init       haval256_5_16way_init
+#define haval256_16x32_update     haval256_5_16way_update
+#define haval256_16x32_close      haval256_5_16way_close
+
 #endif // AVX512

 #ifdef __cplusplus
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
      (state)->H[15] = h7l; \
   } while (0)

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 #define Sb_8W(x0, x1, x2, x3, c) \
 { \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =

 #if defined(__AVX2__)

-#if defined(__AVX512VL__)
-//TODO enable for AVX10_256, not used with AVX512VL
+#if defined(VL256)

 #define notxorandnot( a, b, c ) \
   _mm256_ternarylogic_epi64( a, b, c, 0x2d )
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =

 #endif   // AVX2

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 void jh256_8x64_init( jh_8x64_context *sc )
 {
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -55,7 +55,7 @@
 * <code>memcpy()</code>).
 */

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 typedef struct
 {
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
-   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   return 0;
 }

+#elif defined(KECCAK_2WAY)
+
+void keccakhash_2x64( void *state, const void *input )   // one-shot 2x64 keccak256
+{
+    keccak256_2x64_context kctx;                 // scratch context, set up on every call
+    keccak256_2x64_init( &kctx );
+    keccak256_2x64_update( &kctx, input, 80 );   // length argument is fixed at 80
+    keccak256_2x64_close( &kctx, state );        // closing stores the result in state
+}
+
+// 2 lane keccak scanhash: hashes nonces n and n+1 per pass, always returns 0.
+int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
+   uint32_t hash[16*2] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[13]);   // hash7[0], hash7[2]: test word of lane 0, 1
+   uint32_t *pdata = work->data, *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   v128_t *noncev = (v128_t*)vdata + 9;
+   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+
+   v128_bswap32_intrlv80_2x64( vdata, pdata );
+   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
+   do {
+      keccakhash_2x64( hash, vdata );
+      // Quick single word test per lane, full check only for candidates.
+      for ( int lane = 0; lane < 2; lane++ )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
+      {
+          extr_lane_2x64( lane_hash, hash, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ))
+          {
+              pdata[19] = bswap_32( n + lane );
+              submit_solution( work, lane_hash, mythr );
+          }
+      }
+      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
+      n += 2;
+   } while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;   // 2 nonces per pass, n advanced by 2 each time
+   return 0;
+}
+
 #endif
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
 #elif defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
+#elif defined (KECCAK_2WAY)
+  gate->scanhash  = (void*)&scanhash_keccak_2x64;
+  gate->hash      = (void*)&keccakhash_2x64;
 #else
  gate->scanhash  = (void*)&scanhash_keccak;
  gate->hash      = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
 #elif defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
+#elif defined (KECCAK_2WAY)
+  gate->scanhash  = (void*)&scanhash_keccak_2x64;
+  gate->hash      = (void*)&keccakhash_2x64;
 #else
  gate->scanhash  = (void*)&scanhash_keccak;
  gate->hash      = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
 bool register_sha3d_algo( algo_gate_t* gate )
 {
  hard_coded_eb = 6;
-//  opt_extranonce = false;
-  gate->optimizations = AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
-#if defined (KECCAK_8WAY)
+#if defined (SHA3D_8WAY)
  gate->scanhash  = (void*)&scanhash_sha3d_8way;
  gate->hash      = (void*)&sha3d_hash_8way;
-#elif defined (KECCAK_4WAY)
+#elif defined (SHA3D_4WAY)
  gate->scanhash  = (void*)&scanhash_sha3d_4way;
  gate->hash      = (void*)&sha3d_hash_4way;
+#elif defined (SHA3D_2WAY)
+  gate->scanhash  = (void*)&scanhash_sha3d_2x64;
+  gate->hash      = (void*)&sha3d_hash_2x64;
 #else
  gate->scanhash  = (void*)&scanhash_sha3d;
  gate->hash      = (void*)&sha3d_hash;
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -4,10 +4,20 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define KECCAK_8WAY 1
 #elif defined(__AVX2__)
  #define KECCAK_4WAY 1
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+  #define KECCAK_2WAY 1
+#endif
+
+#if defined(SIMD512)
+  #define SHA3D_8WAY 1
+#elif defined(__AVX2__)
+  #define SHA3D_4WAY 1
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+  #define SHA3D_2WAY 1
 #endif

 extern int hard_coded_eb;
@@ -16,27 +26,47 @@ extern int hard_coded_eb;

 void keccakhash_8way( void *state, const void *input );
 int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
-
-void sha3d_hash_8way( void *state, const void *input );
-int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
+                          uint64_t *hashes_done, struct thr_info *mythr );

 #elif defined(KECCAK_4WAY)

 void keccakhash_4way( void *state, const void *input );
 int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
+                          uint64_t *hashes_done, struct thr_info *mythr );

-void sha3d_hash_4way( void *state, const void *input );
-int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
+#elif defined(KECCAK_2WAY)
+
+void keccakhash_2x64( void *state, const void *input );
+int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );

 #else

 void keccakhash( void *state, const void *input );
 int scanhash_keccak( struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done, struct thr_info *mythr );
+                     uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
+#if defined(SHA3D_8WAY)
+
+void sha3d_hash_8way( void *state, const void *input );
+int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(SHA3D_4WAY)
+
+void sha3d_hash_4way( void *state, const void *input );
+int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(SHA3D_2WAY)
+
+void sha3d_hash_2x64( void *state, const void *input );
+int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#else

 void sha3d_hash( void *state, const void *input );
 int scanhash_sha3d( struct work *work, uint32_t max_nonce,
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -57,7 +57,7 @@ static const uint64_t RC[] = {

 #define DO(x)   x

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 #define INPUT_BUF(size)   do { \
    size_t j; \
@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
 static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
                                 size_t byte_len, size_t lim )
 {
-    unsigned eb;
-    union {
-       __m512i tmp[lim + 1];
-       uint64_t dummy;   /* for alignment */
-    } u;
+    __m512i tmp[lim + 1] __attribute__ ((aligned (64)));
    size_t j;
    size_t m512_len = byte_len >> 3;
+    const unsigned eb = hard_coded_eb;

-    eb = hard_coded_eb;
    if ( kc->ptr == (lim - 8) )
    {
        const uint64_t t = eb | 0x8000000000000000;
-        u.tmp[0] = _mm512_set1_epi64( t );
+        tmp[0] = _mm512_set1_epi64( t );
        j = 8;
    }
    else
    {
        j = lim - kc->ptr;
-        u.tmp[0] = _mm512_set1_epi64( eb );
-        memset_zero_512( u.tmp + 1, (j>>3) - 2 );
-        u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
+        tmp[0] = _mm512_set1_epi64( eb );
+        memset_zero_512( tmp + 1, (j>>3) - 2 );
+        tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
    }
-    keccak64_8way_core( kc, u.tmp, j, lim );
+    keccak64_8way_core( kc, tmp, j, lim );
    /* Finalize the "lane complement" */
    NOT64( kc->w[ 1], kc->w[ 1] );
    NOT64( kc->w[ 2], kc->w[ 2] );
@@ -194,7 +190,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
    memcpy_512( dst, kc->w, m512_len );
 }

-void keccak256_8way_init( void *kc )
+void keccak256_8x64_init( void *kc )
 {
   keccak64_8way_init( kc, 256 );
 }
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
 static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
            size_t lim )
 {
-    unsigned eb;
-    union {
-       __m256i tmp[lim + 1];
-       uint64_t dummy;   /* for alignment */
-    } u;
+    __m256i tmp[lim + 1] __attribute__ ((aligned (32)));
    size_t j;
    size_t m256_len = byte_len >> 3;
+    const unsigned eb = hard_coded_eb;

-    eb = hard_coded_eb;
    if ( kc->ptr == (lim - 8) )
    {
        const uint64_t t = eb | 0x8000000000000000;
-        u.tmp[0] = _mm256_set1_epi64x( t );
+        tmp[0] = _mm256_set1_epi64x( t );
        j = 8;
    }
    else
    {
        j = lim - kc->ptr;
-        u.tmp[0] = _mm256_set1_epi64x( eb );
-        memset_zero_256( u.tmp + 1, (j>>3) - 2 );
-        u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
+        tmp[0] = _mm256_set1_epi64x( eb );
+        memset_zero_256( tmp + 1, (j>>3) - 2 );
+        tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
    }
-    keccak64_core( kc, u.tmp, j, lim );
+    keccak64_core( kc, tmp, j, lim );
    /* Finalize the "lane complement" */
    NOT64( kc->w[ 1], kc->w[ 1] );
    NOT64( kc->w[ 2], kc->w[ 2] );
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -4,7 +4,7 @@
 #include <stddef.h>
 #include "simd-utils.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 typedef struct
 {
--- a/algo/keccak/sha3d-4way.c
+++ b/algo/keccak/sha3d-4way.c
@@ -4,12 +4,12 @@
 #include <stdint.h>
 #include "keccak-hash-4way.h"

-#if defined(KECCAK_8WAY)
+#if defined(SHA3D_8WAY)

 void sha3d_hash_8way(void *state, const void *input)
 {
    uint32_t buffer[16*8] __attribute__ ((aligned (128)));
-    keccak256_8way_context ctx;
+    keccak256_8x64_context ctx;

    keccak256_8x64_init( &ctx );
    keccak256_8x64_update( &ctx, input, 80 );
@@ -64,12 +64,12 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
   return 0;
 }

-#elif defined(KECCAK_4WAY)
+#elif defined(SHA3D_4WAY)

 void sha3d_hash_4way(void *state, const void *input)
 {
    uint32_t buffer[16*4] __attribute__ ((aligned (64)));
-    keccak256_4way_context ctx;
+    keccak256_4x64_context ctx;

    keccak256_4x64_init( &ctx );
    keccak256_4x64_update( &ctx, input, 80 );
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
   return 0;
 }

+#elif defined(SHA3D_2WAY)
+
+void sha3d_hash_2x64( void *state, const void *input )
+{
+    keccak256_2x64_context kctx;
+    uint32_t mid[16*4] __attribute__ ((aligned (64)));
+    // Pass 1: keccak256 over input with length 80, result kept in mid.
+    keccak256_2x64_init( &kctx );
+    keccak256_2x64_update( &kctx, input, 80 );
+    keccak256_2x64_close( &kctx, mid );
+    // Pass 2: keccak256 over mid with length 32, result stored in state.
+    keccak256_2x64_init( &kctx );
+    keccak256_2x64_update( &kctx, mid, 32 );
+    keccak256_2x64_close( &kctx, state );
+}
+
+// 2 lane sha3d scanhash: hashes nonces n and n+1 per pass, always returns 0.
+int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
+   uint32_t hash[16*2] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *data = work->data, *target = work->target;
+   uint32_t *h7 = &(hash[13]);   // h7[0], h7[2]: test word of lane 0, 1
+   const uint32_t first_nonce = data[19];
+   const uint32_t last_nonce = max_nonce - 2;
+   const uint32_t targ7 = target[7];
+   const int thr_id = mythr->id;
+   const bool benchmark = opt_benchmark;
+   v128_t *vnonce = (v128_t*)vdata + 9;
+   uint32_t nonce = first_nonce;
+
+   v128_bswap32_intrlv80_2x64( vdata, data );
+   *vnonce = v128_intrlv_blend_32( v128_set32( nonce+1, 0, nonce, 0 ), *vnonce );
+   do {
+      sha3d_hash_2x64( hash, vdata );
+      // Quick single word test per lane, full check only for candidates.
+      for ( int l = 0; l < 2; l++ )
+      if ( unlikely( h7[ l<<1 ] <= targ7 && !benchmark ) )
+      {
+          extr_lane_2x64( lane_hash, hash, l, 256 );
+          if ( valid_hash( lane_hash, target ) )
+          {
+              data[19] = bswap_32( nonce + l );
+              submit_solution( work, lane_hash, mythr );
+          }
+      }
+      *vnonce = v128_add32( *vnonce, v128_64( 0x0000000200000000 ) );
+      nonce += 2;
+   } while ( likely( (nonce < last_nonce) && !work_restart[thr_id].restart ) );
+   data[19] = nonce;
+   *hashes_done = nonce - first_nonce;
+   return 0;
+}
+
 #endif
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
 };


-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 #define cns4w(i)  mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )

@@ -273,8 +273,6 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
    uint32_t hash[8*4] __attribute((aligned(128)));
    __m512i* chainv = state->chainv;
    __m512i t[2];
-    const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( 
-                                  0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

    /*---- blank round with m=0 ----*/
    rnd512_4way( state, NULL );
@@ -289,10 +287,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
    _mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
    _mm512_store_si512( (__m512i*)&hash[16], t[1] );

-    casti_m512i( b,0 ) = _mm512_shuffle_epi8(
-                                  casti_m512i( hash,0 ), shuff_bswap32 );
-    casti_m512i( b,1 ) = _mm512_shuffle_epi8(
-                                  casti_m512i( hash,1 ), shuff_bswap32 );
+    casti_m512i( b,0 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
+    casti_m512i( b,1 ) = mm512_bswap_32( casti_m512i( hash,1 ) );

    rnd512_4way( state, NULL );

@@ -306,10 +302,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
    _mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
    _mm512_store_si512( (__m512i*)&hash[16], t[1] );

-    casti_m512i( b,2 ) = _mm512_shuffle_epi8(
-                                  casti_m512i( hash,0 ), shuff_bswap32 );
-    casti_m512i( b,3 ) = _mm512_shuffle_epi8(
-                                  casti_m512i( hash,1 ), shuff_bswap32 );
+    casti_m512i( b,2 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
+    casti_m512i( b,3 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
 }

 int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
@@ -349,16 +343,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
    __m512i msg[2];
    int i;
    int blocks = (int)len >> 5;
-    const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(  
-                                   0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

    state->rembytes = (int)len & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
-       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       msg[0] = mm512_bswap_32( vdata[ 0 ] );
+       msg[1] = mm512_bswap_32( vdata[ 1 ] );
       rnd512_4way( state, msg );
    }

@@ -367,7 +359,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
    if ( state->rembytes  )
    {
      // remaining data bytes
-      buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
+      buffer[0] = mm512_bswap_32( vdata[0] );
      buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
    }
    return 0;
@@ -434,16 +426,14 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
    __m512i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( 
-                                   0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

    state->rembytes = inlen & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
-       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       msg[0] = mm512_bswap_32( vdata[ 0 ] );
+       msg[1] = mm512_bswap_32( vdata[ 1 ] );
       rnd512_4way( state, msg );
    }

@@ -451,7 +441,7 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
    if ( state->rembytes  )
    {
       // padding of partial block
-       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[0] = mm512_bswap_32( vdata[ 0 ] );
       msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
       rnd512_4way( state, msg );
    }
@@ -479,16 +469,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
    __m512i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( 
-                                   0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

    state->rembytes = inlen & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
-       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       msg[0] = mm512_bswap_32( vdata[ 0 ] );
+       msg[1] = mm512_bswap_32( vdata[ 1 ] );
       rnd512_4way( state, msg );
    }

@@ -496,7 +484,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
    if ( state->rembytes  )
    {
       // padding of partial block
-       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[0] = mm512_bswap_32( vdata[ 0 ] );
       msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
       rnd512_4way( state, msg );
    }
@@ -524,8 +512,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
    a = _mm256_xor_si256( a, c0 ); \
    b = _mm256_xor_si256( b, c1 );

-//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
-#if defined(__AVX512VL__) 
+#if defined(VL256) 

 #define MULT2( a0, a1 ) \
 { \
@@ -776,8 +763,6 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
    uint32 hash[8*2] __attribute((aligned(64)));
    __m256i* chainv = state->chainv;
    __m256i t0, t1;
-    const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
-                                                 0x0405060700010203 );
    /*---- blank round with m=0 ----*/
    rnd512_2way( state, NULL );

@@ -792,10 +777,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
    _mm256_store_si256( (__m256i*)&hash[0], t0 );
    _mm256_store_si256( (__m256i*)&hash[8], t1 );

-    casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
-                                  casti_m256i( hash, 0 ), shuff_bswap32 );
-    casti_m256i( b, 1 ) = _mm256_shuffle_epi8( 
-                                  casti_m256i( hash, 1 ), shuff_bswap32 );
+    casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );

    rnd512_2way( state, NULL );

@@ -810,10 +793,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
    _mm256_store_si256( (__m256i*)&hash[0], t0 );
    _mm256_store_si256( (__m256i*)&hash[8], t1 );

-    casti_m256i( b, 2 ) = _mm256_shuffle_epi8( 
-                                  casti_m256i( hash, 0 ), shuff_bswap32 );
-    casti_m256i( b, 3 ) = _mm256_shuffle_epi8( 
-                                  casti_m256i( hash, 1 ), shuff_bswap32 );
+    casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
 }

 int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
@@ -848,15 +829,13 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
    __m256i msg[2];
    int i;
    int blocks = (int)len >> 5;
-    const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
-                                                 0x0405060700010203 );
    state-> rembytes = (int)len & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
-       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       msg[0] = mm256_bswap_32( vdata[ 0 ] );
+       msg[1] = mm256_bswap_32( vdata[ 1 ] );
       rnd512_2way( state, msg );
    }

@@ -865,7 +844,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
    if ( state->rembytes  )
    {
      // remaining data bytes
-      buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
+      buffer[0] = mm256_bswap_32( vdata[0] );
      buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
    }
    return 0;
@@ -917,16 +896,14 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
    __m256i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
-                                                 0x0405060700010203 );

    state->rembytes = inlen & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
-       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       msg[0] = mm256_bswap_32( vdata[ 0 ] );
+       msg[1] = mm256_bswap_32( vdata[ 1 ] );
       rnd512_2way( state, msg );
    }

@@ -934,7 +911,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
    if ( state->rembytes  )
    {
       // padding of partial block
-       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[0] = mm256_bswap_32( vdata[ 0 ] );
       msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
       rnd512_2way( state, msg );
    }
@@ -962,16 +939,14 @@ int luffa_2way_update_close( luffa_2way_context *state,
    __m256i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
-                                                 0x0405060700010203 );

    state->rembytes = inlen & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
-       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       msg[0] = mm256_bswap_32( vdata[ 0 ] );
+       msg[1] = mm256_bswap_32( vdata[ 1 ] );
       rnd512_2way( state, msg );
    }

@@ -979,7 +954,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
    if ( state->rembytes  )
    {
       // padding of partial block
-       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[0] = mm256_bswap_32( vdata[ 0 ] );
       msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
       rnd512_2way( state, msg );
    }
--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -51,7 +51,7 @@
 #define LIMIT_512 128
 /*********************************/

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 typedef struct {
    uint32_t buffer[8*4];
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -28,8 +28,7 @@
    a = v128_xor( a, c0 ); \
    b = v128_xor( b, c1 ); \

-#if defined(__AVX512VL__)
-//TODO enable for AVX10_512 AVX10_256
+#if defined(VL256)

 #define MULT2( a0, a1 ) \
 { \
@@ -48,29 +47,22 @@
  a1 = _mm_alignr_epi8( b, a1, 4 ); \
 }

-#elif defined(__ARM_NEON)
+
+#elif defined(__ARM_NEON) || defined(__SSE2__)

 // { a1_0, 0, a1_0, a1_0 }
 #define MULT2( a0, a1 ) \
 { \
-  v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
+  v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
  a0 = v128_alignr32( a1, b, 1 ); \
  a1 = v128_alignr32( b, a1, 1 ); \
 }

-#else   // assume SSE2
-
-#define MULT2( a0, a1 ) \
-{ \
-  v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
-  a0 = v128_or( _mm_srli_si128(  b, 4 ), _mm_slli_si128( a1, 12 ) ); \
-  a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128(  b, 12 ) ); \
-} 
-
+#else   // neither VL256, NEON nor SSE2: MULT2 is left undefined on this path
+  #warning "Unknown or unsupported CPU architecture."
 #endif

-#if defined(__AVX512VL__)
-//TODO enable for AVX10_512 AVX10_256
+#if defined(VL256)

 #define SUBCRUMB( a0, a1, a2, a3 ) \
 { \
--- a/algo/luffa/luffa_for_sse2.h
+++ b/algo/luffa/luffa_for_sse2.h
@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,

 int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
                                   const void* data, size_t inlen );
-#endif   // LUFFA_FOR_SSE2_H___
+#endif   // LUFFA_FOR_SSE2_H__
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -15,7 +15,7 @@
 #include "algo/groestl/sph_groestl.h"
 #endif

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define ALLIUM_16WAY 1
 #elif defined(__AVX2__)
  #define ALLIUM_8WAY 1
@@ -26,9 +26,9 @@
 #if defined (ALLIUM_16WAY)  

 typedef union {
-   keccak256_8way_context    keccak;
+   keccak256_8x64_context    keccak;
   cube_4way_2buf_context    cube;
-   skein256_8way_context     skein;
+   skein256_8x64_context     skein;
 #if defined(__VAES__)
   groestl256_4way_context   groestl;
 #else
@@ -60,7 +60,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
   uint32_t hash15[8] __attribute__ ((aligned (32)));
   allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));

-   blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
+   blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -70,12 +70,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
                hash15, 256 );
   
-   keccak256_8way_init( &ctx.keccak );
-   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
-   keccak256_8way_close( &ctx.keccak, vhashA);
-   keccak256_8way_init( &ctx.keccak );
-   keccak256_8way_update( &ctx.keccak, vhashB, 32 );
-   keccak256_8way_close( &ctx.keccak, vhashB);
+   keccak256_8x64_init( &ctx.keccak );
+   keccak256_8x64_update( &ctx.keccak, vhashA, 32 );
+   keccak256_8x64_close( &ctx.keccak, vhashA);
+   keccak256_8x64_init( &ctx.keccak );
+   keccak256_8x64_update( &ctx.keccak, vhashB, 32 );
+   keccak256_8x64_close( &ctx.keccak, vhashB);

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhashA, 256 );
@@ -153,12 +153,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
                hash15, 256 );

-   skein256_8way_init( &ctx.skein );
-   skein256_8way_update( &ctx.skein, vhashA, 32 );
-   skein256_8way_close( &ctx.skein, vhashA );
-   skein256_8way_init( &ctx.skein );
-   skein256_8way_update( &ctx.skein, vhashB, 32 );
-   skein256_8way_close( &ctx.skein, vhashB );
+   skein256_8x64_init( &ctx.skein );
+   skein256_8x64_update( &ctx.skein, vhashA, 32 );
+   skein256_8x64_close( &ctx.skein, vhashA );
+   skein256_8x64_init( &ctx.skein );
+   skein256_8x64_update( &ctx.skein, vhashB, 32 );
+   skein256_8x64_close( &ctx.skein, vhashB );

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhashA, 256 );
@@ -251,7 +251,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );

   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
     allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -273,9 +273,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
 #elif defined (ALLIUM_8WAY)  

 typedef union {
-   keccak256_4way_context    keccak;
+   keccak256_4x64_context    keccak;
   cube_2way_context         cube;
-   skein256_4way_context     skein;
+   skein256_4x64_context     skein;
 #if defined(__VAES__)
   groestl256_2way_context   groestl;
 #else
@@ -298,19 +298,19 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
   uint64_t *hash7 = (uint64_t*)hash+28;
   allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); 

-   blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
+   blake256_8x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );

   dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhashA, 256 );
   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

-   keccak256_4way_init( &ctx.keccak );
-   keccak256_4way_update( &ctx.keccak, vhashA, 32 );
-   keccak256_4way_close( &ctx.keccak, vhashA );
-   keccak256_4way_init( &ctx.keccak );
-   keccak256_4way_update( &ctx.keccak, vhashB, 32 );
-   keccak256_4way_close( &ctx.keccak, vhashB );
+   keccak256_4x64_init( &ctx.keccak );
+   keccak256_4x64_update( &ctx.keccak, vhashA, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhashA );
+   keccak256_4x64_init( &ctx.keccak );
+   keccak256_4x64_update( &ctx.keccak, vhashB, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhashB );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -350,12 +350,12 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

-   skein256_4way_init( &ctx.skein );
-   skein256_4way_update( &ctx.skein, vhashA, 32 );
-   skein256_4way_close( &ctx.skein, vhashA );
-   skein256_4way_init( &ctx.skein );
-   skein256_4way_update( &ctx.skein, vhashB, 32 );
-   skein256_4way_close( &ctx.skein, vhashB );
+   skein256_4x64_init( &ctx.skein );
+   skein256_4x64_update( &ctx.skein, vhashA, 32 );
+   skein256_4x64_close( &ctx.skein, vhashA );
+   skein256_4x64_init( &ctx.skein );
+   skein256_4x64_update( &ctx.skein, vhashB, 32 );
+   skein256_4x64_close( &ctx.skein, vhashB );

 #if defined(__VAES__)

@@ -433,7 +433,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                                     n+ 3, n+ 2, n+ 1, n );

   // Partialy prehash second block without touching nonces
-   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
     allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -465,12 +465,8 @@ typedef union
 {
   keccak256_2x64_context    keccak;
   cubehashParam             cube;
-//#if defined(__x86_64__)
   skein256_2x64_context     skein;
-//#else
-//   sph_skein512_context      skein;
-//#endif
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_groestl256      groestl;
 #else
   sph_groestl256_context     groestl;
@@ -487,7 +483,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
   uint64_t *hash3 = (uint64_t*)hash+12;
   allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));

-   blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
+   blake256_4x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
   dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );

   intrlv_2x64( vhashA, hash0, hash1, 256 );
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );

-//#if defined(__x86_64__)
   intrlv_2x64( vhashA, hash0, hash1, 256 );
   skein256_2x64_init( &ctx.skein );
   skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
   skein256_2x64_update( &ctx.skein, vhashA, 32 );
   skein256_2x64_close( &ctx.skein, vhashA );
   dintrlv_2x64( hash2, hash3, vhashA, 256 );
-/*
-#else
-    sph_skein256_init( &ctx.skein );
-    sph_skein256( &ctx.skein, hash0, 32 );
-    sph_skein256_close( &ctx.skein, hash0 );
-    sph_skein256_init( &ctx.skein );
-    sph_skein256( &ctx.skein, hash1, 32 );
-    sph_skein256_close( &ctx.skein, hash1 );
-    sph_skein256_init( &ctx.skein );
-    sph_skein256( &ctx.skein, hash2, 32 );
-    sph_skein256_close( &ctx.skein, hash2 );
-    sph_skein256_init( &ctx.skein );
-    sph_skein256( &ctx.skein, hash3, 32 );
-    sph_skein256_close( &ctx.skein, hash3 );
-#endif
-*/
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   groestl256_full( &ctx.groestl, hash0, hash0, 256 );
   groestl256_full( &ctx.groestl, hash1, hash1, 256 );
   groestl256_full( &ctx.groestl, hash2, hash2, 256 );
@@ -608,7 +588,7 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
   block_buf[15] = v128_32( 640 );

      // Partialy prehash second block without touching nonces
-   blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
     allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -636,7 +616,6 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
 //
 //  1 way

-
 typedef struct 
 {
        blake256_context        blake;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,7 +5,7 @@
 #include <stdint.h>
 #include "lyra2.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define LYRA2REV3_16WAY 1
 #elif defined(__AVX2__)
  #define LYRA2REV3_8WAY 1
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();

 //////////////////////////////////

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define LYRA2REV2_16WAY 1
 #elif defined(__AVX2__)
  #define LYRA2REV2_8WAY 1
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();

 /////////////////////////////////////////

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define PHI2_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define PHI2_4WAY 1
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -41,7 +41,7 @@
 //  lyra2z330, lyra2h, 


-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 /**
 * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,

 int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
                  uint64_t timeCost, uint64_t nRows, uint64_t nCols );
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -14,12 +14,12 @@ bool lyra2h_4way_thread_init()
 return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
 }

-static __thread blake256_4way_context l2h_4way_blake_mid;
+static __thread blake256_4x32_context l2h_4way_blake_mid;

 void lyra2h_4way_midstate( const void* input )
 {
-       blake256_4way_init( &l2h_4way_blake_mid );
-       blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
+       blake256_4x32_init( &l2h_4way_blake_mid );
+       blake256_4x32_update( &l2h_4way_blake_mid, input, 64 );
 }

 void lyra2h_4way_hash( void *state, const void *input )
@@ -29,11 +29,11 @@ void lyra2h_4way_hash( void *state, const void *input )
     uint32_t hash2[8] __attribute__ ((aligned (64)));
     uint32_t hash3[8] __attribute__ ((aligned (64)));
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
+     blake256_4x32_context ctx_blake __attribute__ ((aligned (64)));

     memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
-     blake256_4way_update( &ctx_blake, input + (64*4), 16 );
-     blake256_4way_close( &ctx_blake, vhash );
+     blake256_4x32_update( &ctx_blake, input + (64*4), 16 );
+     blake256_4x32_close( &ctx_blake, vhash );

     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
   lyra2h_4way_midstate( vdata );

   do {
-     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+     *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      lyra2h_4way_hash( hash, vdata );

      for ( int i = 0; i < 4; i++ )
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -7,25 +7,24 @@
 #include "algo/cubehash/cubehash_sse2.h" 
 #include "algo/cubehash/cube-hash-2way.h"

-
 #if defined (LYRA2REV2_16WAY)

 typedef struct {
-   blake256_16way_context    blake;
-   keccak256_8way_context    keccak;
+   blake256_16x32_context    blake;
+   keccak256_8x64_context    keccak;
   cubehashParam             cube;
-   skein256_8way_context     skein;
-   bmw256_16way_context      bmw;
+   skein256_8x64_context     skein;
+   bmw256_16x32_context      bmw;
 } lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));

 static lyra2v2_16way_ctx_holder l2v2_16way_ctx;

 bool init_lyra2rev2_16way_ctx()
 {
-   keccak256_8way_init( &l2v2_16way_ctx.keccak );
+   keccak256_8x64_init( &l2v2_16way_ctx.keccak );
   cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
-   skein256_8way_init( &l2v2_16way_ctx.skein );
-   bmw256_16way_init( &l2v2_16way_ctx.bmw );
+   skein256_8x64_init( &l2v2_16way_ctx.skein );
+   bmw256_16x32_init( &l2v2_16way_ctx.bmw );
   return true;
 }

@@ -51,8 +50,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
   lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );

-   blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
-   blake256_16way_close( &ctx.blake, vhash );
+   blake256_16x32_update( &ctx.blake, input + (64<<4), 16 );
+   blake256_16x32_close( &ctx.blake, vhash );

   dintrlv_16x32( hash0,  hash1,  hash2,  hash3,
                  hash4,  hash5,  hash6,  hash7,
@@ -62,17 +61,17 @@ void lyra2rev2_16way_hash( void *state, const void *input )
   intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
                       hash4, hash5, hash6, hash7, 256 );

-   keccak256_8way_update( &ctx.keccak, vhash, 32 );
-   keccak256_8way_close( &ctx.keccak, vhash );
+   keccak256_8x64_update( &ctx.keccak, vhash, 32 );
+   keccak256_8x64_close( &ctx.keccak, vhash );

   dintrlv_8x64( hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, vhash, 256 );
   intrlv_8x64( vhash, hash8,  hash9,  hash10, hash11,
                       hash12, hash13, hash14, hash15, 256 );

-   keccak256_8way_init( &ctx.keccak );
-   keccak256_8way_update( &ctx.keccak, vhash, 32 );
-   keccak256_8way_close( &ctx.keccak, vhash );
+   keccak256_8x64_init( &ctx.keccak );
+   keccak256_8x64_update( &ctx.keccak, vhash, 32 );
+   keccak256_8x64_close( &ctx.keccak, vhash );

   dintrlv_8x64( hash8,  hash9,  hash10,  hash11,
                 hash12, hash13, hash14, hash15, vhash, 256 );
@@ -122,21 +121,20 @@ void lyra2rev2_16way_hash( void *state, const void *input )

   intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
                       hash4, hash5, hash6, hash7, 256 );
-   skein256_8way_update( &ctx.skein, vhash, 32 );
-   skein256_8way_close( &ctx.skein, vhash );
+   skein256_8x64_update( &ctx.skein, vhash, 32 );
+   skein256_8x64_close( &ctx.skein, vhash );

   dintrlv_8x64( hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, vhash, 256 );
   intrlv_8x64( vhash, hash8,  hash9,  hash10, hash11, hash12,
                       hash13, hash14, hash15, 256 );

-   skein256_8way_init( &ctx.skein );
-   skein256_8way_update( &ctx.skein, vhash, 32 );
-   skein256_8way_close( &ctx.skein, vhash );
+   skein256_8x64_init( &ctx.skein );
+   skein256_8x64_update( &ctx.skein, vhash, 32 );
+   skein256_8x64_close( &ctx.skein, vhash );

   dintrlv_8x64( hash8,  hash9,  hash10, hash11,
                 hash12, hash13, hash14, hash15, vhash, 256 );
-
   
   cubehash_full( &ctx.cube, (byte*) hash0,  256, (const byte*) hash0, 32 );
   cubehash_full( &ctx.cube, (byte*) hash1,  256, (const byte*) hash1, 32 );
@@ -160,8 +158,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
                        hash8,  hash9,  hash10, hash11,
                        hash12, hash13, hash14, hash15, 256 );

-   bmw256_16way_update( &ctx.bmw, vhash, 32 );
-   bmw256_16way_close( &ctx.bmw, state );
+   bmw256_16x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_16x32_close( &ctx.bmw, state );
 }

 int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
@@ -186,8 +184,8 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
   mm512_bswap32_intrlv80_16x32( vdata, pdata );
   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
-   blake256_16way_init( &l2v2_16way_ctx.blake );
-   blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
+   blake256_16x32_init( &l2v2_16way_ctx.blake );
+   blake256_16x32_update( &l2v2_16way_ctx.blake, vdata, 64 );

   do
   {
@@ -214,21 +212,21 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
 #elif defined (LYRA2REV2_8WAY)

 typedef struct {
-   blake256_8way_context     blake;
-   keccak256_4way_context    keccak;
+   blake256_8x32_context     blake;
+   keccak256_4x64_context    keccak;
   cubehashParam             cube;
-   skein256_4way_context     skein;
-   bmw256_8way_context       bmw;
+   skein256_4x64_context     skein;
+   bmw256_8x32_context       bmw;
 } lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));

 static lyra2v2_8way_ctx_holder l2v2_8way_ctx;

 bool init_lyra2rev2_8way_ctx()
 {
-   keccak256_4way_init( &l2v2_8way_ctx.keccak );
+   keccak256_4x64_init( &l2v2_8way_ctx.keccak );
   cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
-   skein256_4way_init( &l2v2_8way_ctx.skein );
-   bmw256_8way_init( &l2v2_8way_ctx.bmw );
+   skein256_4x64_init( &l2v2_8way_ctx.skein );
+   bmw256_8x32_init( &l2v2_8way_ctx.bmw );
   return true;
 }

@@ -246,20 +244,20 @@ void lyra2rev2_8way_hash( void *state, const void *input )
   lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );

-   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
-   blake256_8way_close( &ctx.blake, vhash );
+   blake256_8x32_update( &ctx.blake, input + (64<<3), 16 );
+   blake256_8x32_close( &ctx.blake, vhash );

   dintrlv_8x32( hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, vhash, 256 );

   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
-   keccak256_4way_update( &ctx.keccak, vhash, 32 );
-   keccak256_4way_close( &ctx.keccak, vhash );
+   keccak256_4x64_update( &ctx.keccak, vhash, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhash );
   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
   intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
-   keccak256_4way_init( &ctx.keccak );
-   keccak256_4way_update( &ctx.keccak, vhash, 32 );
-   keccak256_4way_close( &ctx.keccak, vhash );
+   keccak256_4x64_init( &ctx.keccak );
+   keccak256_4x64_update( &ctx.keccak, vhash, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhash );
   dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );

   cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -282,13 +280,13 @@ void lyra2rev2_8way_hash( void *state, const void *input )
   LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
   
   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
-   skein256_4way_update( &ctx.skein, vhash, 32 );
-   skein256_4way_close( &ctx.skein, vhash );
+   skein256_4x64_update( &ctx.skein, vhash, 32 );
+   skein256_4x64_close( &ctx.skein, vhash );
   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
   intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
-   skein256_4way_init( &ctx.skein );
-   skein256_4way_update( &ctx.skein, vhash, 32 );
-   skein256_4way_close( &ctx.skein, vhash );
+   skein256_4x64_init( &ctx.skein );
+   skein256_4x64_update( &ctx.skein, vhash, 32 );
+   skein256_4x64_close( &ctx.skein, vhash );
   dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );

   cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -303,8 +301,8 @@ void lyra2rev2_8way_hash( void *state, const void *input )
   intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
                       hash4, hash5, hash6, hash7, 256 );

-   bmw256_8way_update( &ctx.bmw, vhash, 32 );
-   bmw256_8way_close( &ctx.bmw, state );
+   bmw256_8x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_8x32_close( &ctx.bmw, state );
 }

 int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
@@ -328,8 +326,8 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-   blake256_8way_init( &l2v2_8way_ctx.blake );
-   blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
+   blake256_8x32_init( &l2v2_8way_ctx.blake );
+   blake256_8x32_update( &l2v2_8way_ctx.blake, vdata, 64 );

   do
   {
@@ -356,21 +354,21 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
 #elif defined (LYRA2REV2_4WAY)

 typedef struct {
-   blake256_4way_context     blake;
-   keccak256_4way_context    keccak;
+   blake256_4x32_context     blake;
+   keccak256_4x64_context    keccak;
   cubehashParam             cube;
-   skein256_4way_context     skein;
-   bmw256_4way_context          bmw;
+   skein256_4x64_context     skein;
+   bmw256_4x32_context          bmw;
 } lyra2v2_4way_ctx_holder;

 static lyra2v2_4way_ctx_holder l2v2_4way_ctx;

 bool init_lyra2rev2_4way_ctx()
 {
-   keccak256_4way_init( &l2v2_4way_ctx.keccak );
+   keccak256_4x64_init( &l2v2_4way_ctx.keccak );
   cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
-   skein256_4way_init( &l2v2_4way_ctx.skein );
-   bmw256_4way_init( &l2v2_4way_ctx.bmw );
+   skein256_4x64_init( &l2v2_4way_ctx.skein );
+   bmw256_4x32_init( &l2v2_4way_ctx.bmw );
   return true;
 }

@@ -385,13 +383,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
   memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );

-   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
-   blake256_4way_close( &ctx.blake, vhash );
+   blake256_4x32_update( &ctx.blake, input + (64<<2), 16 );
+   blake256_4x32_close( &ctx.blake, vhash );

   rintrlv_4x32_4x64( vhash64, vhash, 256 );

-   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
-   keccak256_4way_close( &ctx.keccak, vhash64 );
+   keccak256_4x64_update( &ctx.keccak, vhash64, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhash64 );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

@@ -410,8 +408,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )

   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );

-   skein256_4way_update( &ctx.skein, vhash64, 32 );
-   skein256_4way_close( &ctx.skein, vhash64 );
+   skein256_4x64_update( &ctx.skein, vhash64, 32 );
+   skein256_4x64_close( &ctx.skein, vhash64 );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

@@ -426,8 +424,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )

   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );

-   bmw256_4way_update( &ctx.bmw, vhash, 32 );
-   bmw256_4way_close( &ctx.bmw, state );
+   bmw256_4x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_4x32_close( &ctx.bmw, state );
 }

 int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -451,12 +449,12 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,

   v128_bswap32_intrlv80_4x32( vdata, pdata );

-   blake256_4way_init( &l2v2_4way_ctx.blake );
-   blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
+   blake256_4x32_init( &l2v2_4way_ctx.blake );
+   blake256_4x32_update( &l2v2_4way_ctx.blake, vdata, 64 );

   do
   {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      lyra2rev2_4way_hash( hash, vdata );

--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -9,18 +9,18 @@
 #if defined (LYRA2REV3_16WAY)

 typedef struct {
-   blake256_16way_context     blake;
+   blake256_16x32_context     blake;
   cube_4way_context          cube;
-   bmw256_16way_context       bmw;
+   bmw256_16x32_context       bmw;
 } lyra2v3_16way_ctx_holder;

 static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;

 bool init_lyra2rev3_16way_ctx()
 {
-   blake256_16way_init( &l2v3_16way_ctx.blake );
+   blake256_16x32_init( &l2v3_16way_ctx.blake );
   cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
-   bmw256_16way_init( &l2v3_16way_ctx.bmw );
+   bmw256_16x32_init( &l2v3_16way_ctx.bmw );
   return true;
 }

@@ -46,8 +46,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
   lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );

-   blake256_16way_update( &ctx.blake, input + (64*16), 16 );
-   blake256_16way_close( &ctx.blake, vhash );
+   blake256_16x32_update( &ctx.blake, input + (64*16), 16 );
+   blake256_16x32_close( &ctx.blake, vhash );

   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
           hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -120,8 +120,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
             hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
             hash15, 256 );

-   bmw256_16way_update( &ctx.bmw, vhash, 32 );
-   bmw256_16way_close( &ctx.bmw, state );
+   bmw256_16x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_16x32_close( &ctx.bmw, state );
 }


@@ -145,8 +145,8 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,

   mm512_bswap32_intrlv80_16x32( vdata, pdata );

-   blake256_16way_init( &l2v3_16way_ctx.blake );
-   blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
+   blake256_16x32_init( &l2v3_16way_ctx.blake );
+   blake256_16x32_update( &l2v3_16way_ctx.blake, vdata, 64 );

   do
   {
@@ -178,18 +178,18 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
 #elif defined (LYRA2REV3_8WAY)

 typedef struct {
-   blake256_8way_context     blake;
+   blake256_8x32_context     blake;
   cubehashParam             cube;
-   bmw256_8way_context       bmw;
+   bmw256_8x32_context       bmw;
 } lyra2v3_8way_ctx_holder;

 static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;

 bool init_lyra2rev3_8way_ctx()
 {
-   blake256_8way_init( &l2v3_8way_ctx.blake );
+   blake256_8x32_init( &l2v3_8way_ctx.blake );
   cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
-   bmw256_8way_init( &l2v3_8way_ctx.bmw );
+   bmw256_8x32_init( &l2v3_8way_ctx.bmw );
   return true;
 }

@@ -207,8 +207,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
   lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );

-   blake256_8way_update( &ctx.blake, input + (64*8), 16 );
-   blake256_8way_close( &ctx.blake, vhash );
+   blake256_8x32_update( &ctx.blake, input + (64*8), 16 );
+   blake256_8x32_close( &ctx.blake, vhash );

   dintrlv_8x32( hash0, hash1, hash2, hash3,
                       hash4, hash5, hash6, hash7, vhash, 256 );
@@ -243,8 +243,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
   intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
                             hash4, hash5, hash6, hash7, 256 );

-   bmw256_8way_update( &ctx.bmw, vhash, 32 );
-   bmw256_8way_close( &ctx.bmw, state );
+   bmw256_8x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_8x32_close( &ctx.bmw, state );

   }

@@ -269,8 +269,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-   blake256_8way_init( &l2v3_8way_ctx.blake );
-   blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
+   blake256_8x32_init( &l2v3_8way_ctx.blake );
+   blake256_8x32_update( &l2v3_8way_ctx.blake, vdata, 64 );

   do
   {
@@ -300,19 +300,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
 #if defined (LYRA2REV3_4WAY)  

 typedef struct {
-   blake256_4way_context     blake;
+   blake256_4x32_context     blake;
   cubehashParam             cube;
-   bmw256_4way_context       bmw;
+   bmw256_4x32_context       bmw;
 } lyra2v3_4way_ctx_holder;

-//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
 static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;

 bool init_lyra2rev3_4way_ctx()
 {
-   blake256_4way_init( &l2v3_4way_ctx.blake );
+   blake256_4x32_init( &l2v3_4way_ctx.blake );
   cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
-   bmw256_4way_init( &l2v3_4way_ctx.bmw );
+   bmw256_4x32_init( &l2v3_4way_ctx.bmw );
   return true;
 }

@@ -326,8 +325,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
   memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );

-   blake256_4way_update( &ctx.blake, input + (64*4), 16 );
-   blake256_4way_close( &ctx.blake, vhash );
+   blake256_4x32_update( &ctx.blake, input + (64*4), 16 );
+   blake256_4x32_close( &ctx.blake, vhash );
   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -349,8 +348,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );

   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
-   bmw256_4way_update( &ctx.bmw, vhash, 32 );
-   bmw256_4way_close( &ctx.bmw, state );
+   bmw256_4x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_4x32_close( &ctx.bmw, state );
 }

 int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
@@ -374,8 +373,8 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
   v128_bswap32_intrlv80_4x32( vdata, pdata );
   *noncev = _mm_set_epi32( n+3, n+2, n+1, n );

-   blake256_4way_init( &l2v3_4way_ctx.blake );
-   blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
+   blake256_4x32_init( &l2v3_4way_ctx.blake );
+   blake256_4x32_update( &l2v3_4way_ctx.blake, vdata, 64 );

   do
   {
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -3,7 +3,7 @@
 #include "lyra2.h"
 #include "algo/blake/blake256-hash.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define LYRA2Z_16WAY 1
 #elif defined(__AVX2__)
  #define LYRA2Z_8WAY 1
@@ -45,7 +45,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
    uint32_t hash14[8] __attribute__ ((aligned (32)));
    uint32_t hash15[8] __attribute__ ((aligned (32)));

-    blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
+    blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

    dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
              hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -139,7 +139,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
     lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -180,7 +180,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
     uint32_t hash7[8] __attribute__ ((aligned (32)));
     uint32_t vhash[8*8] __attribute__ ((aligned (64)));

-     blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
+     blake256_8x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

     dintrlv_8x32( hash0, hash1, hash2, hash3,
                   hash4, hash5, hash6, hash7, vhash, 256 );
@@ -246,7 +246,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
            _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

   // Partialy prehash second block without touching nonces
-   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
     lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -279,12 +279,12 @@ bool lyra2z_4way_thread_init()
 return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static __thread blake256_4way_context l2z_4way_blake_mid;
+static __thread blake256_4x32_context l2z_4way_blake_mid;

 void lyra2z_4way_midstate( const void* input )
 {
-       blake256_4way_init( &l2z_4way_blake_mid );
-       blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
+       blake256_4x32_init( &l2z_4way_blake_mid );
+       blake256_4x32_update( &l2z_4way_blake_mid, input, 64 );
 }

 void lyra2z_4way_hash( void *hash, const void *midstate_vars,
@@ -295,15 +295,8 @@ void lyra2z_4way_hash( void *hash, const void *midstate_vars,
     uint32_t hash2[8] __attribute__ ((aligned (64)));
     uint32_t hash3[8] __attribute__ ((aligned (64)));
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-//     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

-     blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
-
-/*
-     memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
-     blake256_4way_update( &ctx_blake, input + (64*4), 16 );
-     blake256_4way_close( &ctx_blake, vhash );
-*/
+     blake256_4x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -357,7 +350,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
   block_buf[15] = v128_32( 640 );

   // Partialy prehash second block without touching nonces
-   blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
      lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -454,11 +447,9 @@ bool register_lyra2z_algo( algo_gate_t* gate )
 #if defined(LYRA2Z_16WAY)
  gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
  gate->scanhash          = (void*)&scanhash_lyra2z_16way;
-//  gate->hash       = (void*)&lyra2z_16way_hash;
 #elif defined(LYRA2Z_8WAY)
  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
  gate->scanhash          = (void*)&scanhash_lyra2z_8way;
-//  gate->hash       = (void*)&lyra2z_8way_hash;
 #elif defined(LYRA2Z_4WAY)
  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
  gate->scanhash          = (void*)&scanhash_lyra2z_4way;
--- a/algo/lyra2/phi2-4way.c
+++ b/algo/lyra2/phi2-4way.c
@@ -4,7 +4,7 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "lyra2.h"
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
  #include "algo/echo/echo-hash-4way.h"
 #elif defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -27,7 +27,7 @@
 #include "lyra2.h"
 #include "simd-utils.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
 {
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -43,9 +43,9 @@ static const uint64_t blake2b_IV[8] =
  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

-#define G2W_4X64(a,b,c,d) \
+#define G2W(a,b,c,d) \
   a = _mm512_add_epi64( a, b ); \
   d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
   c = _mm512_add_epi64( c, d ); \
@@ -56,27 +56,15 @@ static const uint64_t blake2b_IV[8] =
   b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );

 #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
-   G2W_4X64( s0, s1, s2, s3 ); \
+   G2W( s0, s1, s2, s3 ); \
   s0 = mm512_shufll256_64( s0 ); \
-   s3 = mm512_swap256_128( s3); \
+   s3 = mm512_swap256_128( s3 ); \
   s2 = mm512_shuflr256_64( s2 ); \
-   G2W_4X64( s0, s1, s2, s3 ); \
+   G2W( s0, s1, s2, s3 ); \
   s0 = mm512_shuflr256_64( s0 ); \
   s3 = mm512_swap256_128( s3 ); \
   s2 = mm512_shufll256_64( s2 ); 

-/*
-#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
-   G2W_4X64( s0, s1, s2, s3 ); \
-   s3 = mm512_shufll256_64( s3 ); \
-   s1 = mm512_shuflr256_64( s1); \
-   s2 = mm512_swap256_128( s2 ); \
-   G2W_4X64( s0, s1, s2, s3 ); \
-   s3 = mm512_shuflr256_64( s3 ); \
-   s1 = mm512_shufll256_64( s1 ); \
-   s2 = mm512_swap256_128( s2 ); 
-*/
-
 #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -95,7 +83,7 @@ static const uint64_t blake2b_IV[8] =

 #if defined(__AVX2__)

-#define G_4X64(a,b,c,d) \
+#define G_AVX2(a,b,c,d) \
   a = _mm256_add_epi64( a, b ); \
   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
   c = _mm256_add_epi64( c, d ); \
@@ -107,27 +95,15 @@ static const uint64_t blake2b_IV[8] =

 // Pivot about s1 instead of s0 reduces latency.
 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
-   G_4X64( s0, s1, s2, s3 ); \
+   G_AVX2( s0, s1, s2, s3 ); \
   s0 = mm256_shufll_64( s0 ); \
-   s3 = mm256_swap_128( s3); \
+   s3 = mm256_swap_128( s3 ); \
   s2 = mm256_shuflr_64( s2 ); \
-   G_4X64( s0, s1, s2, s3 ); \
+   G_AVX2( s0, s1, s2, s3 ); \
   s0 = mm256_shuflr_64( s0 ); \
   s3 = mm256_swap_128( s3 ); \
   s2 = mm256_shufll_64( s2 );

-/*
-#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
-   G_4X64( s0, s1, s2, s3 ); \
-   s3 = mm256_shufll_64( s3 ); \
-   s1 = mm256_shuflr_64( s1); \
-   s2 = mm256_swap_128( s2 ); \
-   G_4X64( s0, s1, s2, s3 ); \
-   s3 = mm256_shuflr_64( s3 ); \
-   s1 = mm256_shufll_64( s1 ); \
-   s2 = mm256_swap_128( s2 );
-*/
-
 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -148,29 +124,29 @@ static const uint64_t blake2b_IV[8] =

 // process 2 columns in parallel
 // returns void, all args updated
-#define G_2X64(a,b,c,d) \
+#define G_128(a,b,c,d) \
   a = v128_add64( a, b ); \
-   d = v128_ror64( v128_xor( d, a), 32 ); \
+   d = v128_ror64xor( d, a, 32 ); \
   c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 24 ); \
+   b = v128_ror64xor( b, c, 24 ); \
   a = v128_add64( a, b ); \
-   d = v128_ror64( v128_xor( d, a ), 16 ); \
+   d = v128_ror64xor( d, a, 16 ); \
   c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 63 );
+   b = v128_ror64xor( b, c, 63 );

 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
 { \
   v128u64_t t; \
-   G_2X64( s0, s2, s4, s6 ); \
-   G_2X64( s1, s3, s5, s7 ); \
+   G_128( s0, s2, s4, s6 ); \
+   G_128( s1, s3, s5, s7 ); \
   t =  v128_alignr64( s7, s6, 1 ); \
   s6 = v128_alignr64( s6, s7, 1 ); \
   s7 = t; \
   t =  v128_alignr64( s2, s3, 1 ); \
   s2 = v128_alignr64( s3, s2, 1 ); \
   s3 = t; \
-   G_2X64( s0, s2, s5, s6 ); \
-   G_2X64( s1, s3, s4, s7 ); \
+   G_128( s0, s2, s5, s6 ); \
+   G_128( s1, s3, s4, s7 ); \
   t =  v128_alignr64( s6, s7, 1 ); \
   s6 = v128_alignr64( s7, s6, 1 ); \
   s7 = t; \
@@ -195,10 +171,6 @@ static const uint64_t blake2b_IV[8] =

 #endif // AVX2 else SSE2

-static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
-    return ( w >> c ) | ( w << ( 64 - c ) );
-}
-
 #define G( r, i, a, b, c, d ) \
 { \
    a = a + b; \
@@ -222,7 +194,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );


-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 union _ovly_512
 {
--- a/algo/m7m/m7m.c
+++ b/algo/m7m/m7m.c
@@ -1,8 +1,6 @@
 #include "cpuminer-config.h"
 #include "algo-gate-api.h"

-#if !defined(__APPLE__)
-
 #include <gmp.h>
 #include <stdbool.h>
 #include <stdlib.h>
@@ -21,7 +19,7 @@
 #define EPS1 DBL_EPSILON
 #define EPS2 3.0e-11

-inline double exp_n( double xt )
+static inline double exp_n( double xt )
 {
    if ( xt < -700.0 )
        return 0;
@@ -33,7 +31,8 @@ inline double exp_n( double xt )
        return exp( xt );
 }

-inline double exp_n2( double x1, double x2 )
+/* NOTE(review): exp_n2 is disabled by this change, presumably because nothing in this file calls it - confirm.
+static inline double exp_n2( double x1, double x2 )
 {
    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
           p5 = 37., p6 = 700.;
@@ -53,6 +52,7 @@ inline double exp_n2( double x1, double x2 )
    else if ( xt > p6 - 1.e-200 )
        return 0.;
 }
+*/

 double swit2_( double wvnmb )
 {
@@ -298,15 +298,9 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
    return 0;
 }

-#endif   // not apple
-
 bool register_m7m_algo( algo_gate_t *gate )
 {
-#if defined(__APPLE__)
-  applog( LOG_ERR, "M7M algo is not supported on MacOS");
-  return false;
-#else  
-  gate->optimizations = SHA_OPT;
+  gate->optimizations = SHA256_OPT;
  init_m7m_ctx();
  gate->scanhash              = (void*)&scanhash_m7m_hash;
  gate->build_stratum_request = (void*)&std_be_build_stratum_request;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
  gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
  opt_target_factor = 65536.0;
  return true;
-#endif
 }

--- a/algo/m7m/magimath.cpp
+++ b/algo/m7m/magimath.cpp
@@ -1,75 +0,0 @@
-// Copyright (c) 2014 The Magi developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-
-#include <iostream>
-#include <cfloat>
-#include <limits>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-
-#include "magimath.h"
-
-#define EPS1 (std::numeric_limits<double>::epsilon())
-#define EPS2 3.0e-11
-
-static void gauleg(double x1, double x2, double x[], double w[], const int n)
-{
-	int m,j,i;
-	double z1, z, xm, xl, pp, p3, p2, p1;
-	m=(n+1)/2;
-	xm=0.5*(x2+x1);
-	xl=0.5*(x2-x1);
-	for (i=1;i<=m;i++) {
-		z=cos(3.141592654*(i-0.25)/(n+0.5));
-		do {
-			p1=1.0;
-			p2=0.0;
-			for (j=1;j<=n;j++) {
-				p3=p2;
-				p2=p1;
-				p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
-			}
-			pp=n*(z*p1-p2)/(z*z-1.0);
-			z1=z;
-			z=z1-p1/pp;
-		} while (fabs(z-z1) > EPS2);
-		x[i]=xm-xl*z;
-		x[n+1-i]=xm+xl*z;
-		w[i]=2.0*xl/((1.0-z*z)*pp*pp);
-		w[n+1-i]=w[i];
-	}
-}
-
-static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
-{
-	double s=0.0;
-#ifdef _MSC_VER
-#define SW_DIVS 23
-	double x[SW_DIVS+1], w[SW_DIVS+1];
-#else
-	double x[NptGQ+1], w[NptGQ+1];
-#endif
-
-	gauleg(a2, b2, x, w, NptGQ);
-
-	for (int j=1; j<=NptGQ; j++) {
-		s += w[j]*func(x[j]);
-	}
-
-	return s;
-}
-
-static double swit_(double wvnmb)
-{
-	return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
-		/ 1034.66 * pow(sin(wvnmb/65.), 2.);
-}
-
-uint32_t sw_(int nnounce, int divs)
-{
-	double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
-	return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
-}
--- a/algo/m7m/magimath.h
+++ b/algo/m7m/magimath.h
@@ -1,54 +0,0 @@
-// Copyright (c) 2014 The Magi developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-#ifndef MAGI_MATH_H
-#define MAGI_MATH_H
-
-#include <math.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-uint32_t sw_(int nnounce, int divs);
-
-#ifdef __cplusplus
-}
-#endif
-
-
-inline double exp_n(double xt)
-{
-	double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
-	if(xt < p1)
-		return 0;
-	else if(xt > p6)
-		return 1e200;
-	else if(xt > p3 && xt < p4)
-		return (1.0 + xt);
-	else
-		return exp(xt);
-}
-
-// 1 / (1 + exp(x1-x2))
-inline double exp_n2(double x1, double x2)
-{
-	double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
-	double xt = x1 - x2;
-	if (xt < p1+1.e-200)
-		return 1.;
-	else if (xt > p1 && xt < p2 + 1.e-200)
-		return ( 1. - exp(xt) );
-	else if (xt > p2 && xt < p3 + 1.e-200)
-		return ( 1. / (1. + exp(xt)) );
-	else if (xt > p3 && xt < p4)
-		return ( 1. / (2. + xt) );
-	else if (xt > p4 - 1.e-200 && xt < p5)
-		return ( exp(-xt) / (1. + exp(-xt)) );
-	else if (xt > p5 - 1.e-200 && xt < p6)
-		return ( exp(-xt) );
-	else //if (xt > p6 - 1.e-200)
-		return 0.;
-}
-
-#endif
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define NIST5_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define NIST5_4WAY 1
--- a/algo/panama/panama-hash-4way.c
+++ b/algo/panama/panama-hash-4way.c
@@ -71,8 +71,7 @@ do { \
 } while (0)

 #define GAMMA_4W(n0, n1, n2, n4)   \
-   (g ## n0 = v128_xor( a ## n0, \
-                             v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
+   (g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )

 #define PI_ALL_4W   do { \
      a0  = g0; \
@@ -312,7 +311,7 @@ do { \
      BUPDATE1_8W( 7, 1 ); \
 } while (0)

-#if defined(__AVX512VL__)
+#if defined(VL256)

 #define GAMMA_8W(n0, n1, n2, n4)   \
   ( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )  
--- a/algo/panama/panama-hash-4way.h
+++ b/algo/panama/panama-hash-4way.h
@@ -18,11 +18,14 @@ typedef struct {
 } panama_4way_context __attribute__ ((aligned (64)));

 void panama_4way_init( void *cc );
-
 void panama_4way_update( void *cc, const void *data, size_t len );
-
 void panama_4way_close( void *cc, void *dst );

+#define panama_4x32_context panama_4way_context
+#define panama_4x32_init    panama_4way_init
+#define panama_4x32_update  panama_4way_update
+#define panama_4x32_close   panama_4way_close
+
 #if defined(__AVX2__)

 typedef struct {
@@ -34,10 +37,13 @@ typedef struct {
 } panama_8way_context __attribute__ ((aligned (128)));

 void panama_8way_init( void *cc );
-
 void panama_8way_update( void *cc, const void *data, size_t len );
-
 void panama_8way_close( void *cc, void *dst );

+#define panama_8x32_context panama_8way_context
+#define panama_8x32_init    panama_8way_init
+#define panama_8x32_update  panama_8way_update
+#define panama_8x32_close   panama_8way_close
+
 #endif
 #endif
--- a/algo/quark/anime-gate.h
+++ b/algo/quark/anime-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define ANIME_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define ANIME_4WAY 1
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -11,7 +11,6 @@
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/shavite/shavite-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
@@ -32,20 +31,20 @@

 union _hmq1725_8way_context_overlay
 {
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
+    blake512_8x64_context   blake;
+    bmw512_8x64_context     bmw;
+    skein512_8x64_context   skein;
+    jh512_8x64_context      jh;
+    keccak512_8x64_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    simd_4way_context       simd;
-    hamsi512_8way_context   hamsi;
+    hamsi512_8x64_context   hamsi;
    hashState_fugue         fugue;
-    shabal512_8way_context  shabal;
+    shabal512_8x32_context  shabal;
    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
-    haval256_5_8way_context haval;
+    sha512_8x64_context     sha512;
+    haval256_8x32_context   haval;
 #if defined(__VAES__)
    groestl512_4way_context groestl;
    shavite512_4way_context shavite;
@@ -82,7 +81,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   __m512i* vhB = (__m512i*)vhashB;
   __m512i* vhC = (__m512i*)vhashC;

-   bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
+   bmw512_8x64_full( &ctx.bmw, vhash, input, 80 );

   dintrlv_8x64_512( hash0, hash1, hash2, hash3,
                     hash4, hash5, hash6, hash7, vhash );
@@ -142,26 +141,26 @@ extern void hmq1725_8way_hash(void *state, const void *input)

   // B
   if ( likely( vh_mask & 0xff ) )
-       skein512_8way_full( &ctx.skein, vhashB, vhash, 64 );
+       skein512_8x64_full( &ctx.skein, vhashB, vhash, 64 );

   mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );

-   jh512_8way_init( &ctx.jh );
-   jh512_8way_update( &ctx.jh, vhash, 64 );
-   jh512_8way_close( &ctx.jh, vhash );
+   jh512_8x64_init( &ctx.jh );
+   jh512_8x64_update( &ctx.jh, vhash, 64 );
+   jh512_8x64_close( &ctx.jh, vhash );

-   keccak512_8way_init( &ctx.keccak );
-   keccak512_8way_update( &ctx.keccak, vhash, 64 );
-   keccak512_8way_close( &ctx.keccak, vhash );
+   keccak512_8x64_init( &ctx.keccak );
+   keccak512_8x64_update( &ctx.keccak, vhash, 64 );
+   keccak512_8x64_close( &ctx.keccak, vhash );

   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

   // A
   if ( ( vh_mask & 0xff ) != 0xff )
-       blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
+       blake512_8x64_full( &ctx.blake, vhashA, vhash, 64 );
   // B
   if ( vh_mask & 0xff )
-       bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 );
+       bmw512_8x64_full( &ctx.bmw, vhashB, vhash, 64 );

   mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -177,16 +176,16 @@ extern void hmq1725_8way_hash(void *state, const void *input)

   if ( likely( ( vh_mask & 0xff ) != 0xff ) )
   {
-      keccak512_8way_init( &ctx.keccak );
-      keccak512_8way_update( &ctx.keccak, vhash, 64 );
-      keccak512_8way_close( &ctx.keccak, vhashA );
+      keccak512_8x64_init( &ctx.keccak );
+      keccak512_8x64_update( &ctx.keccak, vhash, 64 );
+      keccak512_8x64_close( &ctx.keccak, vhashA );
   }

   if ( likely( vh_mask & 0xff ) )
   {
-      jh512_8way_init( &ctx.jh );
-      jh512_8way_update( &ctx.jh, vhash, 64 );
-      jh512_8way_close( &ctx.jh, vhashB );
+      jh512_8x64_init( &ctx.jh );
+      jh512_8x64_update( &ctx.jh, vhash, 64 );
+      jh512_8x64_close( &ctx.jh, vhashB );
   }

   mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -252,9 +251,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   // B
   if ( likely( vh_mask & 0xff ) )
   {
-      haval256_5_8way_init( &ctx.haval );
-      haval256_5_8way_update( &ctx.haval, vhash, 64 );
-      haval256_5_8way_close( &ctx.haval, vhash );
+      haval256_8x32_init( &ctx.haval );
+      haval256_8x32_update( &ctx.haval, vhash, 64 );
+      haval256_8x32_close( &ctx.haval, vhash );
      memset( &vhash[8<<3], 0, 32<<3 );
      rintrlv_8x32_8x64( vhashB, vhash, 512 );
   }
@@ -297,7 +296,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)

 #endif

-   blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
+   blake512_8x64_full( &ctx.blake, vhash, vhash, 64 );

   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

@@ -352,9 +351,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)

   mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );

-   hamsi512_8way_init( &ctx.hamsi );
-   hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
-   hamsi512_8way_close( &ctx.hamsi, vhash );
+   hamsi512_8x64_init( &ctx.hamsi );
+   hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
+   hamsi512_8x64_close( &ctx.hamsi, vhash );

   dintrlv_8x64_512( hash0, hash1, hash2, hash3,
                     hash4, hash5, hash6, hash7, vhash );
@@ -430,9 +429,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)

   rintrlv_8x64_8x32( vhashA, vhash, 512 );

-   shabal512_8way_init( &ctx.shabal );
-   shabal512_8way_update( &ctx.shabal, vhashA, 64 );
-   shabal512_8way_close( &ctx.shabal, vhash );
+   shabal512_8x32_init( &ctx.shabal );
+   shabal512_8x32_update( &ctx.shabal, vhashA, 64 );
+   shabal512_8x32_close( &ctx.shabal, vhash );

   dintrlv_8x32_512( hash0, hash1, hash2, hash3,
                     hash4, hash5, hash6, hash7, vhash );
@@ -475,9 +474,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   // B
   if ( likely( vh_mask & 0xff ) )
   {
-      sha512_8way_init( &ctx.sha512 );
-      sha512_8way_update( &ctx.sha512, vhash, 64 );
-      sha512_8way_close( &ctx.sha512, vhashB );
+      sha512_8x64_init( &ctx.sha512 );
+      sha512_8x64_update( &ctx.sha512, vhash, 64 );
+      sha512_8x64_close( &ctx.sha512, vhashB );
   }

   mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -510,9 +509,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   
 #endif

-   sha512_8way_init( &ctx.sha512 );
-   sha512_8way_update( &ctx.sha512, vhash, 64 );
-   sha512_8way_close( &ctx.sha512, vhash );
+   sha512_8x64_init( &ctx.sha512 );
+   sha512_8x64_update( &ctx.sha512, vhash, 64 );
+   sha512_8x64_close( &ctx.sha512, vhash );

   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
   dintrlv_8x64_512( hash0, hash1, hash2, hash3,
@@ -523,9 +522,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   {
      intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                         hash7 );
-      haval256_5_8way_init( &ctx.haval );
-      haval256_5_8way_update( &ctx.haval, vhash, 64 );
-      haval256_5_8way_close( &ctx.haval, vhash );
+      haval256_8x32_init( &ctx.haval );
+      haval256_8x32_update( &ctx.haval, vhash, 64 );
+      haval256_8x32_close( &ctx.haval, vhash );
      memset( &vhash[8<<3], 0, 32<<3 );
      rintrlv_8x32_8x64( vhashA, vhash, 512 );
   }
@@ -552,9 +551,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
                    hash7 );
   mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );

-   bmw512_8way_init( &ctx.bmw );
-   bmw512_8way_update( &ctx.bmw, vhash, 64 );
-   bmw512_8way_close( &ctx.bmw, state );
+   bmw512_8x64_init( &ctx.bmw );
+   bmw512_8x64_update( &ctx.bmw, vhash, 64 );
+   bmw512_8x64_close( &ctx.bmw, state );
 }

 int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
@@ -606,27 +605,27 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,

 union _hmq1725_4way_context_overlay
 {
-    blake512_4way_context   blake;
-    bmw512_4way_context     bmw;
+    blake512_4x64_context   blake;
+    bmw512_4x64_context     bmw;
    hashState_groestl       groestl;
-    skein512_4way_context   skein;
-    jh512_4way_context      jh;
-    keccak512_4way_context  keccak;
+    skein512_4x64_context   skein;
+    jh512_4x64_context      jh;
+    keccak512_4x64_context  keccak;
    hashState_luffa         luffa;
    luffa_2way_context      luffa2;
    cubehashParam           cube;
    cube_2way_context       cube2;
    sph_shavite512_context  shavite;
-    hashState_sd            sd;
+    simd512_context         simd;
    shavite512_2way_context shavite2;
-    simd_2way_context       simd;
+    simd_2way_context       simd_2way;
    hashState_echo          echo;
-    hamsi512_4way_context   hamsi;
+    hamsi512_4x64_context   hamsi;
    hashState_fugue         fugue;
-    shabal512_4way_context  shabal;
+    shabal512_4x32_context  shabal;
    sph_whirlpool_context   whirlpool;
-    sha512_4way_context     sha512;
-    haval256_5_4way_context haval;
+    sha512_4x64_context     sha512;
+    haval256_4x32_context   haval;
 #if defined(__VAES__)
    groestl512_2way_context groestl2;
    echo_2way_context       echo2;
@@ -653,9 +652,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
   __m256i* vhA = (__m256i*)vhashA;
   __m256i* vhB = (__m256i*)vhashB;

-   bmw512_4way_init( &ctx.bmw );
-   bmw512_4way_update( &ctx.bmw, input, 80 );
-   bmw512_4way_close( &ctx.bmw, vhash );
+   bmw512_4x64_init( &ctx.bmw );
+   bmw512_4x64_update( &ctx.bmw, input, 80 );
+   bmw512_4x64_close( &ctx.bmw, vhash );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -687,17 +686,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
 // B

    if ( h_mask & 0xffffffff )
-       skein512_4way_full( &ctx.skein, vhashB, vhash, 64 );
+       skein512_4x64_full( &ctx.skein, vhashB, vhash, 64 );

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

-    jh512_4way_init( &ctx.jh );
-    jh512_4way_update( &ctx.jh, vhash, 64 );
-    jh512_4way_close( &ctx.jh, vhash );
+    jh512_4x64_init( &ctx.jh );
+    jh512_4x64_update( &ctx.jh, vhash, 64 );
+    jh512_4x64_close( &ctx.jh, vhash );

-    keccak512_4way_init( &ctx.keccak );
-    keccak512_4way_update( &ctx.keccak, vhash, 64 );
-    keccak512_4way_close( &ctx.keccak, vhash );
+    keccak512_4x64_init( &ctx.keccak );
+    keccak512_4x64_update( &ctx.keccak, vhash, 64 );
+    keccak512_4x64_close( &ctx.keccak, vhash );

 // second fork, A = blake parallel, B= bmw parallel.
    
@@ -705,13 +704,13 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    h_mask = _mm256_movemask_epi8( vh_mask );

    if ( ( h_mask & 0xffffffff ) != 0xffffffff )
-       blake512_4way_full( &ctx.blake, vhashA, vhash, 64 );
+       blake512_4x64_full( &ctx.blake, vhashA, vhash, 64 );

    if ( h_mask & 0xffffffff )
    {
-       bmw512_4way_init( &ctx.bmw );
-       bmw512_4way_update( &ctx.bmw, vhash, 64 );
-       bmw512_4way_close( &ctx.bmw, vhashB );
+       bmw512_4x64_init( &ctx.bmw );
+       bmw512_4x64_update( &ctx.bmw, vhash, 64 );
+       bmw512_4x64_close( &ctx.bmw, vhashB );
    }

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -734,16 +733,16 @@ extern void hmq1725_4way_hash(void *state, const void *input)

    if ( ( h_mask & 0xffffffff ) != 0xffffffff )
    {
-        keccak512_4way_init( &ctx.keccak );
-        keccak512_4way_update( &ctx.keccak, vhash, 64 );
-        keccak512_4way_close( &ctx.keccak, vhashA );
+        keccak512_4x64_init( &ctx.keccak );
+        keccak512_4x64_update( &ctx.keccak, vhash, 64 );
+        keccak512_4x64_close( &ctx.keccak, vhashA );
    }

    if ( h_mask & 0xffffffff )
    {
-        jh512_4way_init( &ctx.jh );
-        jh512_4way_update( &ctx.jh, vhash, 64 );
-        jh512_4way_close( &ctx.jh, vhashB );
+        jh512_4x64_init( &ctx.jh );
+        jh512_4x64_update( &ctx.jh, vhash, 64 );
+        jh512_4x64_close( &ctx.jh, vhashB );
    }

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
    shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );

-    simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
-    simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
+    simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
+    simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );

    rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );     

@@ -779,9 +778,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    // B
    if ( h_mask & 0xffffffff )
    {
-       haval256_5_4way_init( &ctx.haval );
-       haval256_5_4way_update( &ctx.haval, vhash, 64 );
-       haval256_5_4way_close( &ctx.haval, vhash );
+       haval256_4x32_init( &ctx.haval );
+       haval256_4x32_update( &ctx.haval, vhash, 64 );
+       haval256_4x32_close( &ctx.haval, vhash );
       memset( &vhash[8<<2], 0, 32<<2 );
       rintrlv_4x32_4x64( vhashB, vhash, 512 );
    }
@@ -814,7 +813,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)

 #endif

-    blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
+    blake512_4x64_full( &ctx.blake, vhash, vhash, 64 );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -846,9 +845,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

-    hamsi512_4way_init( &ctx.hamsi );
-    hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
-    hamsi512_4way_close( &ctx.hamsi, vhash );
+    hamsi512_4x64_init( &ctx.hamsi );
+    hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
+    hamsi512_4x64_close( &ctx.hamsi, vhash );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -869,47 +868,31 @@ extern void hmq1725_4way_hash(void *state, const void *input)
       echo_full( &ctx.echo, (BitSequence *)hash0, 512,
                       (const BitSequence *)hash0, 64 );
    else
-    {
-       init_sd( &ctx.sd, 512 );
-       update_final_sd( &ctx.sd, (BitSequence *)hash0,
-                           (const BitSequence *)hash0, 512 );
-    }
+       simd512_ctx( &ctx.simd, hash0, hash0, 64 );

   if ( hash1[0] & mask ) //4
       echo_full( &ctx.echo, (BitSequence *)hash1, 512,
                       (const BitSequence *)hash1, 64 );
   else
-   {
-       init_sd( &ctx.sd, 512 );
-       update_final_sd( &ctx.sd, (BitSequence *)hash1,
-                           (const BitSequence *)hash1, 512 );
-   }
+       simd512_ctx( &ctx.simd, hash1, hash1, 64 );

   if ( hash2[0] & mask ) //4
       echo_full( &ctx.echo, (BitSequence *)hash2, 512,
                       (const BitSequence *)hash2, 64 );
   else
-   {
-       init_sd( &ctx.sd, 512 );
-       update_final_sd( &ctx.sd, (BitSequence *)hash2,
-                           (const BitSequence *)hash2, 512 );
-   }
+       simd512_ctx( &ctx.simd, hash2, hash2, 64 );

   if ( hash3[0] & mask ) //4
       echo_full( &ctx.echo, (BitSequence *)hash3, 512,
                       (const BitSequence *)hash3, 64 );
   else
-   {
-       init_sd( &ctx.sd, 512 );
-       update_final_sd( &ctx.sd, (BitSequence *)hash3,
-                           (const BitSequence *)hash3, 512 );
-   }
+       simd512_ctx( &ctx.simd, hash3, hash3, 64 );

   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

-   shabal512_4way_init( &ctx.shabal );
-   shabal512_4way_update( &ctx.shabal, vhash, 64 );
-   shabal512_4way_close( &ctx.shabal, vhash );
+   shabal512_4x32_init( &ctx.shabal );
+   shabal512_4x32_update( &ctx.shabal, vhash, 64 );
+   shabal512_4x32_close( &ctx.shabal, vhash );

   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -938,9 +921,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

   if ( h_mask & 0xffffffff )
   {
-      sha512_4way_init( &ctx.sha512 );
-      sha512_4way_update( &ctx.sha512, vhash, 64 );
-      sha512_4way_close( &ctx.sha512, vhashB );
+      sha512_4x64_init( &ctx.sha512 );
+      sha512_4x64_update( &ctx.sha512, vhash, 64 );
+      sha512_4x64_close( &ctx.sha512, vhashB );
   }

   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -967,9 +950,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

 #endif

-   sha512_4way_init( &ctx.sha512 ); 
-   sha512_4way_update( &ctx.sha512, vhash, 64 );
-   sha512_4way_close( &ctx.sha512, vhash ); 
+   sha512_4x64_init( &ctx.sha512 );
+   sha512_4x64_update( &ctx.sha512, vhash, 64 );
+   sha512_4x64_close( &ctx.sha512, vhash );

 // A = haval parallel, B = Whirlpool serial

@@ -981,9 +964,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

   if ( ( h_mask & 0xffffffff ) != 0xffffffff )
   {
-      haval256_5_4way_init( &ctx.haval );
-      haval256_5_4way_update( &ctx.haval, vhash, 64 );
-      haval256_5_4way_close( &ctx.haval, vhash );
+      haval256_4x32_init( &ctx.haval );
+      haval256_4x32_update( &ctx.haval, vhash, 64 );
+      haval256_4x32_close( &ctx.haval, vhash );
      memset( &vhash[8<<2], 0, 32<<2 );
      rintrlv_4x32_4x64( vhashA, vhash, 512 );
   }
@@ -1001,9 +984,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)

   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

-   bmw512_4way_init( &ctx.bmw );
-   bmw512_4way_update( &ctx.bmw, vhash, 64 );
-   bmw512_4way_close( &ctx.bmw, state );
+   bmw512_4x64_init( &ctx.bmw );
+   bmw512_4x64_update( &ctx.bmw, vhash, 64 );
+   bmw512_4x64_close( &ctx.bmw, state );
 }

 int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
--- a/algo/quark/hmq1725-gate.c
+++ b/algo/quark/hmq1725-gate.c
@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_hmq1725_4way;
  gate->hash      = (void*)&hmq1725_4way_hash;
 #else
-  init_hmq1725_ctx();
  gate->scanhash  = (void*)&scanhash_hmq1725;
  gate->hash      = (void*)&hmq1725hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
 };
--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define HMQ1725_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define HMQ1725_4WAY 1
@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
 void hmq1725hash( void *state, const void *input );
 int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
-void init_hmq1725_ctx();

 #endif

--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -4,367 +4,273 @@

 #include <string.h>
 #include <stdint.h>
-#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
-#include "algo/groestl/sph_groestl.h"
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+  #include "algo/fugue/fugue-aesni.h"
+#else
+  #include "algo/fugue/sph_fugue.h"
+#endif
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+  #include "algo/echo/aes_ni/hash_api.h"
+#else
+  #include "algo/groestl/sph_groestl.h"
+  #include "algo/echo/sph_echo.h"
+#endif
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/sph-haval.h"
 #include "algo/sha/sph_sha2.h"
-#if defined(__AES__)
-  #include "algo/groestl/aes_ni/hash-groestl.h"
-  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/fugue/fugue-aesni.h"
-#else
-  #include "algo/groestl/sph_groestl.h"
-  #include "algo/echo/sph_echo.h"
-  #include "algo/fugue/sph_fugue.h"
-#endif
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
-  #include "algo/simd/sph_simd.h"
-#else
-  #include "algo/simd/nist.h"
-#endif
+#include "algo/simd/simd-hash-2way.h"

-typedef struct {
-  sph_blake512_context    blake1, blake2;
-  sph_bmw512_context      bmw1, bmw2, bmw3;
-  sph_skein512_context    skein1, skein2;
-  sph_jh512_context       jh1, jh2;
-  sph_keccak512_context   keccak1, keccak2;
-  hashState_luffa         luffa1, luffa2;
-  cubehashParam           cube;
-  sph_shavite512_context  shavite1, shavite2;
-#if defined(__aarch64__)
-  sph_simd512_context     simd1, simd2;
-#else
-  hashState_sd            simd1, simd2;
-#endif
-  sph_hamsi512_context    hamsi1;
-  sph_shabal512_context   shabal1;
-  sph_whirlpool_context   whirlpool1, whirlpool2, whirlpool3, whirlpool4;
-  sph_sha512_context      sha1, sha2;
-  sph_haval256_5_context  haval1, haval2;
-#if defined(__AES__)
-  hashState_echo          echo1, echo2;
-  hashState_groestl       groestl1, groestl2;
-  hashState_fugue         fugue1, fugue2;
-#else
-  sph_groestl512_context  groestl1, groestl2;
-  sph_echo512_context     echo1, echo2;
-  sph_fugue512_context    fugue1, fugue2;
-#endif
-} hmq1725_ctx_holder;
-
-static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
-static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
-
-void init_hmq1725_ctx()
+union _hmq1725_ctx_holder
 {
-    sph_blake512_init(&hmq1725_ctx.blake1);
-    sph_blake512_init(&hmq1725_ctx.blake2);
-
-    sph_bmw512_init(&hmq1725_ctx.bmw1);
-    sph_bmw512_init(&hmq1725_ctx.bmw2);
-    sph_bmw512_init(&hmq1725_ctx.bmw3);
-
-    sph_skein512_init(&hmq1725_ctx.skein1);
-    sph_skein512_init(&hmq1725_ctx.skein2);
-
-    sph_jh512_init(&hmq1725_ctx.jh1);
-    sph_jh512_init(&hmq1725_ctx.jh2);
-
-    sph_keccak512_init(&hmq1725_ctx.keccak1);
-    sph_keccak512_init(&hmq1725_ctx.keccak2);
-
-    init_luffa( &hmq1725_ctx.luffa1, 512 );
-    init_luffa( &hmq1725_ctx.luffa2, 512 );
-
-    cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
-
-    sph_shavite512_init(&hmq1725_ctx.shavite1);
-    sph_shavite512_init(&hmq1725_ctx.shavite2);
-
-#if defined(__aarch64__)
-    sph_simd512_init(&hmq1725_ctx.simd1);
-    sph_simd512_init(&hmq1725_ctx.simd2);
-#else    
-    init_sd( &hmq1725_ctx.simd1, 512 );
-    init_sd( &hmq1725_ctx.simd2, 512 );
-#endif
-
-    sph_hamsi512_init(&hmq1725_ctx.hamsi1);
-
-#if defined(__AES__)
-    fugue512_Init( &hmq1725_ctx.fugue1, 512 );
-    fugue512_Init( &hmq1725_ctx.fugue2, 512 );
+   blake512_context        blake;
+   sph_bmw512_context      bmw;
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   hashState_fugue         fugue;
 #else
-    sph_fugue512_init(&hmq1725_ctx.fugue1);
-    sph_fugue512_init(&hmq1725_ctx.fugue2);
+   sph_fugue512_context    fugue;
 #endif
-
-    sph_shabal512_init(&hmq1725_ctx.shabal1);
-
-    sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
-    sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
-    sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
-    sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
-
-    sph_sha512_init( &hmq1725_ctx.sha1 );
-    sph_sha512_init( &hmq1725_ctx.sha2 );
-
-    sph_haval256_5_init(&hmq1725_ctx.haval1);
-    sph_haval256_5_init(&hmq1725_ctx.haval2);
-
-#if defined(__AES__)
-     init_echo( &hmq1725_ctx.echo1, 512 );
-     init_echo( &hmq1725_ctx.echo2, 512 );
-     init_groestl( &hmq1725_ctx.groestl1, 64 );
-     init_groestl( &hmq1725_ctx.groestl2, 64 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   hashState_groestl       groestl;
+   hashState_echo          echo;
 #else
-     sph_groestl512_init( &hmq1725_ctx.groestl1 );
-     sph_groestl512_init( &hmq1725_ctx.groestl2 );
-     sph_echo512_init( &hmq1725_ctx.echo1 );
-     sph_echo512_init( &hmq1725_ctx.echo2 );
+   sph_groestl512_context  groestl;
+   sph_echo512_context     echo;
 #endif
-}
-
-void hmq_bmw512_midstate( const void* input )
-{
-    memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
-    sph_bmw512( &hmq_bmw_mid, input, 64 );
-}
-
-__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
+   sph_skein512_context    skein;
+   sph_jh512_context       jh;
+   sph_keccak512_context   keccak;
+   hashState_luffa         luffa;
+   cubehashParam           cube;
+   sph_shavite512_context  shavite;
+   simd512_context         simd;
+   sph_hamsi512_context    hamsi;
+   sph_shabal512_context   shabal;
+   sph_whirlpool_context   whirlpool;
+   sph_sha512_context      sha;
+   sph_haval256_5_context  haval;
+};
+typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;

 extern void hmq1725hash(void *state, const void *input)
 {
    const uint32_t mask = 24;
-    uint32_t hashA[32] __attribute__((aligned(64)));
-    uint32_t hashB[32] __attribute__((aligned(64)));
-    const int midlen = 64;            // bytes
-    const int tail   = 80 - midlen;   // 16
+    uint32_t hashA[32] __attribute__((aligned(32)));
+    uint32_t hashB[32] __attribute__((aligned(32)));
+    hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));

-    memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
+    sph_bmw512_init( &ctx.bmw );
+    sph_bmw512( &ctx.bmw, input, 80 );
+    sph_bmw512_close( &ctx.bmw, hashA );   //1

-    memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
-    sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
-    sph_bmw512_close(&h_ctx.bmw1, hashA);   //1
-
-    sph_whirlpool (&h_ctx.whirlpool1, hashA, 64);    //0
-    sph_whirlpool_close(&h_ctx.whirlpool1, hashB);   //1
+    sph_whirlpool_init( &ctx.whirlpool );
+    sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //0
+    sph_whirlpool_close( &ctx.whirlpool, hashB );   //1

    if ( hashB[0] & mask )   //1
    {
-#if defined(__AES__)
-     update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
-                               (const char*)hashB, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+       groestl512_full( &ctx.groestl, hashA, hashB, 512 );
 #else
-     sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
-     sph_groestl512_close(&h_ctx.groestl1, hashA); //2
+       sph_groestl512_init( &ctx.groestl );
+       sph_groestl512( &ctx.groestl, hashB, 64 ); //1
+       sph_groestl512_close( &ctx.groestl, hashA ); //2
 #endif
    }
    else
    {
-      sph_skein512 (&h_ctx.skein1, hashB, 64); //1
-      sph_skein512_close(&h_ctx.skein1, hashA); //2
+      sph_skein512_init( &ctx.skein );
+      sph_skein512( &ctx.skein, hashB, 64 ); //1
+      sph_skein512_close( &ctx.skein, hashA ); //2
    }
 	
-    sph_jh512 (&h_ctx.jh1, hashA, 64); //3
-    sph_jh512_close(&h_ctx.jh1, hashB); //4
+    sph_jh512_init( &ctx.jh );
+    sph_jh512( &ctx.jh, hashA, 64 ); //3
+    sph_jh512_close( &ctx.jh, hashB ); //4

-    sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
-    sph_keccak512_close(&h_ctx.keccak1, hashA); //3
+    sph_keccak512_init( &ctx.keccak );
+    sph_keccak512( &ctx.keccak, hashB, 64 ); //2
+    sph_keccak512_close( &ctx.keccak, hashA ); //3

    if ( hashA[0] & mask ) //4
    {
-        sph_blake512 (&h_ctx.blake1, hashA, 64); //
-        sph_blake512_close(&h_ctx.blake1, hashB); //5
+        blake512_init( &ctx.blake );
+        blake512_update( &ctx.blake, hashA, 64 );
+        blake512_close( &ctx.blake, hashB );
    }
    else
    {
-        sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
-        sph_bmw512_close(&h_ctx.bmw2, hashB);   //5
+        sph_bmw512_init( &ctx.bmw );
+        sph_bmw512( &ctx.bmw, hashA, 64 ); //4
+        sph_bmw512_close( &ctx.bmw, hashB );   //5
    }
    
-     update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
+    luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );

-     cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
+    cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );

    if ( hashB[0] & mask ) //7
    {
-        sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
-        sph_keccak512_close(&h_ctx.keccak2, hashA); //8
+        sph_keccak512_init( &ctx.keccak );
+        sph_keccak512( &ctx.keccak, hashB, 64 ); //
+        sph_keccak512_close( &ctx.keccak, hashA ); //8
    }
    else
    {
-        sph_jh512 (&h_ctx.jh2, hashB, 64); //7
-        sph_jh512_close(&h_ctx.jh2, hashA); //8
+        sph_jh512_init( &ctx.jh );
+        sph_jh512( &ctx.jh, hashB, 64 ); //7
+        sph_jh512_close( &ctx.jh, hashA ); //8
    }

-    sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
-    sph_shavite512_close(&h_ctx.shavite1, hashB); //4
+    sph_shavite512_init( &ctx.shavite );
+    sph_shavite512( &ctx.shavite, hashA, 64 ); //3
+    sph_shavite512_close( &ctx.shavite, hashB ); //4

-#if defined(__aarch64__)
-    sph_simd512 (&h_ctx.simd1, hashB, 64); //3
-    sph_simd512_close(&h_ctx.simd1, hashA); //4
-#else    
-    update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
-                                   (const BitSequence *)hashB, 512 );
-#endif
+    simd512_ctx( &ctx.simd, hashA, hashB, 64 );

    if ( hashA[0] & mask ) //4
    {
-        sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
-        sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
+        sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
    }
    else
    {
-        sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
-        sph_haval256_5_close(&h_ctx.haval1, hashB);   //5
+        sph_haval256_5_init( &ctx.haval );
+        sph_haval256_5( &ctx.haval, hashA, 64 ); //4
+        sph_haval256_5_close( &ctx.haval, hashB );   //5
        memset(&hashB[8], 0, 32);
    }

-#if defined(__AES__)
-    update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
-                        (const BitSequence *)hashB, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+    echo_full( &ctx.echo, hashA, 512, hashB, 64 );
 #else
-    sph_echo512 (&h_ctx.echo1, hashB, 64); //5
-    sph_echo512_close(&h_ctx.echo1, hashA); //6
+    sph_echo512_init( &ctx.echo );
+    sph_echo512( &ctx.echo, hashB, 64 ); //5
+    sph_echo512_close( &ctx.echo, hashA ); //6
 #endif

-    sph_blake512 (&h_ctx.blake2, hashA, 64); //6
-    sph_blake512_close(&h_ctx.blake2, hashB); //7
+    blake512_init( &ctx.blake );
+    blake512_update( &ctx.blake, hashA, 64 );
+    blake512_close( &ctx.blake, hashB );

    if ( hashB[0] & mask ) //7
    {
-        sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
-        sph_shavite512_close(&h_ctx.shavite2, hashA); //8
+       sph_shavite512_init( &ctx.shavite );
+       sph_shavite512( &ctx.shavite, hashB, 64 ); //
+       sph_shavite512_close( &ctx.shavite, hashA ); //8
    }
    else
-    {
-     update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
-    }
+       luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );

-    sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
-    sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
+    sph_hamsi512_init( &ctx.hamsi );
+    sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
+    sph_hamsi512_close( &ctx.hamsi, hashB ); //4

-#if defined(__AES__)
-    fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2   ////
-    fugue512_Final( &h_ctx.fugue1, hashA ); //3 
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+    fugue512_full( &ctx.fugue, hashA, hashB, 64 );
 #else
-    sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2   ////
-    sph_fugue512_close(&h_ctx.fugue1, hashA); //3 
+    sph_fugue512_init( &ctx.fugue );
+    sph_fugue512( &ctx.fugue, hashB, 64 ); //2   ////
+    sph_fugue512_close( &ctx.fugue, hashA ); //3 
 #endif

    if ( hashA[0] & mask ) //4
    {
-#if defined(__AES__)
-     update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
-                         (const BitSequence *)hashA, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+       echo_full( &ctx.echo, hashB, 512, hashA, 64 );
 #else
-     sph_echo512 (&h_ctx.echo2, hashA, 64); //
-     sph_echo512_close(&h_ctx.echo2, hashB); //5
+       sph_echo512_init( &ctx.echo );
+       sph_echo512( &ctx.echo, hashA, 64 ); //
+       sph_echo512_close( &ctx.echo, hashB ); //5
 #endif
    }
    else
-    {
-#if defined(__aarch64__)
-    sph_simd512(&h_ctx.simd2, hashA, 64); //6
-    sph_simd512_close(&h_ctx.simd2, hashB); //7
-#else
-    update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
-                      (const BitSequence *)hashA, 512 );
-#endif
-    }
+       simd512_ctx( &ctx.simd, hashB, hashA, 64 );

-    sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
-    sph_shabal512_close(&h_ctx.shabal1, hashA); //6
+    sph_shabal512_init( &ctx.shabal );
+    sph_shabal512( &ctx.shabal, hashB, 64 ); //5
+    sph_shabal512_close( &ctx.shabal, hashA ); //6

-    sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
-    sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
+    sph_whirlpool_init( &ctx.whirlpool );
+    sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
+    sph_whirlpool_close( &ctx.whirlpool, hashB ); //7

    if ( hashB[0] & mask ) //7
    {
-#if defined(__AES__)
-        fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
-        fugue512_Final( &h_ctx.fugue2, hashA ); //8
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+       fugue512_full( &ctx.fugue, hashA, hashB, 64 );
 #else
-        sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
-        sph_fugue512_close(&h_ctx.fugue2, hashA); //8
+       sph_fugue512_init( &ctx.fugue );
+       sph_fugue512( &ctx.fugue, hashB, 64 ); //
+       sph_fugue512_close( &ctx.fugue, hashA ); //8
 #endif
    }
    else
    {
-        sph_sha512( &h_ctx.sha1, hashB, 64 );
-        sph_sha512_close( &h_ctx.sha1, hashA );
+       sph_sha512_init( &ctx.sha );
+       sph_sha512( &ctx.sha, hashB, 64 );
+       sph_sha512_close( &ctx.sha, hashA );
    }

-#if defined(__AES__)
-    update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
-                               (const char*)hashA, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+    groestl512_full( &ctx.groestl, hashB, hashA, 512 );
 #else
-    sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
-    sph_groestl512_close(&h_ctx.groestl2, hashB); //4
+    sph_groestl512_init( &ctx.groestl );
+    sph_groestl512( &ctx.groestl, hashA, 64 ); //3
+    sph_groestl512_close( &ctx.groestl, hashB ); //4
 #endif

-    sph_sha512( &h_ctx.sha2, hashB, 64 );
-    sph_sha512_close( &h_ctx.sha2, hashA );
+    sph_sha512_init( &ctx.sha );
+    sph_sha512( &ctx.sha, hashB, 64 );
+    sph_sha512_close( &ctx.sha, hashA );

    if ( hashA[0] & mask ) //4
    {
-        sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
-        sph_haval256_5_close(&h_ctx.haval2, hashB); //5
-	memset(&hashB[8], 0, 32);
+        sph_haval256_5_init( &ctx.haval );
+        sph_haval256_5( &ctx.haval, hashA, 64 ); //
+        sph_haval256_5_close( &ctx.haval, hashB ); //5
+        memset( &hashB[8], 0, 32 );
    }
    else
    {
-        sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
-        sph_whirlpool_close(&h_ctx.whirlpool4, hashB);   //5
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
+        sph_whirlpool_close( &ctx.whirlpool, hashB );   //5
    }

-    sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
-    sph_bmw512_close(&h_ctx.bmw3, hashA); //6
+    sph_bmw512_init( &ctx.bmw );
+    sph_bmw512( &ctx.bmw, hashB, 64 ); //5
+    sph_bmw512_close( &ctx.bmw, hashA ); //6

-	memcpy(state, hashA, 32);
+	memcpy( state, hashA, 32 );
 }

 int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-//        uint32_t endiandata[32] __attribute__((aligned(64)));
-        uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(64)));
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t endiandata[20] __attribute__((aligned(32)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   int thr_id = mythr->id;  // thr_id arg is deprecated
-	//const uint32_t Htarg = ptarget[7];

 	//we need bigendian data...
-//        for (int k = 0; k < 32; k++)
-        for (int k = 0; k < 20; k++)
-                be32enc(&endiandata[k], pdata[k]);
+   for (int k = 0; k < 20; k++)
+         be32enc(&endiandata[k], pdata[k]);

-        hmq_bmw512_midstate( endiandata );
-
-//	if (opt_debug) 
-//	{
-//		applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
-//	}
-	
-	/* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
-	/* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
 	if (ptarget[7]==0) {
 		do {
 			pdata[19] = ++n;
--- a/algo/quark/quark-gate.c
+++ b/algo/quark/quark-gate.c
@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_quark;
  gate->hash      = (void*)&quark_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  return true;
 };

--- a/algo/quark/quark-gate.h
+++ b/algo/quark/quark-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
  #define QUARK_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define QUARK_4WAY 1
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -7,12 +7,12 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/groestl/aes_ni/hash-groestl.h"
 #else
  #include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
 void quark_hash(void *state, const void *input)
 {
   uint32_t hash[16] __attribute__((aligned(64)));
-   sph_blake512_context    ctx_blake;
+   blake512_context        ctx_blake;
   sph_bmw512_context      ctx_bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_groestl       ctx_groestl;
 #else
   sph_groestl512_context  ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
   sph_keccak512_context   ctx_keccak;
   uint32_t mask = 8;

-   sph_blake512_init( &ctx_blake );
-   sph_blake512( &ctx_blake, input, 80 );
-   sph_blake512_close( &ctx_blake, hash );
-
+   blake512_full( &ctx_blake, hash, input, 80 );
+   
   sph_bmw512_init( &ctx_bmw );
   sph_bmw512( &ctx_bmw, hash, 64 );
   sph_bmw512_close( &ctx_bmw, hash ); 

   if ( hash[0] & mask )
   {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
      init_groestl( &ctx_groestl, 64 );
      update_and_final_groestl( &ctx_groestl, (char*)hash,
                                        (const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
      sph_skein512_close( &ctx_skein, hash );
   }

-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   init_groestl( &ctx_groestl, 64 );
   update_and_final_groestl( &ctx_groestl, (char*)hash,
                                     (const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)

   if ( hash[0] & mask )
   {
-      sph_blake512_init( &ctx_blake );
-      sph_blake512( &ctx_blake, hash, 64 );
-      sph_blake512_close( &ctx_blake, hash );
+      blake512_full( &ctx_blake, hash, hash, 64 );
   }
   else
   {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	12480a3ea5	v25.6	2025-07-20 19:43:10 -04:00
Jay D Dee	aa47e880d5	v25.5	2025-07-09 01:32:38 -04:00
Jay D Dee	66191db93c	v25.4	2025-06-20 20:31:41 -04:00
Jay D Dee	dd99580a4c	v25.3	2025-01-16 12:31:53 -05:00
Jay D Dee	1ed18bf22e	v25.2	2025-01-12 18:58:21 -05:00
Jay D Dee	1d9341ee92	v25.1	2024-12-30 21:33:04 -05:00
Jay D Dee	a45a333b40	v24.8	2024-12-25 23:12:29 -05:00
Jay D Dee	2b1037a7c7	v24.7	2024-12-16 19:17:19 -05:00
Jay D Dee	06624a0ff2	v24.6	2024-12-08 11:14:08 -05:00
Jay D Dee	8e91bfbe19	v24.5	2024-09-13 14:14:57 -04:00
Jay D Dee	47e24b50e8	v24.4	2024-07-01 00:33:19 -04:00
Jay D Dee	c47c4a8885	v24.3	2024-05-28 18:20:19 -04:00
Jay D Dee	042d13d1e1	v24.2	2024-05-20 23:08:50 -04:00
Jay D Dee	4f930574cc	v24.1	2024-04-16 21:31:35 -04:00
Jay D Dee	9d3a46c355	v23.15	2023-11-30 14:36:47 -05:00
Jay D Dee	4e3f1b926f	v23.14	2023-11-28 00:58:43 -05:00
Jay D Dee	045b42babf	v23.13	2023-11-21 14:18:15 -05:00
Jay D Dee	fc696dbbe5	v23.12	2023-11-20 11:51:57 -05:00