Compare commits


9 Commits
v24.2 ... v25.3

Author SHA1 Message Date
Jay D Dee  dd99580a4c  v25.3  2025-01-16 12:31:53 -05:00
Jay D Dee  1ed18bf22e  v25.2  2025-01-12 18:58:21 -05:00
Jay D Dee  1d9341ee92  v25.1  2024-12-30 21:33:04 -05:00
Jay D Dee  a45a333b40  v24.8  2024-12-25 23:12:29 -05:00
Jay D Dee  2b1037a7c7  v24.7  2024-12-16 19:17:19 -05:00
Jay D Dee  06624a0ff2  v24.6  2024-12-08 11:14:08 -05:00
Jay D Dee  8e91bfbe19  v24.5  2024-09-13 14:14:57 -04:00
Jay D Dee  47e24b50e8  v24.4  2024-07-01 00:33:19 -04:00
Jay D Dee  c47c4a8885  v24.3  2024-05-28 18:20:19 -04:00
93 changed files with 3219 additions and 5323 deletions

View File

@@ -1,19 +1,39 @@
if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# different path for different CPU arch.
if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
JANSSON_INCLUDES=
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif
EXTRA_DIST = example-cfg.json nomacro.pl
else
SUBDIRS = compat
if WANT_JANSSON
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
EXTRA_INCLUDES =
EXTRA_LIBS =
endif
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
endif
bin_PROGRAMS = cpuminer
EXTRA_DIST = example-cfg.json nomacro.pl
dist_man_MANS = cpuminer.1
SUBDIRS = compat
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
@@ -166,9 +186,6 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
@@ -275,29 +292,29 @@ cpuminer_SOURCES = \
algo/yespower/yespower-opt.c \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c
disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif
if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags = -DNOASM
endif
if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if ARCH_ARM64
cpuminer_CFLAGS += -flax-vector-conversions
endif
if HAVE_WINDOWS
# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg
@@ -311,5 +328,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<
endif

View File

@@ -36,34 +36,18 @@ for compile instructions.
Requirements
------------
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.
Mobile CPUs, such as those in laptop computers, are not recommended because
they aren't designed for the extreme heat of operating at full load for
extended periods of time.
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. WindowsXP 64 bit is YMMV.
FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, IOS and alt OSs like Haiku & ReactOS are not supported.
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblocktemplate using http:// or https://.
Supported Algorithms
--------------------
@@ -71,9 +55,9 @@ Supported Algorithms
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
argon2d250
argon2d500
argon2d4096
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256

View File

@@ -75,6 +75,70 @@ If not what makes it happen or not happen?
Change Log
----------
v25.3
#442, #443: Fixed a regression in Makefile.am.
Updated dockerfile.
Removed algo features log display.
Some code cleanup.
v25.2
ARM: Fixed regression from v25.1 that could cause build fail.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with installed jansson library instead of compiling the included source code.
Windows: remove "_WIN32_WINNT=0x0601" which is a downgrade on Win11.
Changed build.sh shell from bash to sh.
v25.1
MacOS ARM64: m7m algo is now working.
MacOS ARM64: can now be compiled with GCC.
MacOS x86_64: is now working compiled with GCC.
Fixed some minor bugs & removed some obsolete code.
v24.8
ARM: Apple MacOS on M series CPU is now supported compiled from source
code, see Wiki for details.
ARM: Fix incorrect compiler version display when using clang.
build.sh can now be used to compile all targets, arm_build.sh & build_msys2.sh
have been removed.
Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
continue to be compiled with CPU groups disabled.
v24.7
ARM: compile works for Windows using MSys2 & MingW, see wiki for details.
v24.6
ARM: Fixed scryptn2, x16*, broken in v24.2.
ARM: Small improvement to interleaving.
Eliminated some potential compile errors in code that was dependent on
compiler optimisations.
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
v24.5
Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
#427: GBT: Improved handling of new work.
Removed shavite3 algo.
v24.4
x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
x86_64: fixed a bug in alignr macros for SSE2.
ARM: CPU feature reporting enhancements.
Some code cleanup.
v24.3
ARM: CPU feature detection and reporting is now working.
ARM: Verthash is now working.
ARM: Small speedup for yescrypt, yespower & argon2d.
Code cleanup.
v24.2
x86_64: Fixed blakes2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.

View File

@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
// gate->resync_threads = (void*)&do_nothing;
// gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
gate->get_work_data_size = (void*)&std_get_work_data_size;
gate->optimizations = EMPTY_SET;
@@ -295,8 +295,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
{
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
@@ -340,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
@@ -417,8 +416,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
// alias proper
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },

View File

@@ -98,7 +98,6 @@ typedef uint32_t set_t;
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
@@ -166,10 +165,10 @@ char* ( *malloc_txs_request ) ( struct work* );
void ( *set_work_data_endian ) ( struct work* );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
//bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( int, struct work* );
//void ( *resync_threads ) ( int, struct work* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );

View File

@@ -6,9 +6,7 @@ static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Inpu
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
// Credits
void argon2d_crds_hash( void *output, const void *input )
void argon2d250_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -34,7 +32,7 @@ void argon2d_crds_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -50,7 +48,7 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
do {
be32enc(&edata[19], nonce);
argon2d_crds_hash( hash, edata );
argon2d250_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
@@ -64,18 +62,16 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_argon2d_crds_algo( algo_gate_t* gate )
bool register_argon2d250_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->scanhash = (void*)&scanhash_argon2d250;
gate->hash = (void*)&argon2d250_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
// Dynamic
void argon2d_dyn_hash( void *output, const void *input )
void argon2d500_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -101,7 +97,7 @@ void argon2d_dyn_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -118,7 +114,7 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
argon2d_dyn_hash( hash, edata );
argon2d500_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
{
@@ -133,17 +129,15 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_argon2d_dyn_algo( algo_gate_t* gate )
bool register_argon2d500_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->scanhash = (void*)&scanhash_argon2d500;
gate->hash = (void*)&argon2d500_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
// Unitus
int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
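With the coin names gone, the entry points are named by their m_cost parameter, e.g. argon2d250_hash registered above. A minimal usage sketch, assuming the renamed function is linked into the build (error handling and endian conversion omitted):

#include <stdint.h>
#include <string.h>

void argon2d250_hash( void *output, const void *input );   // declared in the gate header below

int main(void)
{
   uint8_t  header[80];     // 80-byte block header, per INPUT_BYTES above
   uint32_t hash[8];        // 32-byte result, per OUTPUT_BYTES above

   memset( header, 0xab, sizeof header );   // dummy header data
   argon2d250_hash( hash, header );

   // In the miner, hash[7] is what scanhash_argon2d250 tests against the target.
   return hash[7] == 0 ? 0 : 1;
}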

View File

@@ -5,19 +5,19 @@
#include <stdint.h>
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d_crds_algo( algo_gate_t* gate );
bool register_argon2d250_algo( algo_gate_t* gate );
void argon2d_crds_hash( void *state, const void *input );
void argon2d250_hash( void *state, const void *input );
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d_dyn_algo( algo_gate_t* gate );
bool register_argon2d500_algo( algo_gate_t* gate );
void argon2d_dyn_hash( void *state, const void *input );
void argon2d500_hash( void *state, const void *input );
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
}
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
{
__m512i v[16], m[16];
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}
int blake2b_8way_init( blake2b_8way_ctx *ctx )
int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
{
size_t i;
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
}
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen )
{
__m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_8way_compress( ctx, 0 );
blake2b_8x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
}
}
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
casti_m512i( out, 0 ) = ctx->h[0];
casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
};
*/
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
{
__m256i v[16], m[16];
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
}
int blake2b_4way_init( blake2b_4way_ctx *ctx )
int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
{
size_t i;
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
return 0;
}
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen )
{
__m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_4way_compress( ctx, 0 );
blake2b_4x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
}
}
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
casti_m256i( out, 0 ) = ctx->h[0];
casti_m256i( out, 1 ) = ctx->h[1];

View File

@@ -1,6 +1,6 @@
#pragma once
#ifndef __BLAKE2B_HASH_4WAY_H__
#define __BLAKE2B_HASH_4WAY_H__
#ifndef BLAKE2B_HASH_4WAY_H__
#define BLAKE2B_HASH_4WAY_H__
#include "simd-utils.h"
#include <stddef.h>
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_8way_ctx;
} blake2b_8x64_ctx;
int blake2b_8way_init( blake2b_8way_ctx *ctx );
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final
#endif
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_4way_ctx;
} blake2b_4x64_ctx;
int blake2b_4way_init( blake2b_4way_ctx *ctx );
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final
#endif
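The header keeps the old blake2b_8way_* names as aliases for the new lane-width naming. A minimal sketch of the 8x64 (AVX-512) path, assuming the header above is in scope, the build has AVX-512 enabled, and the length argument counts bytes per lane:

#include <stddef.h>
#include "blake2b-hash-4way.h"   // header shown above; file name assumed

// Hash eight messages at once. Input must already be interleaved 8 ways,
// 64 bits per lane; output is eight interleaved 64-byte digests (512 bytes).
void blake2b_8x64_hash_sketch( void *out, const void *in_interleaved,
                               size_t bytes_per_lane )
{
   blake2b_8x64_ctx ctx;
   blake2b_8x64_init( &ctx );
   blake2b_8x64_update( &ctx, in_interleaved, bytes_per_lane );
   blake2b_8x64_final( &ctx, out );
}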

View File

@@ -11,8 +11,8 @@
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
//#pragma once
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#ifndef BLAKE2S_HASH_4WAY_H__
#define BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__) || defined(__ARM_NEON)

View File

@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
\
/* AddRoundConstant P1024 */\
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\

View File

@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
#endif
return true;
};

View File

@@ -387,7 +387,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
// Hamsi 8 way AVX512
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
// had a 3% faster overall hashrate than using movepi.
#define INPUT_BIG8 \
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
tb = mm512_xoror( b, d, a ); \
a = _mm512_xor_si512( a, c ); \
b = mm512_xoror( td, tb, a ); \
td = mm512_xorand( a, td, tb ); \
d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
a = c; \
c = mm512_xor3( tb, b, td ); \
d = mm512_not( td ); \
c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
}
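The rewritten SBOX8 folds the trailing NOT into the ternary-logic immediates: 0x87 is not( xorand( a, td, tb ) ) and 0x69 is not( xor3( tb, b, d ) ), as the inline comments note. The immediate is simply the 8-entry truth table of the desired function indexed by (a<<2)|(b<<1)|c; a standalone sketch (not from the source) that derives both constants:

#include <stdio.h>

// Build a vpternlog immediate from a 3-input boolean function.
static unsigned ternlog_imm( unsigned (*f)( unsigned, unsigned, unsigned ) )
{
   unsigned imm = 0;
   for ( unsigned i = 0; i < 8; i++ )
   {
      unsigned a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;
      imm |= ( f( a, b, c ) & 1 ) << i;
   }
   return imm;
}

static unsigned not_xorand( unsigned a, unsigned b, unsigned c )
{  return ~( a ^ ( b & c ) ) & 1;  }      // not( xorand( a, b, c ) )

static unsigned not_xor3( unsigned a, unsigned b, unsigned c )
{  return ~( a ^ b ^ c ) & 1;  }          // not( xor3( a, b, c ) )

int main(void)
{
   printf( "not(xorand): 0x%02X\n", ternlog_imm( not_xorand ) );   // 0x87
   printf( "not(xor3):   0x%02X\n", ternlog_imm( not_xor3 ) );     // 0x69
   return 0;
}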
/*
#define SBOX8( a, b, c, d ) \
do { \
@@ -1155,11 +1153,99 @@ do { \
b = mm256_xoror( td, tb, a ); \
d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
a = c; \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
}
#else
#define INPUT_BIG_sub( db_i ) \
{ \
const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_BIG \
{ \
const __m256i db = *buf; \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
INPUT_BIG_sub( db ); \
}
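INPUT_BIG above is a fully unrolled 64-step message expansion: bit i of the 64-bit input word selects row i of T512 (viewed as 64 rows of eight 64-bit constants), and the selected rows are XORed into m0..m7. The looped form it replaces is kept below under #if 0. A scalar reference of the same computation, written only as an illustration:

#include <stdint.h>

void hamsi_input_big_ref( uint64_t m[8], uint64_t db, const uint64_t T512_64[64*8] )
{
   const uint64_t *tp = T512_64;            // T512 reinterpreted as 64-bit words
   for ( int j = 0; j < 8; j++ ) m[j] = 0;
   for ( int i = 0; i < 64; i++, tp += 8 )  // bit 0 first, matching the macro order
      if ( ( db >> i ) & 1 )
         for ( int j = 0; j < 8; j++ )
            m[j] ^= tp[j];
}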
#if 0
// dependent on the compiler unrolling the loop
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
tp += 8; \
} \
} while (0)
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
@@ -1219,7 +1306,7 @@ do { \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
b = mm256_xor3( b, a, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sc->h[6] = c6; \
sc->h[7] = c7;
#define INPUT_2x64_sub( db_i ) \
{ \
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_2x64 \
{ \
const v128u64_t db = *buf; \
const v128u64_t zero = v128_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
INPUT_2x64_sub( db ); \
}
#if 0
// Dependent on the compiler unrolling the loop.
#define INPUT_2x64 \
{ \
v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
tp += 8; \
} \
}
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{ \
a = v128_rol32( a, 13 ); \
c = v128_rol32( c, 3 ); \
b = v128_xor3( a, b, c ); \
b = v128_xor3( c, a, b ); \
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
b = v128_rol32( b, 1 ); \
d = v128_rol32( d, 7 ); \

View File

@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t byte_len, size_t lim )
{
unsigned eb;
union {
__m512i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
size_t j;
size_t m512_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm512_set1_epi64( t );
tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
keccak64_8way_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
size_t j;
size_t m256_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set1_epi64x( t );
tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
keccak64_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
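Both close routines now build the final padded block in a plain aligned array rather than the old variable-length union member. The padding itself is unchanged Keccak padding; in scalar byte terms (a sketch of the logic only, not the vectorized code) it amounts to:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Pad one lane's rate buffer: end-marker byte right after the message,
// zeros in between, final 0x80 bit in the last byte of the rate.
void keccak_pad_ref( uint8_t *block, size_t ptr, size_t lim, uint8_t eb )
{
   memset( block + ptr, 0, lim - ptr );
   block[ ptr ]     |= eb;      // hard-coded end byte (domain separator)
   block[ lim - 1 ] |= 0x80;    // final padding bit
}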

View File

@@ -47,25 +47,19 @@
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || defined(__SSE2__)
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
}
#else
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
#endif
#if defined(VL256)

View File

@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =
#endif // AVX2 else SSE2
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \

View File

@@ -1,8 +1,6 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#if !defined(__APPLE__)
#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -33,6 +31,7 @@ static inline double exp_n( double xt )
return exp( xt );
}
/*
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
@@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 )
else if ( xt > p6 - 1.e-200 )
return 0.;
}
*/
double swit2_( double wvnmb )
{
@@ -298,15 +298,9 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}
#endif // not apple
bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

View File

@@ -11,7 +11,6 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
@@ -617,9 +616,9 @@ union _hmq1725_4way_context_overlay
cubehashParam cube;
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
simd512_context simd;
shavite512_2way_context shavite2;
simd_2way_context simd;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
@@ -869,41 +868,25 @@ extern void hmq1725_4way_hash(void *state, const void *input)
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
if ( hash1[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
if ( hash2[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
simd512_ctx( &ctx.simd, hash2, hash2, 64 );
if ( hash3[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
simd512_ctx( &ctx.simd, hash3, hash3, 64 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

View File

@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );

View File

@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;
@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
#endif
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;

View File

@@ -319,7 +319,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
for (u = 0; u < 5; u ++)
casti_m128i( dst, u ) = sc->val[u];
casti_v128u32( dst, u ) = sc->val[u];
}
#endif

View File

@@ -46,7 +46,7 @@
#endif
#ifdef __GNUC__
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
#define ASM 0
#else
#define ASM 1

View File

@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
v128_ovly v;
for ( int l = 0; l < 4; l++ )
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
X[i] = v128_xor( X[i], v.m128 );
X[i] = v128_xor( X[i], v.v128 );
}
xor_salsa8_4way( &X[ 0], &X[16] );
@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
y[0].m128 = X0;
y[1].m128 = X1;
y[2].m128 = X2;
y[3].m128 = X3;
y[0].v128 = X0;
y[1].v128 = X1;
y[2].v128 = X2;
y[3].v128 = X3;
z[0].u32[0] = y[0].u32[0];
z[0].u32[3] = y[1].u32[0];
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
z[3].u32[1] = y[2].u32[3];
z[3].u32[0] = y[3].u32[3];
B[0] = v128_add32( B[0], z[0].m128 );
B[1] = v128_add32( B[1], z[1].m128 );
B[2] = v128_add32( B[2], z[2].m128 );
B[3] = v128_add32( B[3], z[3].m128 );
B[0] = v128_add32( B[0], z[0].v128 );
B[1] = v128_add32( B[1], z[1].v128 );
B[2] = v128_add32( B[2], z[2].v128 );
B[3] = v128_add32( B[3], z[3].v128 );
#endif
@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
/*
v128_ovly ya[4], za[4], yb[4], zb[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
za[3].u32[3] = ya[0].u32[3];
zb[3].u32[3] = yb[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
*/
}
@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
yc[0].m128 = XC[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
yc[1].m128 = XC[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
yc[2].m128 = XC[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
yc[3].m128 = XC[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
yc[0].v128 = XC[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
yc[1].v128 = XC[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
yc[2].v128 = XC[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
yc[3].v128 = XC[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
zb[3].u32[3] = yb[0].u32[3];
zc[3].u32[3] = yc[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XC[0] = zc[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XC[1] = zc[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XC[2] = zc[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XC[0] = zc[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XC[1] = zc[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XC[2] = zc[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
XC[3] = zc[3].v128;
*/
}
@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
xf = (B[15] ^= C[15]);
#define ROL32( a, c ) ror32( a, c )
#define ROL32( a, c ) rol32( a, c )
#define ADD32( a, b ) ( (a)+(b) )
#define XOR( a, b ) ( (a)^(b) )

View File

@@ -274,9 +274,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
#endif // SHA
static const uint32_t keypad_4way[ 4*12 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
@@ -447,7 +444,7 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
output[i] = bswap_32( ostate[i] );
}
#ifdef HAVE_SHA256_8WAY
#if defined(__AVX2__)
/*
static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
@@ -590,7 +587,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
output[i] = bswap_32(ostate[i]);
}
#endif /* HAVE_SHA256_8WAY */
#endif //AVX2
#if defined(SIMD512)
@@ -724,25 +721,10 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#endif
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
#endif
#include "scrypt-core-4way.h"
/*
#if ( SCRYPT_THROUGHPUT == 1 )
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{
@@ -752,15 +734,12 @@ static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, scratchbuf, N ); // woring
// scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N);
scrypt_core_1way( X, scratchbuf, N );
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true;
}
*/
#endif
#if ( SCRYPT_THROUGHPUT == 8 )
@@ -1201,20 +1180,6 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
********************/
/*
scrypt_core_3way( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+ 96, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+256, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+352, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
*/
if ( work_restart[thrid].restart ) return 0;
@@ -1321,8 +1286,7 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1;
}
#else
// SSE2
#elif defined(__SSE2__) || defined(__ARM_NEON)
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thrid )
@@ -1481,7 +1445,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
@@ -1491,31 +1455,31 @@ bool register_scrypt_algo( algo_gate_t* gate )
opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO,"Scrypt parameters: N= %d, R= 1", opt_param_n );
// scrypt_throughput defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(SIMD512)
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
// scrypt_throughput = 2;
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
#elif defined(__AVX2__)
// scrypt_throughput = 8;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else
// scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#endif
switch ( SCRYPT_THROUGHPUT )
{
case 16: // AVX512
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
break;
case 2: // SHA256
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
break;
case 8: // AVX2
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
break;
case 4: // SSE2, NEON
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
default:
scratchbuf_size = opt_param_n; // 1 way
}
char t_units[4] = {0};
char d_units[4] = {0};
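The scratch buffer is now sized by a switch on SCRYPT_THROUGHPUT using the same bytes = N * (ways or bufs) * 128 formulas. A worked example with illustrative values:

#include <stdio.h>

int main(void)
{
   const int N_default = 1024;     // default N
   const int N_large   = 0x8000;   // > 0x4000, selects the multi-buffer path

   printf( "4-way, N=1024:   %d KiB\n", N_default * 4 * 128 / 1024 );        // 512 KiB
   printf( "2-buf, N=0x8000: %d MiB\n", N_large * 2 * 128 / (1024*1024) );   // 8 MiB
   return 0;
}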

View File

@@ -74,8 +74,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
memset( pad, 0x36, 64*4 );
for ( i = 0; i < Klen; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->ictx, pad, 64 );
@@ -83,8 +83,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
sha256_4way_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
}
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_m128i( T, k ) = _mm_xor_si128( casti_m128i( T, k ),
casti_m128i( U, k ) );
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
/* Copy as many bytes as necessary into buf. */
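The 4-way HMAC init XORs the key into the 0x36 inner pad and the 0x5c outer pad four lanes at a time. The scalar HMAC-SHA256 pad setup it vectorizes (standard HMAC; hashing of the pads omitted here) looks like:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Build the inner and outer pads for a key of at most one block (64 bytes).
void hmac_sha256_pads( uint8_t ipad[64], uint8_t opad[64],
                       const uint8_t *key, size_t klen )
{
   memset( ipad, 0x36, 64 );
   memset( opad, 0x5c, 64 );
   for ( size_t i = 0; i < klen; i++ )
   {
      ipad[i] ^= key[i];    // fed to the inner SHA-256 before the message
      opad[i] ^= key[i];    // fed to the outer SHA-256 before the inner digest
   }
}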

View File

@@ -569,8 +569,8 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
__m128i STATE0, STATE1, MSG, TMP;
// Load initial values
TMP = casti_m128i( istate, 0 );
STATE1 = casti_m128i( istate, 1 );
TMP = casti_v128u32( istate, 0 );
STATE1 = casti_v128u32( istate, 1 );
TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH
@@ -578,17 +578,17 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH
// Save current hash
casti_m128i( sstate, 0 ) = STATE0;
casti_m128i( sstate, 1 ) = STATE1;
casti_v128u32( sstate, 0 ) = STATE0;
casti_v128u32( sstate, 1 ) = STATE1;
// Rounds 0 to 3
MSG = casti_m128i( msg, 0 );
MSG = casti_v128u32( msg, 0 );
TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
MSG = _mm_add_epi32( MSG, TMP );
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
MSG = _mm_shuffle_epi32( MSG, 0x0E );
casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_m128i( ostate, 1 ) = STATE1;
casti_v128u32( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_v128u32( ostate, 1 ) = STATE1;
}
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
@@ -601,22 +601,22 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
STATE0_X = casti_m128i( state_mid_X, 0 );
STATE1_X = casti_m128i( state_mid_X, 1 );
STATE0_Y = casti_m128i( state_mid_Y, 0 );
STATE1_Y = casti_m128i( state_mid_Y, 1 );
STATE0_X = casti_v128u32( state_mid_X, 0 );
STATE1_X = casti_v128u32( state_mid_X, 1 );
STATE0_Y = casti_v128u32( state_mid_Y, 0 );
STATE1_Y = casti_v128u32( state_mid_Y, 1 );
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMSG0_X = casti_v128u32( msg_X, 0 );
TMSG0_Y = casti_v128u32( msg_Y, 0 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
// Rounds 4 to 7
TMSG1_X = casti_m128i( msg_X, 1 );
TMSG1_Y = casti_m128i( msg_Y, 1 );
TMSG1_X = casti_v128u32( msg_X, 1 );
TMSG1_Y = casti_v128u32( msg_Y, 1 );
TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
@@ -638,8 +638,8 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );
// Rounds 12 to 15
TMSG3_X = casti_m128i( msg_X, 3 );
TMSG3_Y = casti_m128i( msg_Y, 3 );
TMSG3_X = casti_v128u32( msg_X, 3 );
TMSG3_Y = casti_v128u32( msg_Y, 3 );
TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
@@ -867,20 +867,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
// Add saved state to new state
STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) );
STATE0_X = _mm_add_epi32( STATE0_X, casti_v128u32( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_v128u32( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_v128u32( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_v128u32( state_save_Y, 1 ) );
// Unshuffle & save state
TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA
TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG
STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
casti_v128u32( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_v128u32( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_v128u32( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_v128u32( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
}
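// Reviewer note (sketch, not from the source): the _X/_Y pairs above are two
// independent SHA-256 lanes processed in lock-step; interleaving the two
// sha256rnds2 dependency chains can help hide instruction latency on cores
// with a single SHA execution unit.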
#endif // SHA

View File

@@ -8,14 +8,14 @@ void sha256d( void *hash, const void *data, int len )
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;

View File

@@ -500,10 +500,10 @@ bool register_sha256dt_algo( algo_gate_t* gate )
#if defined(SHA256DT_16X32)
gate->scanhash = (void*)&scanhash_sha256dt_16x32;
#elif defined(SHA256DT_X86_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_x86_x2sha;
#elif defined(SHA256DT_NEON_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_neon_x2sha;
#elif defined(SHA256DT_8X32)
gate->scanhash = (void*)&scanhash_sha256dt_8x32;

View File

@@ -6,9 +6,10 @@ bool register_sha256t_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256t_16way;
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_NEON_SHA2)
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_neon_sha2;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
@@ -28,7 +29,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
//#elif defined(SHA256T_SHA)
// gate->optimizations = SHA_OPT;
// gate->optimizations = SHA256_OPT;
// gate->scanhash = (void*)&scanhash_sha256q;
// gate->hash = (void*)&sha256q_hash;
#elif defined(SHA256T_8WAY)

View File

@@ -71,12 +71,13 @@ static const uint64_t K512[80] =
// SHA-512 implemented using SHA512 CPU extension.
// Experimental. Not tested. Not reviewed. Compile tested only.
// Experimental. Not supported. Not tested. Not reviewed. Compile tested only.
// Modelled after noloader sha256 implementation, replacing 4x32 bit
// instructions with equivalent 4x64 bit instructions and increasing rounds
// to 80.
// Needs GCC-14 for compilation.
// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution.
// Modelled after noloader sha256 implementation.
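// Hedged build-gate sketch (added note; the feature macro is an assumption):
// GCC-14 with -msha512 is assumed to define __SHA512__, so the experimental
// code below could be compiled out entirely on older toolchains and the
// generic sha512 transform used instead.
#if defined(__SHA512__) && defined(__AVX2__)
  #define SHA512_X86_HW 1
#else
  #define SHA512_X86_HW 0
#endif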
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
@@ -571,6 +572,20 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
#endif
/*
#if defined(__ARM_FEATURE_NEON) && defined(__ARM_FEATURE_SHA512)
uint64x2_t sha512_compile_test( uint64x2_t test )
{
test = vsha512hq_u64( test, test, test );
test = vsha512h2q_u64( test, test, test );
test = vsha512su0q_u64( test, test );
test = vsha512su1q_u64( test, test, test );
return test;
}
#endif
*/
#if defined(SIMD512)

View File

@@ -300,11 +300,12 @@ static inline __m512i v512_mult_x5( const __m512i x )
#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
v512_mult_x3( mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
xa0 = mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ); \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
} while (0)
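// For reference, a scalar sketch of the same permutation element, modelled on
// the sph_shabal reference code (rol32 is a hypothetical rotate-left helper).
// Both macro variants above compute this; the new form merely splits the xa0
// update into sequential steps before folding in xm, xb1 and the and-not term:
//    xa0 = ((xa0 ^ xc ^ (rol32( xa1, 15 ) * 5)) * 3)
//          ^ xb1 ^ (xb2 & ~xb3) ^ xm;
//    xb0 = ~(rol32( xb0, 1 ) ^ xa0);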
#define PERM_STEP_0_16 do { \
@@ -905,11 +906,12 @@ static inline __m256i v256_mult_x5( const __m256i x )
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
v256_mult_x3( mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
xa0 = mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ); \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_8 do { \

View File

@@ -1,472 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "nist.h"
#include "simd_iv.h"
/* #define NO_PRECOMPUTED_IV */
#if defined(__SSE2__) // || defined(__ARM_NEON)
/*
* Increase the counter.
*/
void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
#ifdef HAS_64
state->count += databitlen;
#else
uint32_t old_count = state->count_low;
state->count_low += databitlen;
if (state->count_low < old_count)
state->count_high++;
#endif
}
/*
* Initialize the hashState_sd with a given IV.
* If the IV is NULL, initialize with zeros.
*/
int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {
int n = 8;
state->hashbitlen = hashbitlen;
state->n_feistels = n;
state->blocksize = 128*8;
#ifdef HAS_64
state->count = 0;
#else
state->count_low = 0;
state->count_high = 0;
#endif
// state->buffer = malloc(16*n + 16);
/*
* Align the buffer to a 128 bit boundary.
*/
// state->buffer += ((unsigned char*)NULL - state->buffer)&15;
// state->A = malloc((4*n+4)*sizeof(u32));
/*
* Align the buffer to a 128 bit boundary.
*/
// state->A += ((u32*)NULL - state->A)&3;
state->B = state->A+n;
state->C = state->B+n;
state->D = state->C+n;
if (IV)
memcpy(state->A, IV, 4*n*sizeof(u32));
else
memset(state->A, 0, 4*n*sizeof(u32));
// free(state->buffer);
// free(state->A);
return 0;
}
/*
* Initialize the hashState_sd.
*/
int init_sd(hashState_sd *state, int hashbitlen) {
int r;
char *init;
#ifndef NO_PRECOMPUTED_IV
// if (hashbitlen == 224)
// r=InitIV(state, hashbitlen, IV_224);
// else if (hashbitlen == 256)
// r=InitIV(state, hashbitlen, IV_256);
// else if (hashbitlen == 384)
// r=InitIV(state, hashbitlen, IV_384);
// else
if (hashbitlen == 512)
r = InitIV(state, hashbitlen, IV_512);
else
#endif
{
/*
* Nonstandard length: IV is not precomputed.
*/
r=InitIV(state, hashbitlen, NULL);
if (r != 0)
return r;
init = malloc(state->blocksize);
memset(init, 0, state->blocksize);
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
#else
sprintf(init, "SIMD-%i v1.1", hashbitlen);
#endif
SIMD_Compress(state, (unsigned char*) init, 0);
free(init);
}
return r;
}
int update_sd( hashState_sd *state, const BitSequence *data,
DataLength databitlen )
{
unsigned current;
unsigned int bs = state->blocksize;
static int align = -1;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
return 0;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
return 0;
}
int final_sd( hashState_sd *state, BitSequence *hashval )
{
#ifdef HAS_64
uint64_t l;
int current = state->count & (state->blocksize - 1);
#else
uint32_t l;
int current = state->count_low & (state->blocksize - 1);
#endif
unsigned int i;
BitSequence bs[64];
int isshort = 1;
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
#ifdef HAS_64
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
#else
l = state->count_low;
for ( i=0; i<4; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
l = state->count_high;
for ( i=0; i<4; i++ )
{
state->buffer[4+i] = l & 0xff;
l >>= 8;
}
if ( state->count_high == 0 && state->count_low < 16384 )
isshort = 2;
#endif
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
bs[4*i ] = x&0xff;
x >>= 8;
bs[4*i+1] = x&0xff;
x >>= 8;
bs[4*i+2] = x&0xff;
x >>= 8;
bs[4*i+3] = x&0xff;
}
memcpy( hashval, bs, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = bs[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
InitIV( state, 512, IV_512 );
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
#endif

View File

@@ -1,64 +0,0 @@
#ifndef __NIST_H__
#define __NIST_H__
/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN(x) x __attribute__((aligned(16)))
#else
#define DATA_ALIGN(x) __declspec(align(16)) x
#endif
#include "simd-compat.h"
#include "compat/sha3-defs.h"
/*
* NIST API Specific types.
*/
typedef struct {
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
#ifdef HAS_64
uint64_t count;
#else
uint32_t count_low;
uint32_t count_high;
#endif
DATA_ALIGN(uint32_t A[32]);
uint32_t *B;
uint32_t *C;
uint32_t *D;
DATA_ALIGN(unsigned char buffer[128]);
} hashState_sd;
/*
* NIST API
*/
int init_sd(hashState_sd *state, int hashbitlen);
int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);
int final_sd(hashState_sd *state, BitSequence *hashval);
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
/*
* Internal API
*/
//int SupportedLength(int hashbitlen);
int RequiredAlignment(void);
void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);
void fft128_natural(fft_t *a, unsigned char *x);
void fft256_natural(fft_t *a, unsigned char *x);
#endif

View File

@@ -1,198 +0,0 @@
#ifndef __SIMD_COMPAT_H__
#define __SIMD_COMPAT_H__
#include <limits.h>
/*
* This file defines some helper functions for cross-platform compatibility.
*/
#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
#define GNU_EXT
#endif
/*
* First define some integer types.
*/
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
/*
* On C99 implementations, we can use <stdint.h> to get an exact 32-bit
* type, if any, or otherwise use a wider type.
*/
#include <stdint.h>
#include "compat/brg_types.h"
#define C32(x) ((u32)(x))
#define HAS_64 1
#else
/*
* On non-C99 systems, we use "unsigned int" if it is wide enough,
* "unsigned long" otherwise. This supports all "reasonable" architectures.
* We have to be cautious: pre-C99 preprocessors handle constants
* differently in '#if' expressions. Hence the shifts to test UINT_MAX.
*/
#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
typedef unsigned int u32;
#define C32(x) ((u32)(x ## U))
#else
typedef unsigned long u32;
#define C32(x) ((u32)(x ## UL))
#endif
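/*
 * Worked example of the test above (added note): ((UINT_MAX >> 11) >> 11)
 * >= 0x3FF holds exactly when UINT_MAX >= 0xFFC00000, i.e. when unsigned int
 * has at least 32 value bits.  With a 32-bit int, 0xFFFFFFFF >> 22 = 0x3FF
 * and the test passes; with a 16-bit int, 0xFFFF >> 11 >> 11 = 0 and it
 * fails.  Splitting the shift into two steps of 11 presumably keeps each
 * shift count small enough for preprocessors that evaluate in 16-bit widths.
 */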
/*
* We want a 64-bit type. We use "unsigned long" if it is wide enough (as
* is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
* "unsigned long long" otherwise, if available. We use ULLONG_MAX to
* test whether "unsigned long long" is available; we also know that
* gcc features this type, even if the libc headers do not know it.
*/
#if ((ULONG_MAX >> 31) >> 31) >= 3
typedef unsigned long u64;
#define HAS_64 1
#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
typedef unsigned long long u64;
#define HAS_64 1
#else
/*
* No 64-bit type...
*/
#endif
#endif
/*
* fft_t should be at least 16 bits wide.
* using short int will require less memory, but int is faster...
*/
typedef int fft_t;
/*
* Implementation note: some processors have specific opcodes to perform
* a rotation. Recent versions of gcc recognize the rotation expression below
* and use the relevant opcodes, when appropriate.
*/
#define T32(x) ((x) & C32(0xFFFFFFFF))
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) ROTL32(x, (32 - (n)))
/*
* The macro MAYBE_INLINE expands to an inline qualifier, if available.
*/
#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
#define MAYBE_INLINE static inline
#elif defined _MSC_VER
#define MAYBE_INLINE __inline
#else
#define MAYBE_INLINE
#endif
/* */
#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )
#define rdtsc() \
({ \
u32 lo, hi; \
__asm__ __volatile__ ( /* serialize */ \
"xorl %%eax,%%eax \n cpuid" \
::: "%rax", "%rbx", "%rcx", "%rdx"); \
/* We cannot use "=A", since this would use %rax on x86_64 */ \
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); \
(u64)hi << 32 | lo; \
}) \
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
#define rdtsc __rdtsc
#endif
/*
* The IS_ALIGNED macro tests if a char* pointer is aligned to an
* n-byte boundary.
* It is defined as false on unknown architectures.
*/
#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)
#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
/*
* Unaligned 32-bit accesses are not expensive on x86, so we don't care
*/
#define IS_ALIGNED(p,n) (n<=4 || CHECK_ALIGNED(p,n))
#elif defined __sparcv9 || defined __sparc || defined __arm || \
defined __ia64 || defined __ia64__ || \
defined __itanium__ || defined __M_IA64 || \
defined __powerpc__ || defined __powerpc
#define IS_ALIGNED(p,n) CHECK_ALIGNED(p,n)
#else
/*
* Unknown architecture: play safe
*/
#define IS_ALIGNED(p,n) 0
#endif
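/*
 * Usage sketch (mirrors update_sd in nist.c): hash straight from the caller's
 * buffer only when it meets the 16-byte requirement reported by
 * RequiredAlignment(); otherwise stage the data through state->buffer.
 *
 *    if ( IS_ALIGNED( data, align ) && current == 0 && databitlen >= bs )
 *        SIMD_Compress( state, data, 0 );
 */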
/* checks for endianness */
#if defined (__linux__) || defined (__GLIBC__)
# include <endian.h>
#elif defined (__FreeBSD__)
# include <machine/endian.h>
#elif defined (__OpenBSD__)
# include <sys/endian.h>
#endif
#ifdef __BYTE_ORDER
# if __BYTE_ORDER == __LITTLE_ENDIAN
# define SIMD_LITTLE_ENDIAN
# elif __BYTE_ORDER == __BIG_ENDIAN
# define SIMD_BIG_ENDIAN
# endif
#else
# if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
# define SIMD_LITTLE_ENDIAN
# endif
#endif
#endif

View File

@@ -231,7 +231,7 @@ static void FFT64( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// static const int w[] = {0, 2, 4, 6};
#define BUTTERFLY_0( i,j ) \
do { \
@@ -240,25 +240,25 @@ do { \
X(i) = v128_sub16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
v128u16_t v = X(j); \
X(j) = v128_add16( X(i), X(j) ); \
X(i) = v128_sl16( v128_sub16( X(i), v ), w[n] ); \
X(i) = v128_sl16( v128_sub16( X(i), v ), w_n ); \
} while(0)
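// Mapping note (added for clarity): the old BUTTERFLY_N took an index n into
// w[] = {0, 2, 4, 6} and shifted by w[n]; the new form passes the shift count
// w_n directly, which is why the call sites change from (1,5,1), (2,6,2),
// (3,7,3) to (1,5,2), (2,6,4), (3,7,6), and from (1,3,2), (5,7,2) to
// (1,3,4), (5,7,4).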
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
@@ -329,10 +329,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
v128u16_t u = X(j); \
X(i) = v128_sl16( X(i), w[n] ); \
X(i) = v128_sl16( X(i), w_n ); \
X(j) = v128_sub16( X(j), X(i) ); \
X(i) = v128_add16( u, X(i) ); \
} while(0)
@@ -353,15 +353,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
@@ -853,7 +853,7 @@ static void fft64_2way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
@@ -864,25 +864,25 @@ do { \
X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m256i v = X(j); \
X(j) = _mm256_add_epi16( X(i), X(j) ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w_n ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
@@ -953,10 +953,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m256i u = X(j); \
X(i) = _mm256_slli_epi16( X(i), w[n] ); \
X(i) = _mm256_slli_epi16( X(i), w_n ); \
X(j) = _mm256_sub_epi16( X(j), X(i) ); \
X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)
@@ -977,15 +977,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S( 0 );
DO_REDUCE_FULL_S( 1 );
@@ -1709,11 +1709,11 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i, j, w ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w_n ); \
} while(0)
BUTTERFLY_0( 0, 4 );
@@ -1792,10 +1792,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i, j, w ) \
#define BUTTERFLY_N( i, j, w_n ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(i) = _mm512_slli_epi16( X(i), w_n ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)

View File

@@ -1,7 +1,6 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#include "simd-utils.h"
#if defined(__SSE2__) || defined (__ARM_NEON)
@@ -34,7 +33,7 @@ typedef struct
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_2way_context __attribute__((aligned(128)));
} simd512_2way_context __attribute__((aligned(64)));
#define simd_2way_context simd512_2way_context
// databitlen is in bits

View File

@@ -1,948 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include "nist.h"
#include "vector.h"
//#if defined(__SSE2__) || defined(__ARM_NEON)
#if defined(__SSE2__)
#define PRINT_SOME 0
/*
int SupportedLength(int hashbitlen) {
if (hashbitlen <= 0 || hashbitlen > 512)
return 0;
else
return 1;
}
*/
int RequiredAlignment(void) {
return 16;
}
static const union cv V128 = CV(128);
static const union cv V255 = CV(255);
static const union cv V257 = CV(257);
static const union cv8 V0 = CV(0);
/*
* Reduce modulo 257; result is in [-127; 383]
* REDUCE(x) := (x&255) - (x>>8)
*/
#define REDUCE(x) \
v16_sub(v16_and(x, V255.v16), v16_shift_r (x, 8))
/*
* Reduce from [-127; 383] to [-128; 128]
* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
*/
#define EXTRA_REDUCE_S(x) \
v16_sub(x, v16_and(V257.v16, v16_cmp(x, V128.v16)))
/*
* Reduce modulo 257; result is in [-128; 128]
*/
#define REDUCE_FULL_S(x) \
EXTRA_REDUCE_S(REDUCE(x))
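/*
 * Scalar sketch of the identity behind REDUCE (added note, not used by the
 * vector code): with an arithmetic shift, as v16_shift_r provides,
 * x = 256*(x>>8) + (x&255).  Since 256 == -1 (mod 257), it follows that
 * x == (x&255) - (x>>8) (mod 257), and for a signed 16-bit x the result
 * lies in [-127; 383].
 */
static inline int reduce257_scalar( int16_t x )
{
   return ( x & 255 ) - ( x >> 8 );
}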
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) \
do { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
#define MAYBE_VOLATILE
MAYBE_INLINE void fft64(void *a) {
v16* const A = a;
register v16 X0, X1, X2, X3, X4, X5, X6, X7;
/*
#if V16_SIZE == 8
#define X(i) A[i]
#elif V16_SIZE == 4
#define X(i) A[2*i]
#endif
*/
#define X(i) X##i
X0 = A[0];
X1 = A[1];
X2 = A[2];
X3 = A[3];
X4 = A[4];
X5 = A[5];
X6 = A[6];
X7 = A[7];
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
/*
* Begin with 8 parallel DIF FFT_8 transforms
*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
static const int w[] = {0, 2, 4, 6};
// v16 *Twiddle = (v16*)FFT64_Twiddle;
#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 v = X(j); \
X(j) = v16_add(X(i), X(j)); \
if (n) \
X(i) = v16_shift_l(v16_sub(X(i), v), w[n]); \
else \
X(i) = v16_sub(X(i), v); \
} while(0)
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);
DO_REDUCE(2);
DO_REDUCE(3);
BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);
DO_REDUCE(1);
BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);
/* We don't need to reduce X(7) */
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
#undef BUTTERFLY
/*
* Multiply by twiddle factors
*/
X(6) = v16_mul(X(6), FFT64_Twiddle[0].v16);
X(5) = v16_mul(X(5), FFT64_Twiddle[1].v16);
X(4) = v16_mul(X(4), FFT64_Twiddle[2].v16);
X(3) = v16_mul(X(3), FFT64_Twiddle[3].v16);
X(2) = v16_mul(X(2), FFT64_Twiddle[4].v16);
X(1) = v16_mul(X(1), FFT64_Twiddle[5].v16);
X(0) = v16_mul(X(0), FFT64_Twiddle[6].v16);
/*
* Transpose the FFT state with a revbin order permutation
* on the rows and the column.
* This will make the full FFT_64 in order.
*/
#define INTERLEAVE(i,j) \
do { \
v16 t1= X(i); \
v16 t2= X(j); \
X(i) = v16_interleavel(t1, t2); \
X(j) = v16_interleaveh(t1, t2); \
} while(0)
INTERLEAVE(1, 0);
INTERLEAVE(3, 2);
INTERLEAVE(5, 4);
INTERLEAVE(7, 6);
INTERLEAVE(2, 0);
INTERLEAVE(3, 1);
INTERLEAVE(6, 4);
INTERLEAVE(7, 5);
INTERLEAVE(4, 0);
INTERLEAVE(5, 1);
INTERLEAVE(6, 2);
INTERLEAVE(7, 3);
#undef INTERLEAVE
/*
* Finish with 8 parallel DIT FFT_8 transforms
*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in time (DIT) radix-2 NTT.
* Input data is in revbin_permuted order.
*/
#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 u = X(j); \
if (n) \
X(i) = v16_shift_l(X(i), w[n]); \
X(j) = v16_sub(X(j), X(i)); \
X(i) = v16_add(u, X(i)); \
} while(0)
DO_REDUCE(0);
DO_REDUCE(1);
DO_REDUCE(2);
DO_REDUCE(3);
DO_REDUCE(4);
DO_REDUCE(5);
DO_REDUCE(6);
DO_REDUCE(7);
BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);
BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);
DO_REDUCE(3);
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
DO_REDUCE_FULL_S(7);
#undef BUTTERFLY
A[0] = X0;
A[1] = X1;
A[2] = X2;
A[3] = X3;
A[4] = X4;
A[5] = X5;
A[6] = X6;
A[7] = X7;
#undef X
}
MAYBE_INLINE void fft128(void *a) {
int i;
// Temp space to help for interleaving in the end
v16 B[8];
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT128_Twiddle;
/* Size-2 butterflies */
for (i = 0; i<8; i++) {
B[i] = v16_add(A[i], A[i+8]);
B[i] = REDUCE_FULL_S(B[i]);
A[i+8] = v16_sub(A[i], A[i+8]);
A[i+8] = REDUCE_FULL_S(A[i+8]);
A[i+8] = v16_mul(A[i+8], FFT128_Twiddle[i].v16);
A[i+8] = REDUCE_FULL_S(A[i+8]);
}
fft64(B);
fft64(A+8);
/* Transpose (i.e. interleave) */
for (i=0; i<8; i++) {
A[2*i] = v16_interleavel (B[i], A[i+8]);
A[2*i+1] = v16_interleaveh (B[i], A[i+8]);
}
}
#ifdef v16_broadcast
/* Compute the FFT using a table
* The function works if the value of the message is smaller
* than 2^14.
*/
void fft128_msg_final(short *a, const unsigned char *x) {
static const union cv FFT128_Final_Table[] = {
{{ 1, -211, 60, -67, 2, 92, -137, 123}},
{{ 2, 118, 45, 111, 97, -46, 49, -106}},
{{ 4, -73, -17, -11, 8, 111, -34, -22}},
{{ -68, -4, 76, -25, 96, -96, -68, -9}},
{{ 16, -35, -68, -44, 32, -70, -136, -88}},
{{ 0, -124, 17, 12, -6, 57, 47, -8}},
{{ 64, 117, -15, 81, 128, -23, -30, -95}},
{{ -68, -53, -52, -70, -10, -117, 77, 21}},
{{ -1, -46, -60, 67, -2, -92, -120, -123}},
{{ -2, -118, -45, -111, -97, 46, -49, 106}},
{{ -4, 73, 17, 11, -8, -111, 34, 22}},
{{ 68, 4, -76, 25, -96, 96, 68, 9}},
{{ -16, -222, 68, 44, -32, 70, -121, 88}},
{{ 0, 124, -17, -12, 6, -57, -47, 8}},
{{ -64, -117, 15, -81, -128, -234, 30, 95}},
{{ 68, 53, 52, 70, 10, 117, -77, -21}},
{{-118, -31, 116, -61, 21, -62, -25, -122}},
{{-101, 107, -45, -95, -8, 3, 101, -34}},
{{ 42, -124, -50, 13, 84, 9, -100, -231}},
{{ -79, -53, 82, 65, -81, 47, 61, 107}},
{{ -89, -239, 57, -205, -178, 36, -143, 104}},
{{-126, 113, 33, 111, 103, -109, 65, -114}},
{{ -99, 72, -29, -49, -198, -113, -58, -98}},
{{ 8, -27, -106, -30, 111, 6, 10, -108}},
{{-139, 31, -116, -196, -21, 62, 25, -135}},
{{ 101, -107, 45, 95, 8, -3, -101, 34}},
{{ -42, -133, 50, -13, -84, -9, 100, -26}},
{{ 79, 53, -82, -65, 81, -47, -61, -107}},
{{-168, -18, -57, -52, -79, -36, -114, -104}},
{{ 126, -113, -33, -111, -103, 109, -65, 114}},
{{ 99, -72, -228, 49, -59, 113, 58, -159}},
{{ -8, 27, 106, 30, -111, -6, -10, 108}}
};
// v16 *Table = (v16*)FFT128_Final_Table;
v16 *A = (v16*) a;
v16 msg1 = v16_broadcast(x[0]>128?x[0]-257:x[0]);
v16 msg2 = v16_broadcast(x[1]>128?x[1]-257:x[1]);
// v16 msg2 = v16_broadcast(x[1]);
#if 0
int i;
for (i=0; i<16; i++) {
v16 tmp = v16_mul(FFT128_Final_Table[2*i].v16 , msg2);
v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);
sum = v16_add(sum, tmp);
A[i] = REDUCE_FULL_S(sum);
}
#else
#define FFT_FINAL(i) \
v16 tmp##i = v16_mul(FFT128_Final_Table[2*i].v16, msg2); \
v16 sum##i = v16_add(FFT128_Final_Table[2*i+1].v16, msg1); \
sum##i = v16_add(sum##i, tmp##i); \
A[i] = REDUCE_FULL_S(sum##i);
FFT_FINAL(0)
FFT_FINAL(1)
FFT_FINAL(2)
FFT_FINAL(3)
FFT_FINAL(4)
FFT_FINAL(5)
FFT_FINAL(6)
FFT_FINAL(7)
FFT_FINAL(8)
FFT_FINAL(9)
FFT_FINAL(10)
FFT_FINAL(11)
FFT_FINAL(12)
FFT_FINAL(13)
FFT_FINAL(14)
FFT_FINAL(15)
#endif
}
#endif
void fft128_msg(short *a, const unsigned char *x, int final) {
static const union cv Tweak =
{{0,0,0,0,0,0,0,1}};
static const union cv FinalTweak =
{{0,0,0,0,0,1,0,1}};
v8 *X = (v8*) x;
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT128_Twiddle;
#define UNPACK(i) \
do { \
v8 t = X[i]; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
A[2*i+8] = REDUCE(A[2*i+8]); \
A[2*i+1] = v8_mergeh(t, V0.v8); \
A[2*i+9] = v16_mul(A[2*i+1], FFT128_Twiddle[2*i+1].v16); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
/*
* This allows tweaking the last butterflies to introduce X^127
*/
#define UNPACK_TWEAK(i,tw) \
do { \
v8 t = X[i]; \
v16 tmp; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
A[2*i+8] = REDUCE(A[2*i+8]); \
tmp = v8_mergeh(t, V0.v8); \
A[2*i+1] = v16_add(tmp, tw); \
A[2*i+9] = v16_mul(v16_sub(tmp, tw), FFT128_Twiddle[2*i+1].v16); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
UNPACK(0);
UNPACK(1);
UNPACK(2);
if (final)
UNPACK_TWEAK(3, FinalTweak.v16);
else
UNPACK_TWEAK(3, Tweak.v16);
#undef UNPACK
#undef UNPACK_TWEAK
fft64(a);
fft64(a+64);
}
#if 0
void fft128_msg(short *a, const unsigned char *x, int final) {
for (int i=0; i<64; i++)
a[i] = x[i];
for (int i=64; i<128; i++)
a[i] = 0;
a[127] = 1;
a[125] = final? 1: 0;
fft128(a);
}
#endif
void fft256_msg(short *a, const unsigned char *x, int final) {
static const union cv Tweak =
{{0,0,0,0,0,0,0,1}};
static const union cv FinalTweak =
{{0,0,0,0,0,1,0,1}};
v8 *X = (v8*) x;
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT256_Twiddle;
#define UNPACK(i) \
do { \
v8 t = X[i]; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
A[2*i+16] = REDUCE(A[2*i+16]); \
A[2*i+1] = v8_mergeh(t, V0.v8); \
A[2*i+17] = v16_mul(A[2*i+1], FFT256_Twiddle[2*i+1].v16); \
A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)
/*
* This allows tweaking the last butterflies to introduce X^127
*/
#define UNPACK_TWEAK(i,tw) \
do { \
v8 t = X[i]; \
v16 tmp; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
A[2*i+16] = REDUCE(A[2*i+16]); \
tmp = v8_mergeh(t, V0.v8); \
A[2*i+1] = v16_add(tmp, tw); \
A[2*i+17] = v16_mul(v16_sub(tmp, tw), FFT256_Twiddle[2*i+1].v16); \
A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)
UNPACK(0);
UNPACK(1);
UNPACK(2);
UNPACK(3);
UNPACK(4);
UNPACK(5);
UNPACK(6);
if (final)
UNPACK_TWEAK(7, FinalTweak.v16);
else
UNPACK_TWEAK(7, Tweak.v16);
#undef UNPACK
#undef UNPACK_TWEAK
fft128(a);
fft128(a+128);
}
void rounds(u32* state, const unsigned char* msg, short* fft) {
v32* S = (v32*) state;
const v32* M = (v32*)msg;
volatile v16* W = (v16*)fft;
register v32 S0, S1, S2, S3;
static const union cv code[] = { CV(185), CV(233) };
S0 = v32_xor(S[0], v32_bswap(M[0]));
S1 = v32_xor(S[1], v32_bswap(M[1]));
S2 = v32_xor(S[2], v32_bswap(M[2]));
S3 = v32_xor(S[3], v32_bswap(M[3]));
#define S(i) S##i
/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */
#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
#define F(a,b,c,fun) F_##fun (a,b,c)
/*
* We split the round function in two halves
* so as to insert some independent computations in between
*/
#define SUM3_00 1
#define SUM3_01 2
#define SUM3_02 3
#define SUM3_10 2
#define SUM3_11 3
#define SUM3_12 1
#define SUM3_20 3
#define SUM3_21 1
#define SUM3_22 2
#define STEP_1(a,b,c,d,w,fun,r,s,z) \
do { \
if (PRINT_SOME) { \
int j; \
v32 ww=w, aa=a, bb=b, cc=c, dd=d; \
u32 *WW = (void*)&ww; \
u32 *AA = (void*)&aa; \
u32 *BB = (void*)&bb; \
u32 *CC = (void*)&cc; \
u32 *DD = (void*)&dd; \
for (j=0; j<4; j++) { \
printf ("%08x/%2i/%2i[%i]: %08x %08x %08x %08x\n", \
WW[j], r, s, SUM3_##z, \
AA[j], BB[j], CC[j], DD[j]); \
} \
} \
TT = F(a,b,c,fun); \
a = v32_rotate(a,r); \
w = v32_add(w, d); \
TT = v32_add(TT, w); \
TT = v32_rotate(TT,s); \
d = v32_shufxor(a,SUM3_##z); \
} while(0)
#define STEP_2(a,b,c,d,w,fun,r,s) \
do { \
d = v32_add(d, TT); \
} while(0)
#define STEP(a,b,c,d,w,fun,r,s,z) \
do { \
register v32 TT; \
STEP_1(a,b,c,d,w,fun,r,s,z); \
STEP_2(a,b,c,d,w,fun,r,s); \
} while(0);
#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
fun,r,s,t,u,z,r0) \
do { \
register v32 W0, W1, W2, W3, TT; \
W0 = v16_merge##u0(W[h0], W[l0]); \
W0 = V1632(v16_mul(V3216(W0), code[z].v16)); \
STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, r0##0); \
W1 = v16_merge##u1(W[h1], W[l1]); \
W1 = V1632(v16_mul(V3216(W1), code[z].v16)); \
STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, r0##1); \
W2 = v16_merge##u2(W[h2], W[l2]); \
W2 = V1632(v16_mul(V3216(W2), code[z].v16)); \
STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, r0##2); \
W3 = v16_merge##u3(W[h3], W[l3]); \
W3 = V1632(v16_mul(V3216(W3), code[z].v16)); \
STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, r0##0); \
STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)
/*
* 4 rounds with code 185
*/
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 1);
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 2);
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 0);
/*
* 4 rounds with code 233
*/
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 1);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 2);
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 0);
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 1);
/*
* 1 round as feed-forward
*/
STEP(S(0), S(1), S(2), S(3), S[0], 0, 4, 13, 20);
STEP(S(3), S(0), S(1), S(2), S[1], 0, 13, 10, 21);
STEP(S(2), S(3), S(0), S(1), S[2], 0, 10, 25, 22);
STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20);
S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void rounds512(u32* state, const unsigned char* msg, short* fft) {
v32* S = (v32*) state;
v32* M = (v32*) msg;
v16* W = (v16*) fft;
register v32 S0l, S1l, S2l, S3l;
register v32 S0h, S1h, S2h, S3h;
static const union cv code[] = { CV(185), CV(233) };
S0l = v32_xor(S[0], v32_bswap(M[0]));
S0h = v32_xor(S[1], v32_bswap(M[1]));
S1l = v32_xor(S[2], v32_bswap(M[2]));
S1h = v32_xor(S[3], v32_bswap(M[3]));
S2l = v32_xor(S[4], v32_bswap(M[4]));
S2h = v32_xor(S[5], v32_bswap(M[5]));
S3l = v32_xor(S[6], v32_bswap(M[6]));
S3h = v32_xor(S[7], v32_bswap(M[7]));
#define S(i) S##i
/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */
#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
/*
* We split the round function in two halves
* so as to insert some independent computations in between
*/
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = v32_shufxor(a##l,1); \
d##h = v32_shufxor(a##h,1); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = v32_shufxor(a##h,2); \
d##h = v32_shufxor(a##l,2); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = v32_shufxor(a##l,2); \
d##h = v32_shufxor(a##h,2); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = v32_shufxor(a##l,3); \
d##h = v32_shufxor(a##h,3); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = v32_shufxor(a##h,1); \
d##h = v32_shufxor(a##l,1); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = v32_shufxor(a##h,3); \
d##h = v32_shufxor(a##l,3); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
if (PRINT_SOME) { \
int j; \
v32 ww=w##l, aa=a##l, bb=b##l, cc=c##l, dd=d##l; \
u32 *WW = (void*)&ww; \
u32 *AA = (void*)&aa; \
u32 *BB = (void*)&bb; \
u32 *CC = (void*)&cc; \
u32 *DD = (void*)&dd; \
for (j=0; j<4; j++) { \
printf ("%08x/%2i/%2i: %08x %08x %08x %08x\n", \
WW[j], r, s, \
AA[j], BB[j], CC[j], DD[j]); \
} \
} \
TTl = Fl(a,b,c,fun); \
TTh = Fh(a,b,c,fun); \
a##l = v32_rotate(a##l,r); \
a##h = v32_rotate(a##h,r); \
w##l = v32_add(w##l, d##l); \
w##h = v32_add(w##h, d##h); \
TTl = v32_add(TTl, w##l); \
TTh = v32_add(TTh, w##h); \
TTl = v32_rotate(TTl,s); \
TTh = v32_rotate(TTh,s); \
PERM(z,d,a); \
} while(0)
#define STEP_1(a,b,c,d,w,fun,r,s,z) \
STEP_1_(a,b,c,d,w,fun,r,s,z)
#define STEP_2_(a,b,c,d,w,fun,r,s) \
do { \
d##l = v32_add(d##l, TTl); \
d##h = v32_add(d##h, TTh); \
} while(0)
#define STEP_2(a,b,c,d,w,fun,r,s) \
STEP_2_(a,b,c,d,w,fun,r,s)
#define STEP(a,b,c,d,w1,w2,fun,r,s,z) \
do { \
register v32 TTl, TTh, Wl=w1, Wh=w2; \
STEP_1(a,b,c,d,W,fun,r,s,z); \
STEP_2(a,b,c,d,W,fun,r,s); \
} while(0);
#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)
#define MSG(w,hh,ll,u,z) \
do { \
int a = MSG_##u(hh); \
int b = MSG_##u(ll); \
w##l = v16_mergel(W[a], W[b]); \
w##l = V1632(v16_mul(V3216(w##l), code[z].v16)); \
w##h = v16_mergeh(W[a], W[b]); \
w##h = V1632(v16_mul(V3216(w##h), code[z].v16)); \
} while(0)
#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
fun,r,s,t,u,z) \
do { \
register v32 W0l, W1l, W2l, W3l, TTl; \
register v32 W0h, W1h, W2h, W3h, TTh; \
MSG(W0,h0,l0,u0,z); \
STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, 0); \
MSG(W1,h1,l1,u1,z); \
STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, 1); \
MSG(W2,h2,l2,u2,z); \
STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, 2); \
MSG(W3,h3,l3,u3,z); \
STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, 3); \
STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)
/*
* 4 rounds with code 185
*/
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
/*
* 4 rounds with code 233
*/
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
/*
* 1 round as feed-forward
*/
#define PERM_START 4
STEP(S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0);
STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3);
#undef PERM_START
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
if (state->hashbitlen <= 256) {
union cv Y[16];
short* y = (short*) Y[0].u16;
#ifdef v16_broadcast
if (final == 2) {
fft128_msg_final(y, m);
rounds(state->A, m, y);
} else {
fft128_msg(y, m, final);
rounds(state->A, m, y);
}
#else
fft128_msg(y, m, final);
rounds(state->A, m, y);
#endif
} else {
union cv Y[32];
short* y = (short*) Y[0].u16;
fft256_msg(y, m, final);
rounds512(state->A, m, y);
}
}
/*
* Give the FFT output in the regular order for consistency checks
*/
void fft128_natural(fft_t *x, unsigned char *a) {
union cv Y[16];
short* y = (short*) Y[0].u16;
int i;
fft128_msg(y, a, 0);
for(i=0; i<64; i++) {
x[2*i] = y[i];
x[2*i+1] = y[i+64];
}
}
#endif // SSE2

View File

@@ -1,248 +0,0 @@
#ifndef __VECTOR_H__
#define __VECTOR_H__
#include "compat.h"
#include "simd-utils.h"
/*******************************
* Using GCC vector extensions *
*******************************/
//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef char v16qi __attribute__ ((vector_size (16)));
typedef short v8hi __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));
typedef float v4sf __attribute__ ((vector_size (16)));
typedef long long int v2di __attribute__ ((vector_size (16)));
typedef short v4hi __attribute__ ((vector_size (8)));
typedef unsigned char v8qi __attribute__ ((vector_size (8)));
typedef v16qi v8;
typedef v8hi v16;
typedef v4si v32;
#define V16_SIZE 8
union cv {
unsigned short u16[8];
v16 v16;
};
union cv8 {
unsigned char u8[16];
v8 v8;
};
union u32 {
u32 u[4];
v32 v;
};
#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x) ( (v8) (x))
#define V816(x) ((v16) (x))
#if 0
/* These instructions are shorter than the PAND/POR/... that GCC uses */
#define vec_and(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andps ((v4sf) a, (v4sf) b);})
#define vec_or(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_orps ((v4sf) a, (v4sf) b);})
#define vec_xor(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_xorps ((v4sf) a, (v4sf) b);})
#define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andnps ((v4sf) a, (v4sf) b);})
#define v16_and(x,y) ((v16) vec_and ((x), (y)))
#define v16_or(x,y) ((v16) vec_or ((x), (y)))
#define v16_xor(x,y) ((v16) vec_xor ((x), (y)))
#define v16_andn(x,y) ((v16) vec_andn((x), (y)))
#define v32_and(x,y) ((v32) vec_and ((x), (y)))
#define v32_or(x,y) ((v32) vec_or ((x), (y)))
#define v32_xor(x,y) ((v32) vec_xor ((x), (y)))
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif
//TODO aarch support for widening multiply
#if defined(__SSE2__)
#define vec_and(x,y) ((x)&(y))
#define vec_or(x,y) ((x)|(y))
#define vec_xor(x,y) ((x)^(y))
#define v16_and vec_and
#define v16_or vec_or
#define v16_xor vec_xor
#define v32_and vec_and
#define v32_or vec_or
#define v32_xor vec_xor
#define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) x, (v2di) y)
#define v16_andn(x,y) ((v16) vec_andn(x,y))
#define v32_andn(x,y) ((v32) vec_andn(x,y))
#define v32_add(x,y) ((x)+(y))
#define v16_add(x,y) ((x)+(y))
#define v16_sub(x,y) ((x)-(y))
#define v16_mul(x,y) ((x)*(y))
#define v16_neg(x) (-(x))
#define v16_shift_l __builtin_ia32_psllwi128
#define v16_shift_r __builtin_ia32_psrawi128
#define v16_cmp __builtin_ia32_pcmpgtw128
#define v16_interleavel __builtin_ia32_punpcklwd128
#define v16_interleaveh __builtin_ia32_punpckhwd128
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v32_shift_l __builtin_ia32_pslldi128
#define v32_shift_r __builtin_ia32_psrldi128
#define v32_rotate(x,n) \
v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#elif defined(__aarch64__) && defined(__ARM_NEON)
#define vec_and( x, y ) v128_and( x, y )
#define vec_or(x,y) v128_or( x, y )
#define vec_xor(x,y) v128_xor( x, y )
#define v16_and v128_and
#define v16_or v128_or
#define v16_xor v128_xor
#define v32_and v128_and
#define v32_or v128_or
#define v32_xor v128_xor
#define vec_andn( x,y ) v128_andnot( x, y )
#define v16_andn vec_andn
#define v32_andn vec_andn
#define v32_add( x, y ) v128_add32( x, y )
#define v16_add( x, y ) v128_add16( x, y )
#define v16_sub( x, y ) v128_sub16( x, y )
#define v16_mul( x, y ) v128_mul16( x, y )
#define v16_neg(x) v128_negate16( x )
#define v16_shift_l( x, c ) v128_sl16
#define v16_shift_r v128_sr16
#define v16_cmp v128_cmpgt16
#define v16_interleavel v128_unpacklo16
#define v16_interleaveh v128_unpackhi16
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v32_shift_l v128_sl32
#define v32_shift_r v128_sr32
#define v32_rotate(x,n) v128_rol32
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#else
#error "I don't know how to vectorize on this architecture."
#endif
/* Twiddle tables */
static const union cv FFT64_Twiddle[] = {
{{1, 2, 4, 8, 16, 32, 64, 128}},
{{1, 60, 2, 120, 4, -17, 8, -34}},
{{1, 120, 8, -68, 64, -30, -2, 17}},
{{1, 46, 60, -67, 2, 92, 120, 123}},
{{1, 92, -17, -22, 32, 117, -30, 67}},
{{1, -67, 120, -73, 8, -22, -68, -70}},
{{1, 123, -34, -70, 128, 67, 17, 35}},
};
static const union cv FFT128_Twiddle[] = {
{{ 1, -118, 46, -31, 60, 116, -67, -61}},
{{ 2, 21, 92, -62, 120, -25, 123, -122}},
{{ 4, 42, -73, -124, -17, -50, -11, 13}},
{{ 8, 84, 111, 9, -34, -100, -22, 26}},
{{ 16, -89, -35, 18, -68, 57, -44, 52}},
{{ 32, 79, -70, 36, 121, 114, -88, 104}},
{{ 64, -99, 117, 72, -15, -29, 81, -49}},
{{128, 59, -23, -113, -30, -58, -95, -98}},
};
static const union cv FFT256_Twiddle[] = {
{{ 1, 41, -118, 45, 46, 87, -31, 14}},
{{ 60, -110, 116, -127, -67, 80, -61, 69}},
{{ 2, 82, 21, 90, 92, -83, -62, 28}},
{{ 120, 37, -25, 3, 123, -97, -122, -119}},
{{ 4, -93, 42, -77, -73, 91, -124, 56}},
{{ -17, 74, -50, 6, -11, 63, 13, 19}},
{{ 8, 71, 84, 103, 111, -75, 9, 112}},
{{ -34, -109, -100, 12, -22, 126, 26, 38}},
{{ 16, -115, -89, -51, -35, 107, 18, -33}},
{{ -68, 39, 57, 24, -44, -5, 52, 76}},
{{ 32, 27, 79, -102, -70, -43, 36, -66}},
{{ 121, 78, 114, 48, -88, -10, 104, -105}},
{{ 64, 54, -99, 53, 117, -86, 72, 125}},
{{ -15, -101, -29, 96, 81, -20, -49, 47}},
{{ 128, 108, 59, 106, -23, 85, -113, -7}},
{{ -30, 55, -58, -65, -95, -40, -98, 94}}
};
#endif

View File

@@ -8,15 +8,15 @@ bool register_skein_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined(SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#elif defined(SKEIN_2WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_2x64;
gate->hash = (void*)&skeinhash_2x64;
#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif

View File

@@ -240,10 +240,10 @@ void sm3_8way_close( void *cc, void *dst )
#if defined(__SSE2__)
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \
mm128_rol_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
mm128_rol_32( x, 23 ) ) )
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 9 ), \
v128_rol32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 15 ), \
v128_rol32( x, 23 ) ) )
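// Scalar reference of the SM3 permutations, for comparison with the vector
// macros above (rol32 is a hypothetical rotate-left helper):
//    P0(x) = x ^ rol32( x, 9 ) ^ rol32( x, 17 )
//    P1(x) = x ^ rol32( x, 15 ) ^ rol32( x, 23 )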
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
@@ -273,13 +273,13 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm128_bswap_32( block[j] );
W[j] = v128_bswap32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
W[ j-9 ] ),
mm128_rol_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm128_rol_32( W[ j-13 ], 7 ),
v128_rol32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( v128_rol32( W[ j-13 ], 7 ),
W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
@@ -288,19 +288,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
T = _mm_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
mm128_rol_var_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm128_rol_32( B, 9 );
C = v128_rol32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm128_rol_32( F, 19 );
G = v128_rol32( F, 19 );
F = E;
E = P0( TT2 );
}
@@ -308,19 +308,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
mm128_rol_var_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm128_rol_32( B, 9 );
C = v128_rol32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm128_rol_32( F, 19 );
G = v128_rol32( F, 19 );
F = E;
E = P0( TT2 );
}
@@ -408,14 +408,14 @@ void sm3_4way_close( void *cc, void *dst )
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm128_bswap_32(
count[0] = v128_bswap32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
count[1] = v128_bswap32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm128_bswap_32( ctx->digest[i] );
hash[i] = v128_bswap32( ctx->digest[i] );
}
#endif

View File

@@ -137,53 +137,8 @@ void verthash_info_free(verthash_info_t* info)
#define VH_N_INDEXES 4096
#define VH_BYTE_ALIGNMENT 16
static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
{
return (a ^ b) * 0x1000193;
}
#define fnv1a( a, b ) ( ( (a) ^ (b) ) * 0x1000193 )
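// Usage sketch (matches verthash_hash below): fold 32-bit words into an
// accumulator seeded with the standard FNV-1a offset basis.
//    uint32_t acc = 0x811c9dc5;          // offset basis
//    acc = fnv1a( acc, blob[i] );        // 0x1000193 is the 32-bit FNV prime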
#if 0
static void rotate_indexes( uint32_t *p )
{
#if defined(__AVX2__)
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
{
__m256i *px = (__m256i*)p + x;
px[0] = mm256_rol_32( px[0], 1 );
px[1] = mm256_rol_32( px[1], 1 );
px[2] = mm256_rol_32( px[2], 1 );
px[3] = mm256_rol_32( px[3], 1 );
px[4] = mm256_rol_32( px[4], 1 );
px[5] = mm256_rol_32( px[5], 1 );
px[6] = mm256_rol_32( px[6], 1 );
px[7] = mm256_rol_32( px[7], 1 );
}
#else
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
{
__m128i *px = (__m128i*)p0_index + x;
px[0] = mm128_rol_32( px[0], 1 );
px[1] = mm128_rol_32( px[1], 1 );
px[2] = mm128_rol_32( px[2], 1 );
px[3] = mm128_rol_32( px[3], 1 );
px[4] = mm128_rol_32( px[4], 1 );
px[5] = mm128_rol_32( px[5], 1 );
px[6] = mm128_rol_32( px[6], 1 );
px[7] = mm128_rol_32( px[7], 1 );
}
#endif
/*
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
*/
}
#endif
// Vectorized and targeted version of fnv1a
#if defined (__AVX2__)
@@ -191,7 +146,7 @@ static void rotate_indexes( uint32_t *p )
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#define MULXOR \
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -229,7 +184,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
MULXOR; \
}
// subsequent passes rotate by r on demand, no need for mass rotate
// subsequent passes rotate by r
#define ROUND_r( r ) \
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
@@ -243,8 +198,8 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
void verthash_hash( const void *blob_bytes, const size_t blob_size,
const void *input, void *output )
{
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (32)));
const uint32_t *blob = (const uint32_t*)blob_bytes;
uint32_t accumulator = 0x811c9dc5;
const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
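
The fnv1a macro introduced above is the standard 32-bit FNV-1a mixing step; a minimal scalar sketch using the published FNV prime:

#include <stdint.h>

/* 32-bit FNV-1a mix step: xor the accumulator with the input word,
   then multiply by the FNV prime 0x01000193. */
static inline uint32_t fnv1a_mix( uint32_t a, uint32_t b )
{
   return ( a ^ b ) * 0x1000193u;
}

Each ROUND pass above chains this step over the subset words, with the accumulator seeded from the FNV offset basis 0x811c9dc5 visible in verthash_hash.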

View File

@@ -91,8 +91,8 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
int scanhash_verthash( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -101,9 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
for (int i = 0; i < 20; i++)
edata[i] = bswap_32( pdata[i] );
// v128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
verthash_sha3_512_prehash_72( edata );
do
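
The v128_bswap32_80 call that replaces the loop above byte-swaps the 20 32-bit words of the 80-byte block header; a scalar sketch of the equivalent operation:

#include <stdint.h>

/* Byte-swap one 32-bit word (big-endian <-> little-endian). */
static inline uint32_t bswap32_scalar( uint32_t x )
{
   return ( x >> 24 ) | ( ( x >> 8 ) & 0x0000ff00u )
        | ( ( x << 8 ) & 0x00ff0000u ) | ( x << 24 );
}

/* Scalar equivalent of v128_bswap32_80(): swap all 20 header words. */
static void bswap32_80( uint32_t edata[20], const uint32_t pdata[20] )
{
   for ( int i = 0; i < 20; i++ )
      edata[i] = bswap32_scalar( pdata[i] );
}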

View File

@@ -204,11 +204,11 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
const __m512i eight = _mm512_set1_epi64( 8 );
const bool bench = opt_benchmark;
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -372,11 +372,11 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );

View File

@@ -13,11 +13,7 @@
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -43,11 +39,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} c11_ctx_holder;
c11_ctx_holder c11_ctx __attribute__ ((aligned (64)));
@@ -69,11 +61,6 @@ void init_c11_ctx()
init_luffa( &c11_ctx.luffa, 512 );
cubehashInit( &c11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &c11_ctx.simd );
#else
init_sd( &c11_ctx.simd, 512 );
#endif
}
void c11_hash( void *output, const void *input )
@@ -105,41 +92,35 @@ void c11_hash( void *output, const void *input )
sph_skein512( &ctx.skein, (const void*) hash, 64 );
sph_skein512_close( &ctx.skein, hash );
update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
#endif
memcpy(output, hash, 32);
memcpy(output, hash, 32);
}
int scanhash_c11( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);

View File

@@ -13,17 +13,13 @@
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -37,11 +33,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
#ifdef __AES__
hashState_groestl groestl;
#else
@@ -62,11 +54,6 @@ void init_tt10_ctx()
init_luffa( &tt10_ctx.luffa, 512 );
cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &tt10_ctx.simd );
#else
init_sd( &tt10_ctx.simd, 512 );
#endif
#ifdef __AES__
init_groestl( &tt10_ctx.groestl, 64 );
#else
@@ -222,27 +209,7 @@ void timetravel10_hash(void *output, const void *input)
}
break;
case 9:
if ( i == 0 )
{
memcpy( &ctx.simd, &tt10_mid.simd, sizeof tt10_mid.simd );
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) input + midlen, tail );
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hashB,
(const BitSequence *)input + midlen, tail*8 );
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hashA, dataLen*8 );
final_sd( &ctx.simd, (BitSequence *)hashB );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, dataLen );
break;
default:
break;
@@ -325,15 +292,6 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
memcpy( &tt10_mid.shavite, &tt10_ctx.shavite, sizeof(tt10_mid.shavite ) );
sph_shavite512( &tt10_mid.shavite, endiandata, 64 );
break;
case 9:
memcpy( &tt10_mid.simd, &tt10_ctx.simd, sizeof(tt10_mid.simd ) );
#if defined(__aarch64__)
sph_simd512( &tt10_mid.simd, (const void*) endiandata, 64 );
sph_simd512_close( &tt10_mid.simd, hash);
#else
update_sd( &tt10_mid.simd, (const BitSequence *)endiandata, 512 );
#endif
break;
default:
break;
}

View File

@@ -22,12 +22,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake;
@@ -45,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} x11_ctx_holder;
x11_ctx_holder x11_ctx;
@@ -71,11 +62,6 @@ void init_x11_ctx()
init_luffa( &x11_ctx.luffa, 512 );
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x11_ctx.simd );
#else
init_sd( &x11_ctx.simd, 512 );
#endif
}
void x11_hash( void *state, const void *input )
@@ -118,13 +104,7 @@ void x11_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -20,11 +20,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
typedef struct {
@@ -37,11 +33,7 @@ typedef struct {
#endif
hashState_luffa luffa;
cubehashParam cube;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -63,11 +55,6 @@ void init_x11evo_ctx()
#endif
init_luffa( &x11evo_ctx.luffa, 512 );
cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init( &x11evo_ctx.simd );
#else
init_sd( &x11evo_ctx.simd, 512 );
#endif
sph_blake512_init( &x11evo_ctx.blake );
sph_bmw512_init( &x11evo_ctx.bmw );
sph_skein512_init( &x11evo_ctx.skein );
@@ -146,12 +133,7 @@ void x11evo_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, (char*)hash );
break;
case 9:
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
break;
case 10:
#ifdef __AES__

View File

@@ -17,12 +17,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_gost512_context gost;
} x11gost_ctx_holder;
@@ -75,11 +66,6 @@ void init_x11gost_ctx()
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init(&x11gost_ctx.simd);
#else
init_sd( &x11gost_ctx.simd, 512 );
#endif
}
void x11gost_hash(void *output, const void *input)
@@ -123,13 +109,7 @@ void x11gost_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );
#if defined(__aarch64__)
sph_simd512 (&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,11 +17,7 @@
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -44,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x12_ctx_holder;
@@ -68,14 +60,9 @@ void init_x12_ctx()
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#endif
init_luffa( &x12_ctx.luffa, 512 );
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x12_ctx.simd );
#else
init_sd( &x12_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x12_ctx.hamsi );
};
@@ -101,13 +88,7 @@ void x12hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hashB, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
simd512_ctx( &ctx.simd, hash, hashB, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,

View File

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -48,11 +44,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x13_ctx_holder;
@@ -77,11 +69,6 @@ void init_x13_ctx()
init_luffa( &x13_ctx.luffa, 512 );
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x13_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init(&x13_ctx.simd);
#else
init_sd( &x13_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x13_ctx.hamsi );
};
@@ -121,13 +108,7 @@ void x13hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -15,11 +15,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +43,7 @@ typedef struct {
sph_skein512_context skein;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sm3_ctx_t sm3;
} x13bcd_ctx_holder;
@@ -76,11 +68,6 @@ void init_x13bcd_ctx()
sph_keccak512_init( &x13bcd_ctx.keccak );
cubehashInit( &x13bcd_ctx.cube,512,16,32 );
sph_shavite512_init( &x13bcd_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x13bcd_ctx.simd );
#else
init_sd( &x13bcd_ctx.simd, 512 );
#endif
sm3_init( &x13bcd_ctx.sm3 );
sph_hamsi512_init( &x13bcd_ctx.hamsi );
};
@@ -127,13 +114,7 @@ void x13bcd_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,11 +17,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -46,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
@@ -75,11 +67,6 @@ void init_x13sm3_ctx()
init_luffa( &hsr_ctx.luffa,512 );
cubehashInit( &hsr_ctx.cube,512,16,32 );
sph_shavite512_init( &hsr_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &hsr_ctx.simd );
#else
init_sd( &hsr_ctx.simd,512 );
#endif
sm3_init( &hsr_ctx.sm3 );
sph_hamsi512_init( &hsr_ctx.hamsi );
sph_fugue512_init( &hsr_ctx.fugue );
@@ -123,13 +110,7 @@ void x13sm3_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
//11---echo---
#ifdef __AES__

View File

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -49,11 +45,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
} x14_ctx_holder;
@@ -79,11 +71,6 @@ void init_x14_ctx()
init_luffa( &x14_ctx.luffa,512 );
cubehashInit( &x14_ctx.cube,512,16,32 );
sph_shavite512_init( &x14_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x14_ctx.simd );
#else
init_sd( &x14_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x14_ctx.hamsi );
sph_shabal512_init( &x14_ctx.shabal );
};
@@ -124,13 +111,7 @@ void x14hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,12 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -52,11 +47,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -83,11 +74,6 @@ void init_x15_ctx()
init_luffa( &x15_ctx.luffa,512 );
cubehashInit( &x15_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x15_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x15_ctx.simd );
#else
init_sd( &x15_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x15_ctx.hamsi );
sph_shabal512_init( &x15_ctx.shabal );
sph_whirlpool_init( &x15_ctx.whirlpool );
@@ -131,13 +117,7 @@ void x15hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -236,7 +236,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
if ( hex_hash( hash32, edata, thr_id ) );
if ( hex_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
be32enc( &pdata[19], nonce );
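
The fix above removes a stray semicolon that made the `if` an empty statement, so the submission path ran unconditionally; a minimal illustration of the pattern with hypothetical stand-in functions:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the hash and submit calls, only to show
   the control-flow difference. */
static bool compute( int n ) { return ( n & 1 ) == 0; }
static void submit( int n )  { printf( "submit %d\n", n ); }

int main( void )
{
   int nonce = 1;

   /* Buggy form: the trailing ';' is the entire body of the if, so the
      indented call below it runs unconditionally. */
   if ( compute( nonce ) );
      submit( nonce );

   /* Fixed form, as in the patch: the call only runs when compute()
      returns true. */
   if ( compute( nonce ) )
      submit( nonce );

   return 0;
}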

View File

@@ -318,7 +318,7 @@ bool register_minotaur_algo( algo_gate_t* gate )
gate->hash = (void*)&minotaur_hash;
gate->miner_thread_init = (void*)&initialize_torture_garden;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA256_OPT;
return true;
};

View File

@@ -526,7 +526,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x16r_8way_hash( hash, vdata, thr_id ) );
if ( x16r_8way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -952,7 +952,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_4way_hash( hash, vdata, thr_id ) );
if ( x16r_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -1353,7 +1353,7 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) );
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -15,7 +15,6 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"

View File

@@ -137,7 +137,7 @@ int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x20r_8x64_hash( hash, vdata, thr_id ) );
if ( x20r_8x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -205,7 +205,7 @@ int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_4x64_hash( hash, vdata, thr_id ) );
if ( x20r_4x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -269,7 +269,7 @@ int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_2x64_hash( hash, vdata, thr_id ) );
if ( x20r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -53,11 +49,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -86,11 +78,6 @@ void init_sonoa_ctx()
init_luffa( &sonoa_ctx.luffa, 512 );
cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &sonoa_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &sonoa_ctx.simd );
#else
init_sd( &sonoa_ctx.simd, 512 );
#endif
sph_hamsi512_init( &sonoa_ctx.hamsi );
sph_shabal512_init( &sonoa_ctx.shabal );
sph_whirlpool_init( &sonoa_ctx.whirlpool );
@@ -134,13 +121,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -189,13 +170,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -249,13 +224,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -318,13 +287,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -410,13 +373,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -483,13 +440,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -527,7 +478,6 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_whirlpool_close(&ctx.whirlpool, hash);
if ( work_restart[thr_id].restart ) return 0;
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
@@ -565,13 +515,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );

View File

@@ -418,11 +418,11 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
@@ -681,11 +681,11 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -895,11 +895,11 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );
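
The v128_swap64_32 casts above implement the "convert LE32 to LE64" comment by swapping the 32-bit halves of every 64-bit word of the 80-byte header; a scalar sketch of the same permutation:

#include <stdint.h>

/* Scalar equivalent of the v128_swap64_32() calls at these call sites:
   within every 64-bit word of the header, swap the two 32-bit halves so
   the LE32 data matches the 64-bit lane order of the interleaved kernels. */
static void swap64_32_80( uint32_t edata[20], const uint32_t pdata[20] )
{
   for ( int i = 0; i < 20; i += 2 )
   {
      edata[i]     = pdata[i + 1];
      edata[i + 1] = pdata[i];
   }
}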

View File

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/fugue/fugue-aesni.h"
@@ -34,7 +30,7 @@
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
//#include "algo/cubehash/sph_cubehash.h"
#include "algo/luffa/sph_luffa.h"
@@ -63,17 +59,9 @@ union _x17_context_overlay
#else
hashState_luffa luffa;
#endif
//#if defined(__aarch64__)
// sph_cubehash512_context cube;
//#else
cubehashParam cube;
//#endif
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -127,26 +115,13 @@ int x17_hash(void *output, const void *input, int thr_id )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
#endif
//#if defined(__aarch64__)
// sph_cubehash512_init(&ctx.cube);
// sph_cubehash512(&ctx.cube, (const void*) hash, 64);
// sph_cubehash512_close(&ctx.cube, hash);
//#else
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
//#endif
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,

View File

@@ -17,11 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
@@ -45,11 +41,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -78,11 +70,6 @@ void init_xevan_ctx()
init_luffa( &xevan_ctx.luffa, 512 );
cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &xevan_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &xevan_ctx.simd );
#else
init_sd( &xevan_ctx.simd, 512 );
#endif
sph_hamsi512_init( &xevan_ctx.hamsi );
sph_shabal512_init( &xevan_ctx.shabal );
sph_whirlpool_init( &xevan_ctx.whirlpool );
@@ -137,13 +124,7 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512( &ctx.simd, (const void*) hash, dataLen );
sph_simd512_close( &ctx.simd, hash );
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
@@ -210,13 +191,14 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
//#if defined(__aarch64__)
// sph_simd512(&ctx.simd, (const void*) hash, 64);
// sph_simd512_close(&ctx.simd, hash);
//#else
// update_final_sd( &ctx.simd, (BitSequence *)hash,
// (const BitSequence *)hash, dataLen*8 );
//#endif
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,

View File

@@ -31,7 +31,7 @@ bool register_x22i_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT
| AVX512_OPT | VAES_OPT | NEON_OPT;
return true;
};
@@ -48,7 +48,7 @@ bool register_x25x_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x25x;
gate->hash = (void*)&x25x_hash;
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT |
AVX512_OPT | VAES_OPT | NEON_OPT;
InitializeSWIFFTX();
return true;

View File

@@ -18,7 +18,6 @@
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/fugue/fugue-aesni.h"
#include "algo/whirlpool/sph_whirlpool.h"

View File

@@ -71,7 +71,7 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce,
bool register_yescryptr8g_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower_r8g;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;

View File

@@ -162,7 +162,7 @@ bool register_yespower_algo( algo_gate_t* gate )
if ( yespower_params.pers )
applog( LOG_NOTICE,"Key= \"%s\"\n", yespower_params.pers );
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -180,7 +180,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
yespower_params.r = 16;
yespower_params.pers = NULL;
yespower_params.perslen = 0;
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -195,7 +195,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -233,7 +233,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -251,7 +251,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -269,7 +269,7 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
bool register_yescryptr32_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;

2
api.c
View File

@@ -531,7 +531,7 @@ static void api()
time_t bindstart;
struct sockaddr_in serv;
struct sockaddr_in cli;
socklen_t clisiz;
uint32_t clisiz;
bool addrok = false;
long long counter;
char *result;

View File

@@ -1,14 +0,0 @@
#!/bin/bash
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -4,72 +4,54 @@
# during development. However the information contained may provide compilation
# tips to users.
rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
# armv9 needs gcc-13
# -march=armv9-a includes SVE2 but no crypto
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv9-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-aes-sha3
mv cpuminer cpuminer-armv9-crypto-sha3
make clean || echo clean
CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv9-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-aes-sha3-sve2
mv cpuminer cpuminer-armv9-crypto
make clean || echo clean
CFLAGS="-O3 -march=armv8.2-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8.2-aes-sha3-sve2
mv cpuminer cpuminer-armv9
# SVE2 available in armv8.5
make clean || echo clean
CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes-sha2-sve2
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4
make clean || echo clean
CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes-sha2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto+sha2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-sha2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto+aes -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes
mv cpuminer cpuminer-armv8.4-crypto-sha3
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-crypto
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -4,7 +4,7 @@
# during development. However the information contained may provide compilation
# tips to users.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean
@@ -18,28 +18,55 @@ strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
# Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-alderlake
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
# Intel Core Arrowlake: AVX2 SHA512 VAES, needs gcc-14
# Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14
# Arrowlake-s includes SHA512, Arrowlake does not?
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-arrowlake
#mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Apparently Graniterapids will not include AVX10, SHA512 or APX,
# wait for Diamondrapids & gcc-15.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-graniterapids
# Force AVX10-256
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -mavx10.1-256 -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-256
# Force SHA512 AVX10-512
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1-512 -Wall" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-512
# Zen5: AVX512 SHA VAES, requires gcc-14.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=znver5" ./configure --with-curl
#CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-zen4
#mv cpuminer cpuminer-zen5
# Zen4: AVX512 SHA VAES
make clean || echo clean
@@ -70,7 +97,7 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx512
# AVX2 SHA VAES: generic
# AVX2 SHA VAES: generic, zen3, alderlake...arrowlake
make clean || echo done
rm -f config.status
# vaes doesn't include aes

View File

@@ -1,27 +1,9 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
#!/bin/sh
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -1,10 +0,0 @@
#!/bin/bash
#
# Compile on Windows using MSYS2 and MinGW.
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -1,20 +1,9 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
#!/bin/sh
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
#strip -s cpuminer

View File

@@ -1,8 +1,8 @@
#!/bin/bash
#!/bin/sh
#
# make clean and rm all the targeted executables.
rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null

View File

@@ -3,7 +3,7 @@
#ifdef WIN32
#if _WIN32_WINNT==0x0601 // Windows 7
#if _WIN32_WINNT>=0x0601 // Windows 7
#define WINDOWS_CPU_GROUPS_ENABLED 1
#endif
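
The relaxed comparison above enables CPU-group support for Windows 7 and every later target level rather than only an exact Windows 7 build; a minimal sketch of the effect (the 0x0602 value is just an example target):

/* With the old exact comparison, any build targeting a newer Windows
   version (e.g. _WIN32_WINNT 0x0602 for Windows 8) silently lost
   CPU-group support; the >= form keeps it enabled from Windows 7 up. */
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0602        /* example target: Windows 8 */
#endif

#if _WIN32_WINNT >= 0x0601         /* Windows 7 or later */
#define WINDOWS_CPU_GROUPS_ENABLED 1
#endif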

425
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.3.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='24.2'
PACKAGE_STRING='cpuminer-opt 24.2'
PACKAGE_VERSION='25.3'
PACKAGE_STRING='cpuminer-opt 25.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -657,14 +657,14 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
MINGW_TRUE
ARCH_ARM_FALSE
ARCH_ARM_TRUE
ARCH_ARM64_FALSE
ARCH_ARM64_TRUE
ARCH_x86_64_FALSE
ARCH_x86_64_TRUE
ARCH_x86_FALSE
ARCH_x86_TRUE
USE_ASM_FALSE
USE_ASM_TRUE
HAVE_WINDOWS_FALSE
@@ -796,7 +796,6 @@ enable_maintainer_mode
enable_dependency_tracking
enable_assembly
with_curl
with_crypto
'
ac_precious_vars='build_alias
host_alias
@@ -1360,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 24.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.3:";;
esac
cat <<\_ACEOF
@@ -1455,7 +1454,6 @@ Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
--with-curl=PATH prefix where curl is installed default=/usr
--with-crypto=PATH prefix where openssl crypto is installed default=/usr
Some influential environment variables:
CC C compiler command
@@ -1538,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 24.2
cpuminer-opt configure 25.3
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 24.2, which was
It was created by cpuminer-opt $as_me 25.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='24.2'
VERSION='25.3'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6502,32 +6500,30 @@ then :
fi
MINGW_TARGET=`$CC -dumpmachine 2>&1`
case $MINGW_TARGET in
arm*-*-*)
have_arm=true
;;
i*86-*-mingw*)
have_x86=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
WS2_LIBS="-lws2_32"
;;
x86_64-*-mingw*|amd64-*-mingw*)
have_x86_64=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
# SHOULD BE AT END! after -lcrypto #
WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
;;
i*86-*-*)
have_x86=true
;;
case $target in
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
;;
esac
PTHREAD_FLAGS="-pthread"
WS2_LIBS=""
case $target in
*-*-mingw*)
have_win32=true
PTHREAD_FLAGS=""
WS2_LIBS="-lws2_32"
;;
*-apple-*)
have_apple=true
;;
esac
# Check whether --enable-assembly was given.
@@ -6542,126 +6538,7 @@ printf "%s\n" "#define USE_ASM 1" >>confdefs.h
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5
printf %s "checking whether we can compile AVX code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vmovdqa %ymm0, %ymm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5
printf %s "checking whether we can compile XOP code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vprotd \$7, %xmm0, %xmm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_XOP 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5
printf %s "checking whether we can compile AVX2 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %ymm0, %ymm1, %ymm2");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX2 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
printf %s "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX512 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5
printf %s "checking for json_loads in -ljansson... " >&6; }
if test ${ac_cv_lib_jansson_json_loads+y}
@@ -6705,51 +6582,7 @@ else $as_nop
fi
# GC2 for GNU static
if test "x$have_win32" = "xtrue" ; then
# MinGW
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
printf %s "checking for pthread_create in -lpthread... " >&6; }
if test ${ac_cv_lib_pthread_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthread $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthread_pthread_create=yes
else $as_nop
ac_cv_lib_pthread_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
if test "x$ac_cv_lib_pthread_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC2"
fi
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
printf %s "checking for pthread_create in -lpthread... " >&6; }
if test ${ac_cv_lib_pthread_pthread_create+y}
then :
@@ -6787,12 +6620,132 @@ printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
if test "x$ac_cv_lib_pthread_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthread"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC2" >&5
printf %s "checking for pthread_create in -lpthreadGC2... " >&6; }
if test ${ac_cv_lib_pthreadGC2_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC2 $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC2_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC2_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC2_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC2_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC2_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC2"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC1" >&5
printf %s "checking for pthread_create in -lpthreadGC1... " >&6; }
if test ${ac_cv_lib_pthreadGC1_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC1 $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC1_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC1_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC1_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC1_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC1_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC1"
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC" >&5
printf %s "checking for pthread_create in -lpthreadGC... " >&6; }
if test ${ac_cv_lib_pthreadGC_pthread_create+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_check_lib_save_LIBS=$LIBS
LIBS="-lpthreadGC $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
char pthread_create ();
int
main (void)
{
return pthread_create ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"
then :
ac_cv_lib_pthreadGC_pthread_create=yes
else $as_nop
ac_cv_lib_pthreadGC_pthread_create=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC_pthread_create" >&5
printf "%s\n" "$ac_cv_lib_pthreadGC_pthread_create" >&6; }
if test "x$ac_cv_lib_pthreadGC_pthread_create" = xyes
then :
PTHREAD_LIBS="-lpthreadGC"
fi
fi
fi
fi
LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5
printf %s "checking whether __uint128_t is supported... " >&6; }
@@ -6847,14 +6800,6 @@ else
USE_ASM_FALSE=
fi
if test x$have_x86 = xtrue; then
ARCH_x86_TRUE=
ARCH_x86_FALSE='#'
else
ARCH_x86_TRUE='#'
ARCH_x86_FALSE=
fi
if test x$have_x86_64 = xtrue; then
ARCH_x86_64_TRUE=
ARCH_x86_64_FALSE='#'
@@ -6863,12 +6808,12 @@ else
ARCH_x86_64_FALSE=
fi
if test x$have_arm = xtrue; then
ARCH_ARM_TRUE=
ARCH_ARM_FALSE='#'
if test x$have_arm64 = xtrue; then
ARCH_ARM64_TRUE=
ARCH_ARM64_FALSE='#'
else
ARCH_ARM_TRUE='#'
ARCH_ARM_FALSE=
ARCH_ARM64_TRUE='#'
ARCH_ARM64_FALSE=
fi
if test "x$OS" = "xWindows_NT"; then
@@ -6879,13 +6824,15 @@ else
MINGW_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
if test x$have_apple = xtrue; then
HAVE_APPLE_TRUE=
HAVE_APPLE_FALSE='#'
else
JANSSON_LIBS=-ljansson
HAVE_APPLE_TRUE='#'
HAVE_APPLE_FALSE=
fi
# libcurl install path (for mingw : --with-curl=/usr/local)
# Check whether --with-curl was given.
@@ -6902,30 +6849,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
# Check whether --with-crypto was given.
if test ${with_crypto+y}
then :
withval=$with_crypto;
fi
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])
@@ -7102,22 +7029,22 @@ if test -z "${USE_ASM_TRUE}" && test -z "${USE_ASM_FALSE}"; then
as_fn_error $? "conditional \"USE_ASM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86_64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_ARM_TRUE}" && test -z "${ARCH_ARM_FALSE}"; then
as_fn_error $? "conditional \"ARCH_ARM\" was never defined.
if test -z "${ARCH_ARM64_TRUE}" && test -z "${ARCH_ARM64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_ARM64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${MINGW_TRUE}" && test -z "${MINGW_FALSE}"; then
as_fn_error $? "conditional \"MINGW\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7508,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 24.2, which was
This file was extended by cpuminer-opt $as_me 25.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 24.2
cpuminer-opt config.status 25.3
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [24.2])
AC_INIT([cpuminer-opt], [25.3])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -41,32 +41,30 @@ AC_CHECK_DECLS([be32dec, le32dec, be32enc, le32enc, le16dec, le16enc], [], [],
AC_FUNC_ALLOCA
AC_CHECK_FUNCS([getopt_long])
MINGW_TARGET=`$CC -dumpmachine 2>&1`
case $MINGW_TARGET in
arm*-*-*)
have_arm=true
;;
i*86-*-mingw*)
have_x86=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
WS2_LIBS="-lws2_32"
;;
x86_64-*-mingw*|amd64-*-mingw*)
have_x86_64=true
have_win32=true
CFLAGS="-Icompat/pthreads $CFLAGS"
PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
# SHOULD BE AT END! after -lcrypto #
WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
;;
i*86-*-*)
have_x86=true
;;
case $target in
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
;;
esac
PTHREAD_FLAGS="-pthread"
WS2_LIBS=""
case $target in
*-*-mingw*)
have_win32=true
PTHREAD_FLAGS=""
WS2_LIBS="-lws2_32"
;;
*-apple-*)
have_apple=true
;;
esac
AC_ARG_ENABLE([assembly],
@@ -75,54 +73,14 @@ if test x$enable_assembly != xno; then
AC_DEFINE([USE_ASM], [1], [Define to 1 if assembly routines are wanted.])
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
AC_MSG_CHECKING(whether we can compile AVX code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile XOP code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])],
AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the XOP instruction set.])
)
AC_MSG_CHECKING(whether we can compile AVX2 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX instruction set.])
)
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
# GC2 for GNU static
if test "x$have_win32" = "xtrue" ; then
# MinGW
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",[])
else
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",[])
fi
LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
AC_CHECK_LIB([pthreadGC1], [pthread_create], PTHREAD_LIBS="-lpthreadGC1",
AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
))))
AC_MSG_CHECKING(whether __uint128_t is supported)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
@@ -136,16 +94,10 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
AM_CONDITIONAL([ARCH_ARM], [test x$have_arm = xtrue])
AM_CONDITIONAL([ARCH_ARM64], [test x$have_arm64 = xtrue])
AM_CONDITIONAL([MINGW], [test "x$OS" = "xWindows_NT"])
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
AM_CONDITIONAL([HAVE_APPLE], [test x$have_apple = xtrue])
# libcurl install path (for mingw : --with-curl=/usr/local)
AC_ARG_WITH([curl],
@@ -158,25 +110,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
AC_ARG_WITH([crypto],
[ --with-crypto=PATH prefix where openssl crypto is installed [default=/usr]])
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])

2226
configure~

File diff suppressed because it is too large.

View File

@@ -206,7 +206,7 @@ static uint32_t last_block_height = 0;
static double highest_share = 0; // highest accepted share diff
static double lowest_share = 9e99; // lowest accepted share diff
static double last_targetdiff = 0.;
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
static uint32_t hi_temp = 0;
static uint32_t prev_temp = 0;
#endif
@@ -286,15 +286,15 @@ static inline void drop_policy(void) { }
static void affine_to_cpu( struct thr_info *thr )
{
int thread = thr->id;
unsigned long last_error;
bool ok;
unsigned long last_error = 0;
bool ok = true;
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
unsigned long group_size = GetActiveProcessorCount( 0 );
unsigned long group = thread / group_size;
unsigned long cpu = thread_affinity_map[ thread % group_size ];
GROUP_AFFINITY affinity;
GROUP_AFFINITY affinity = {0};
affinity.Group = group;
affinity.Mask = 1ULL << cpu;
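A worked illustration of the mapping above, under the assumption of 64 logical CPUs per Windows processor group; the values are hypothetical and only show how a thread index resolves to a group and an in-group slot.
// Illustrative sketch only; group size and thread id are assumed values.
static inline void demo_cpu_group_mapping( void )
{
    unsigned long demo_group_size = 64;   // assumed GetActiveProcessorCount( 0 )
    unsigned long demo_thread     = 70;   // hypothetical miner thread id
    unsigned long demo_group      = demo_thread / demo_group_size;   // -> group 1
    unsigned long demo_slot       = demo_thread % demo_group_size;   // -> thread_affinity_map[6]
    (void)demo_group; (void)demo_slot;
}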
@@ -320,8 +320,7 @@ static void affine_to_cpu( struct thr_info *thr )
{
last_error = GetLastError();
if ( !thread )
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
last_error, thread );
applog( LOG_WARNING, "Set affinity returned error 0x%x", last_error );
}
}
@@ -992,19 +991,19 @@ void report_summary_log( bool force )
if ( rejected_share_count > 10 )
{
if ( rejected_share_count > ( submitted_share_count * .5 ) )
if ( rejected_share_count > ( submitted_share_count / 2 ) )
{
applog(LOG_ERR,"Excessive rejected share rate, exiting...");
exit(1);
}
else if ( rejected_share_count > ( submitted_share_count * .1 ) )
else if ( rejected_share_count > ( submitted_share_count / 10 ) )
applog(LOG_WARNING,"High rejected share rate, check settings.");
}
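A minimal sketch of the thresholds above; the helper name is hypothetical and not part of the source. With integer division, more than half of the submitted shares rejected is fatal and more than a tenth triggers a warning, evaluated only once more than ten shares have been rejected.
// Hypothetical helper mirroring the integer thresholds above.
static inline int reject_severity( int rejected, int submitted )
{
    if ( rejected <= 10 )             return 0;   // too few rejects to judge
    if ( rejected > submitted / 2 )   return 2;   // excessive: miner exits
    if ( rejected > submitted / 10 )  return 1;   // high: warning only
    return 0;
}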
gettimeofday( &now, NULL );
timeval_subtract( &et, &now, &five_min_start );
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
// Display CPU temperature and clock rate.
int curr_temp = cpu_temp(0);
@@ -1013,8 +1012,9 @@ void report_summary_log( bool force )
if ( !opt_quiet || ( curr_temp >= 80 ) )
{
int wait_time = curr_temp >= 90 ? 5 : curr_temp >= 80 ? 30 :
curr_temp >= 70 ? 60 : 120;
int wait_time = curr_temp >= 90 ? 5
: curr_temp >= 80 ? 30
: curr_temp >= 70 ? 60 : 120;
timeval_subtract( &diff, &now, &cpu_temp_time );
if ( ( diff.tv_sec > wait_time )
|| ( ( curr_temp > prev_temp ) && ( curr_temp >= 75 ) ) )
@@ -1591,13 +1591,13 @@ start:
last_targetdiff = net_diff;
applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
work->height, work->tx_count, net_diff,
bswap_32( work->data[ algo_gate.ntime_index ] ) );
}
else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
else if ( memcmp( work->data, g_work.data, algo_gate.work_cmp_size ) )
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
work->height, work->tx_count, net_diff,
bswap_32( work->data[ algo_gate.ntime_index ] ) );
else
new_work = false;
@@ -1912,6 +1912,8 @@ static bool wanna_mine(int thr_id)
{
bool state = true;
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32) || defined(__APPLE__))
if (opt_max_temp > 0.0)
{
float temp = cpu_temp(0);
@@ -1921,8 +1923,12 @@ static bool wanna_mine(int thr_id)
applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
state = false;
}
if ( temp > hi_temp ) hi_temp = temp;
}
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
#endif
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
{
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
applog(LOG_NOTICE, "network diff too high, waiting...");
@@ -2133,7 +2139,7 @@ static void *miner_thread( void *userdata )
// uint32_t end_nonce = opt_benchmark
// ? ( 0xffffffffU / opt_n_threads ) * (thr_id + 1) - 0x20
// : 0;
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - opt_n_threads;
memset( &work, 0, sizeof(work) );
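A short sketch of how the revised end_nonce expression partitions the 32-bit nonce space per thread; the helper is illustrative and the per-thread start value is an assumption not shown in this hunk.
// Illustrative only: each thread gets an equal slice of the nonce space,
// ending a small guard margin (opt_n_threads) below the next thread's slice.
static inline uint32_t demo_end_nonce( uint32_t thr_id, uint32_t n_threads )
{
    uint32_t slice = 0xffffffffU / n_threads;
    return slice * ( thr_id + 1 ) - n_threads;
}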
@@ -2200,58 +2206,58 @@ static void *miner_thread( void *userdata )
// int64_t max64 = 1000;
int nonce_found = 0;
// if ( likely( algo_gate.do_this_thread( thr_id ) ) )
// {
if ( have_stratum )
if ( have_stratum )
{
while ( unlikely( stratum_down ) )
sleep( 1 );
if ( unlikely( ( *nonceptr >= end_nonce )
&& !work_restart[thr_id].restart ) )
{
while ( unlikely( stratum_down ) )
sleep( 1 );
if ( unlikely( ( *nonceptr >= end_nonce )
&& !work_restart[thr_id].restart ) )
if ( opt_extranonce )
stratum_gen_work( &stratum, &g_work );
else
{
if ( opt_extranonce )
stratum_gen_work( &stratum, &g_work );
else
if ( !thr_id )
{
if ( !thr_id )
{
applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" );
applog( LOG_WARNING, "waiting for new work...");
}
while ( !work_restart[thr_id].restart )
sleep ( 1 );
applog( LOG_WARNING, "Nonce range exhausted, extranonce not subscribed." );
applog( LOG_WARNING, "Waiting for new work...");
}
while ( !work_restart[thr_id].restart )
sleep ( 1 );
}
}
else if ( !opt_benchmark ) // GBT or getwork
}
else if ( !opt_benchmark ) // GBT or getwork
{
// max64 is used to set end_nonce to match the scantime.
// It also factors the nonce range to end the scan when nonces are
// exhausted. In either case needing new work can be assumed.
// Only problem is every thread will call get_work.
// First thread resets scantime blocking all subsequent threads
// from fetching new work.
pthread_rwlock_wrlock( &g_work_lock );
const time_t now = time(NULL);
if ( ( ( now - g_work_time ) >= opt_scantime )
|| ( *nonceptr >= end_nonce ) )
{
pthread_rwlock_wrlock( &g_work_lock );
if ( ( ( time(NULL) - g_work_time ) >= opt_scantime )
|| ( *nonceptr >= end_nonce ) )
if ( unlikely( !get_work( mythr, &g_work ) ) )
{
if ( unlikely( !get_work( mythr, &g_work ) ) )
{
pthread_rwlock_unlock( &g_work_lock );
applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
goto out;
}
g_work_time = time(NULL);
// restart_threads();
}
pthread_rwlock_unlock( &g_work_lock );
pthread_rwlock_unlock( &g_work_lock );
applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
goto out;
}
g_work_time = now;
}
pthread_rwlock_rdlock( &g_work_lock );
algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce );
work_restart[thr_id].restart = 0;
pthread_rwlock_unlock( &g_work_lock );
}
// } // do_this_thread
// algo_gate.resync_threads( thr_id, &work );
pthread_rwlock_rdlock( &g_work_lock );
algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce );
work_restart[thr_id].restart = 0;
pthread_rwlock_unlock( &g_work_lock );
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
@@ -2309,12 +2315,6 @@ static void *miner_thread( void *userdata )
gettimeofday( (struct timeval *) &tv_start, NULL );
// Scan for nonce
// nonce_found = scanhash_sha256dt_ref( &work, max_nonce, &hashes_done,
// mythr );
// nonce_found = scanhash_sha256dt_4x32( &work, max_nonce, &hashes_done,
// mythr );
nonce_found = algo_gate.scanhash( &work, max_nonce, &hashes_done,
mythr );
@@ -2336,8 +2336,8 @@ static void *miner_thread( void *userdata )
// If unsubmiited nonce(s) found, submit now.
if ( unlikely( nonce_found && !opt_benchmark ) )
{
// applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
// algo_names[ opt_algo ] );
applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
algo_names[ opt_algo ] );
if ( !submit_work( mythr, &work ) )
{
applog( LOG_WARNING, "Failed to submit share." );
@@ -2400,7 +2400,7 @@ static void *miner_thread( void *userdata )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32) || defined(__APPLE__))
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
#else
float lo_freq = 0., hi_freq = 0.;
@@ -2828,9 +2828,9 @@ out:
static void show_credits()
{
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" ********** \n");
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
printf(" with AVX512, SHA and VAES extensions by JayDDee.\n");
printf(" with AVX512, SHA, AES and NEON extensions by JayDDee.\n");
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
}
@@ -2840,40 +2840,46 @@ static void show_credits()
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
bool cpu_has_aarch64 = cpu_arch_aarch64();
bool cpu_has_x86_64 = cpu_arch_x86_64();
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes_ni(); // x86_64 or AArch64 AES
bool cpu_has_vaes = has_vaes();
bool cpu_has_sha256 = has_sha(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64
bool sw_has_neon = false; // AArch64
// bool sw_has_sve = false; // AArch64
// bool sw_has_sve2 = false; // AArch64
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_neon = has_neon(); // AArch64
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
bool cpu_has_sve2 = has_sve2(); // AArch64 only
bool cpu_has_sme = has_sme();
bool cpu_has_sme2 = has_sme2();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
bool cpu_has_vaes = has_vaes(); // X86_64 only
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64 version
bool sw_has_neon = false; // AArch64
bool sw_has_sve = false; // AArch64
bool sw_has_sve2 = false; // AArch64
bool sw_has_sme = false;
bool sw_has_sme2 = false;
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_avx10_256 = false;
bool sw_has_avx10_512 = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_sha256 = false; // x86_64 or AArch64 SHA2
bool sw_has_sha512 = false; // x86_64 or AArch64 SHA3
set_t algo_features = algo_gate.optimizations;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
@@ -2881,7 +2887,7 @@ static bool cpu_capability( bool display_only )
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
bool use_sse2;
@@ -2895,8 +2901,7 @@ static bool cpu_capability( bool display_only )
bool use_sha512;
bool use_neon;
bool use_none;
// x86_64
*/
#if defined(__x86_64__)
sw_has_x86_64 = true;
#elif defined(__aarch64__)
@@ -2908,6 +2913,8 @@ static bool cpu_capability( bool display_only )
sw_arm_arch = __ARM_ARCH;
#endif
#endif
// x86_64 only
#if defined(__SSE2__)
sw_has_sse2 = true;
#endif
@@ -2935,9 +2942,10 @@ static bool cpu_capability( bool display_only )
#if defined(__AVX10_1_512__)
sw_has_avx10_512 = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
sw_has_aes = true;
sw_has_aes = true;
#endif
#ifdef __VAES__
sw_has_vaes = true;
@@ -2945,95 +2953,117 @@ static bool cpu_capability( bool display_only )
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
sw_has_sha256 = true;
#endif
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA3)
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA512)
sw_has_sha512 = true;
#endif
// AArch64 only
#if defined(__ARM_NEON)
sw_has_neon = true;
#endif
// #if defined(__ARM_FEATURE_SVE)
// sw_has_sve = true;
// #endif
// #if defined(__ARM_FEATURE_SVE2)
// sw_has_sve2 = true;
// #endif
#if defined(__ARM_FEATURE_SVE)
sw_has_sve = true;
#endif
#if defined(__ARM_FEATURE_SVE2)
sw_has_sve2 = true;
#endif
#if defined(__ARM_FEATURE_SME)
sw_has_sme = true;
#endif
#if defined(__ARM_FEATURE_SME2)
sw_has_sme2 = true;
#endif
// CPU
cpu_brand_string( cpu_brand );
printf( "CPU: %s\n", cpu_brand );
printf("SW built on " __DATE__
#ifdef _MSC_VER
" with VC++ 2013\n");
// Build
printf( "SW built on " __DATE__
#if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
__clang_patchlevel__ );
#elif defined(__GNUC__)
" with GCC-");
printf("%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#else
printf("\n");
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif
// OS
#if defined(__linux)
printf(" Linux\n");
#elif defined(WIN32)
printf(" Windows\n");
printf(" Windows");
#if defined(__MINGW64__)
printf(" MinGW-w64\n");
#else
printf("\n");
#endif
#elif defined(__APPLE__)
printf(" MacOS\n");
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" Unix\n");
#elif defined(__bsd__) || defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" BSD/Unix\n");
#else
printf("\n");
#endif
printf("CPU features: ");
if ( cpu_has_x86_64 )
if ( cpu_arch_x86_64() )
{
printf( " x86_64" );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(),
avx10_vector_length() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
}
else if ( cpu_has_aarch64 ) printf( " AArch64 NEON" ); // NEON assumed
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
avx10_version(), avx10_vector_length() );
else if ( cpu_arch_aarch64() )
{
if ( cpu_has_neon ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
printf("\nSW features: ");
if ( sw_has_x86_64 )
{
printf( " x86_64" );
if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_avx10_512 ) printf( " AVX10-512" );
else if ( sw_has_avx10_256 ) printf( " AVX10-256" );
else if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
}
else if ( sw_has_aarch64 )
{
printf( " AArch64" );
if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch );
if ( sw_has_neon ) printf( " NEON" );
// if ( sw_has_sve ) printf( " SVE" );
// else if ( sw_has_sve2 ) printf( " SVE2" );
if ( sw_has_neon ) printf( " NEON" );
if ( sw_has_sve2 ) printf( " SVE2" );
else if ( sw_has_sve ) printf( " SVE" );
if ( sw_has_sme2 ) printf( " SME2" );
else if ( sw_has_sme ) printf( " SME" );
}
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );
printf("\n");
/*
if ( !display_only )
{
printf("\nAlgo features: ");
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
@@ -3041,7 +3071,7 @@ static bool cpu_capability( bool display_only )
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_neon ) printf( " NEON " );
if ( algo_has_neon ) printf( " NEON" );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha512 ) printf( " SHA512" );
@@ -3050,37 +3080,9 @@ static bool cpu_capability( bool display_only )
}
printf("\n");
if ( display_only ) return true;
/*
// Check for CPU and build incompatibilities
if ( !cpu_has_sse2 && !cpu_has_aarch64 )
{
printf( "A CPU with SSE2 is required to use cpuminer-opt\n" );
return false;
}
if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
{
printf( "The SW build requires a CPU with AES and AVX2!\n" );
return false;
}
if ( sw_has_sse42 && !cpu_has_sse42 )
{
printf( "The SW build requires a CPU with SSE4.2!\n" );
return false;
}
if ( sw_has_aes && !cpu_has_aes )
{
printf( "The SW build requires a CPU with AES!\n" );
return false;
}
if ( sw_has_sha && !cpu_has_sha )
{
printf( "The SW build requires a CPU with SHA!\n" );
return false;
}
*/
// Determine mining options
use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
@@ -3096,13 +3098,10 @@ static bool cpu_capability( bool display_only )
|| use_avx2 || use_sha256 || use_vaes || use_sha512 || use_neon );
// Display best options
applog_nl( "Enabled optimizations:" );
if ( use_none ) printf( " none" );
else
if ( !use_none )
{
// if ( cpu_has_aarch64 ) printf( " AArch64");
// else
// printf( " x86_64" );
applog_nl( "Enabled optimizations:" );
if ( use_neon ) printf( " NEON" );
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
@@ -3112,15 +3111,14 @@ static bool cpu_capability( bool display_only )
else if ( use_aes ) printf( " AES" );
if ( use_sha512 ) printf( " SHA512" );
else if ( use_sha256 ) printf( " SHA256" );
if ( use_neon ) printf( " NEON" );
printf( "\n" );
}
printf( "\n" );
*/
return true;
}
/*
void show_version_and_exit(void)
{
printf("\n built on " __DATE__
@@ -3130,7 +3128,6 @@ void show_version_and_exit(void)
" with GCC");
printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#endif
printf(" features:"
#if defined(USE_ASM) && defined(__i386__)
" i386"
@@ -3178,12 +3175,11 @@ void show_version_and_exit(void)
printf("\n");
exit(0);
}
*/
void show_usage_and_exit(int status)
{
if (status)
fprintf(stderr, "Try `--help' for more information.\n");
// fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n");
else
printf(usage);
exit(status);
@@ -3199,7 +3195,6 @@ void parse_arg(int key, char *arg )
{
char *p;
int v, i;
// uint64_t ul;
double d;
switch( key )
@@ -3343,7 +3338,8 @@ void parse_arg(int key, char *arg )
free(rpc_user);
rpc_user = strdup(arg);
break;
case 'o': // url
case 'o': // url
{
char *ap, *hp;
ap = strstr( arg, "://" );
@@ -3408,7 +3404,8 @@ void parse_arg(int key, char *arg )
have_stratum = !opt_benchmark && !strncasecmp( rpc_url, "stratum", 7 );
break;
}
case 'O': // userpass
case 'O': // userpass
p = strchr(arg, ':');
if (!p)
{
@@ -3568,10 +3565,10 @@ void parse_arg(int key, char *arg )
case 1029: // stratum-keepalive
opt_stratum_keepalive = true;
break;
case 'V':
case 'V': // version
display_cpu_capability();
exit(0);
case 'h':
case 'h': // help
show_usage_and_exit(0);
default:
@@ -3710,9 +3707,6 @@ int main(int argc, char *argv[])
{
int cpus = GetActiveProcessorCount( i );
num_cpus += cpus;
// if (opt_debug)
// applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
}
#else
@@ -3883,12 +3877,23 @@ int main(int argc, char *argv[])
}
#endif
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#if defined(WIN32)
#if defined(_WIN32_WINNT)
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT = 0x%04x", _WIN32_WINNT );
#else
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT undefined." );
#endif
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif
#endif
conditional_state = malloc( opt_n_threads * ((sizeof(bool)) ) );
memset( conditional_state, 0, opt_n_threads * ((sizeof(bool)) ) );
@@ -3909,7 +3914,7 @@ int main(int argc, char *argv[])
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
applog( LOG_WARNING, "More miner threads (%d) than active CPUs in affinity mask (%d)", opt_n_threads, active_cpus );
if ( !opt_quiet )
{
char affinity_mask[64];

40
miner.h
View File

@@ -3,10 +3,7 @@
#include <cpuminer-config.h>
#if !( defined(__SSE2__) || ( defined(__aarch64__) && defined(__ARM_NEON) ) )
#warning "Unknown or unsupported CPU, requires x86_64 with SSE2 or AArch64 with NEON."
#endif
// CPU architecture
#if defined(__x86_64__)
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__)
@@ -17,14 +14,15 @@
#define USER_AGENT_ARCH
#endif
// Operating system
// __APPLE__ includes MacOS & IOS, no MacOS only macros found.
#if defined(__linux)
#define USER_AGENT_OS "L" // GNU Linux
#elif defined(WIN32)
#define USER_AGENT_OS "W" // MS Windows
#elif defined(__APPLE__)
#define USER_AGENT_OS "M" // Apple MacOS
// is there a generic BSD macro?
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#elif defined(__bsd__) || defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#define USER_AGENT_OS "U" // BSD unix
#else
#define USER_AGENT_OS
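A hypothetical illustration of combining the tags above; the actual user agent composition is not shown in this hunk and may differ.
// Hypothetical only: e.g. an x86_64 Linux build would give "x64" "L" -> "x64L".
// The trailing "" keeps the declaration valid when both macros expand to nothing.
static const char demo_user_agent_tag[] = USER_AGENT_ARCH USER_AGENT_OS "";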
@@ -191,7 +189,7 @@ static inline uint32_t swab32(uint32_t x)
return __builtin_bswap32(x);
#else
return ( ( (x) << 24 ) & 0xff000000u ) | ( ( (x) << 8 ) & 0x00ff0000u )
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu )
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu );
// return bswap_32(v);
@@ -291,26 +289,6 @@ static inline void le16enc(void *pp, uint16_t x)
json_t* json_load_url(char* cfg_url, json_error_t *err);
//void sha256_init(uint32_t *state);
//void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
//void sha256d(unsigned char *hash, const unsigned char *data, int len);
#ifdef USE_ASM
#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__)
#define HAVE_SHA256_4WAY 1
int sha256_use_4way();
void sha256_init_4way(uint32_t *state);
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
#endif
//#if defined(__x86_64__) && defined(USE_AVX2)
#if defined(__x86_64__) && defined(__AVX2__)
#define HAVE_SHA256_8WAY 1
int sha256_use_8way();
void sha256_init_8way(uint32_t *state);
void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
#endif
#endif
struct work;
void work_free(struct work *w);
@@ -644,7 +622,6 @@ enum algos {
ALGO_SHA256T,
ALGO_SHA3D,
ALGO_SHA512256D,
ALGO_SHAVITE3,
ALGO_SKEIN,
ALGO_SKEIN2,
ALGO_SKUNK,
@@ -740,7 +717,6 @@ static const char* const algo_names[] = {
"sha256t",
"sha3d",
"sha512256d",
"shavite3",
"skein",
"skein2",
"skunk",
@@ -855,10 +831,9 @@ Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
allium Garlicoin (GRLC)\n\
anime Animecoin (ANI)\n\
argon2 Argon2 Coin (AR2)\n\
argon2d250\n\
argon2d500 argon2d-dyn, Dynamic (DYN)\n\
argon2d4096 argon2d-uis, Unitus (UIS)\n\
argon2d500\n\
argon2d4096\n\
axiom Shabal-256 MemoHash\n\
blake blake256r14 (SFR)\n\
blake2b Blake2b 256\n\
@@ -904,7 +879,6 @@ Options:\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
sha3d Double Keccak256 (BSHA3)\n\
sha512256d Double SHA-512 (Radiant)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\

View File

@@ -29,7 +29,6 @@
// is no significant 64 bit vectorization therefore SSE2 is the practical
// minimum for using this code.
//
// MMX: 64 bit vectors (Not used in cpuminer-opt)
// SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2).
// AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (Starting with SkylakeX)
@@ -141,28 +140,56 @@
#include <stdint.h>
#include <stddef.h>
// SIMD512: Use 512, 256 & 128 bit vectors, excludes AVX512VBMI
// VL256: Include AVX512VL instructions on 256 & 128 bit vectors
// VBMI: Include AVX512VBMI instructions on all vectors.
// AVX512 macros are not a reliable indicator of 512 bit vector capability
// because they get defined with AVX10_1_256 which doesn't support 512 bit.
// EVEX512 is also unreliable as it can also be defined when 512b is not
// available.
// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present.
// Use AVX512 macros only without AVX10.
// AVX10 can exist without support for 512 bit vectors.
#if defined(__AVX10_1_512__)
#define SIMD512 1
#elif !defined(__AVX10_1__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SIMD512 1
/*
// Test for macros
#ifdef __AVX10_1__
#warning "__AVX10_1__"
#endif
#ifdef __AVX10_1_256__
#warning "__AVX10_1_256__"
#endif
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__"
#endif
#ifdef __EVEX256__
#warning "__EVEX256__"
#endif
#ifdef __EVEX512__
#warning "__EVEX512__"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#warning "AVX512"
#endif
*/
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and
// must be tested separately.
// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
// VBMI: Include AVX512VBMI instructions for supported vector lengths.
// AVX512VL instructions applied to 256 & 128 bit vectors is supported with
// either AVX512VL or any version of AVX10.
#if defined(__AVX10_1__)
#define VL256 1
#elif defined(__AVX512VL__)
#define VL256 1
#endif
// VBMI does not exist on early versions of AVX512
#if defined(__AVX10_1__) || defined(__AVX512VBMI__)
#define VL256 1
#define VBMI 1
#if defined(__AVX10_1_512__)
#define SIMD512 1
#endif
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define VL256 1
#define SIMD512 1
#if defined(__AVX512VBMI__)
#define VBMI 1
#endif
#endif
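A minimal usage sketch of the flags defined above, assuming these are the only gates involved; real algorithm files select their own code paths.
// Illustrative dispatch only.
#if defined(SIMD512)
   // 512, 256 & 128 bit vector code may all be compiled.
#elif defined(VL256)
   // AVX512VL / AVX10 enhanced 256 & 128 bit code only.
#else
   // Baseline 128 / 256 bit code without AVX512VL extensions.
#endif
#if defined(VBMI)
   // VBMI byte-permute variants may additionally be selected.
#endif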
/*
@@ -189,9 +216,6 @@
#include "simd-utils/simd-int.h"
// x86_64 MMX 64 bit vectors
#include "simd-utils/simd-64.h"
// x86_64 SSE2 128 bit vectors
#include "simd-utils/simd-128.h"
@@ -201,10 +225,6 @@
// x86_64 AVX512 512 bit vectors
#include "simd-utils/simd-512.h"
// move up after cleaning
// CPU architecture abstraction
//#include "simd-utils/simd-portable.h"
// aarch64 neon 128 bit vectors
#include "simd-utils/simd-neon.h"

View File

@@ -86,7 +86,7 @@ static inline void extr_lane_2x32( void *dst, const void *src,
// 4x32
#if ( defined(__x86_64__) && defined(__SSE2__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) )
#if defined(__x86_64__) && defined(__SSE2__)
#define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
{ \
@@ -174,6 +174,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0,
STOR_DEST_4x32( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 );
}
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
@@ -235,6 +236,190 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, const int bit_len )
{
uint32x4x4_t s;
s.val[0] = casti_v128u32( src0, 0 );
s.val[1] = casti_v128u32( src1, 0 );
s.val[2] = casti_v128u32( src2, 0 );
s.val[3] = casti_v128u32( src3, 0 );
vst4q_u32( dst, s );
s.val[0] = casti_v128u32( src0, 1 );
s.val[1] = casti_v128u32( src1, 1 );
s.val[2] = casti_v128u32( src2, 1 );
s.val[3] = casti_v128u32( src3, 1 );
vst4q_u32( dst + 64, s );
if ( bit_len <= 256 ) return;
s.val[0] = casti_v128u32( src0, 2 );
s.val[1] = casti_v128u32( src1, 2 );
s.val[2] = casti_v128u32( src2, 2 );
s.val[3] = casti_v128u32( src3, 2 );
vst4q_u32( dst + 128, s );
s.val[0] = casti_v128u32( src0, 3 );
s.val[1] = casti_v128u32( src1, 3 );
s.val[2] = casti_v128u32( src2, 3 );
s.val[3] = casti_v128u32( src3, 3 );
vst4q_u32( dst + 192, s );
if ( bit_len <= 512 ) return;
s.val[0] = casti_v128u32( src0, 4 );
s.val[1] = casti_v128u32( src1, 4 );
s.val[2] = casti_v128u32( src2, 4 );
s.val[3] = casti_v128u32( src3, 4 );
vst4q_u32( dst + 256, s );
if ( bit_len <= 640 ) return;
s.val[0] = casti_v128u32( src0, 5 );
s.val[1] = casti_v128u32( src1, 5 );
s.val[2] = casti_v128u32( src2, 5 );
s.val[3] = casti_v128u32( src3, 5 );
vst4q_u32( dst + 320, s );
s.val[0] = casti_v128u32( src0, 6 );
s.val[1] = casti_v128u32( src1, 6 );
s.val[2] = casti_v128u32( src2, 6 );
s.val[3] = casti_v128u32( src3, 6 );
vst4q_u32( dst + 384, s );
s.val[0] = casti_v128u32( src0, 7 );
s.val[1] = casti_v128u32( src1, 7 );
s.val[2] = casti_v128u32( src2, 7 );
s.val[3] = casti_v128u32( src3, 7 );
vst4q_u32( dst + 448, s );
// if ( bit_len <= 1024 ) return;
}
static inline void intrlv_4x32_512( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3 )
{
uint32x4x4_t s;
s.val[0] = casti_v128u32( src0, 0 );
s.val[1] = casti_v128u32( src1, 0 );
s.val[2] = casti_v128u32( src2, 0 );
s.val[3] = casti_v128u32( src3, 0 );
vst4q_u32( dst, s );
s.val[0] = casti_v128u32( src0, 1 );
s.val[1] = casti_v128u32( src1, 1 );
s.val[2] = casti_v128u32( src2, 1 );
s.val[3] = casti_v128u32( src3, 1 );
vst4q_u32( dst + 64, s );
s.val[0] = casti_v128u32( src0, 2 );
s.val[1] = casti_v128u32( src1, 2 );
s.val[2] = casti_v128u32( src2, 2 );
s.val[3] = casti_v128u32( src3, 2 );
vst4q_u32( dst + 128, s );
s.val[0] = casti_v128u32( src0, 3 );
s.val[1] = casti_v128u32( src1, 3 );
s.val[2] = casti_v128u32( src2, 3 );
s.val[3] = casti_v128u32( src3, 3 );
vst4q_u32( dst + 192, s );
}
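For reference, a scalar sketch (not part of the source) of the memory layout the vst4q_u32 stores above produce: word i of source k lands at dst[4*i + k].
// Scalar equivalent of the 4x32 interleave, illustration only.
static inline void demo_intrlv_4x32_scalar( uint32_t *d, const uint32_t *s0,
        const uint32_t *s1, const uint32_t *s2, const uint32_t *s3, int words )
{
    for ( int i = 0; i < words; i++ )
    {
        d[ 4*i + 0 ] = s0[i];
        d[ 4*i + 1 ] = s1[i];
        d[ 4*i + 2 ] = s2[i];
        d[ 4*i + 3 ] = s3[i];
    }
}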
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
uint32x4x4_t s = vld4q_u32( src );
casti_v128( dst0, 0 ) = s.val[0];
casti_v128( dst1, 0 ) = s.val[1];
casti_v128( dst2, 0 ) = s.val[2];
casti_v128( dst3, 0 ) = s.val[3];
s = vld4q_u32( src + 64 );
casti_v128( dst0, 1 ) = s.val[0];
casti_v128( dst1, 1 ) = s.val[1];
casti_v128( dst2, 1 ) = s.val[2];
casti_v128( dst3, 1 ) = s.val[3];
if ( bit_len <= 256 ) return;
s = vld4q_u32( src + 128 );
casti_v128( dst0, 2 ) = s.val[0];
casti_v128( dst1, 2 ) = s.val[1];
casti_v128( dst2, 2 ) = s.val[2];
casti_v128( dst3, 2 ) = s.val[3];
s = vld4q_u32( src + 192 );
casti_v128( dst0, 3 ) = s.val[0];
casti_v128( dst1, 3 ) = s.val[1];
casti_v128( dst2, 3 ) = s.val[2];
casti_v128( dst3, 3 ) = s.val[3];
if ( bit_len <= 512 ) return;
s = vld4q_u32( src + 256 );
casti_v128( dst0, 4 ) = s.val[0];
casti_v128( dst1, 4 ) = s.val[1];
casti_v128( dst2, 4 ) = s.val[2];
casti_v128( dst3, 4 ) = s.val[3];
if ( bit_len <= 640 ) return;
s = vld4q_u32( src + 320 );
casti_v128( dst0, 5 ) = s.val[0];
casti_v128( dst1, 5 ) = s.val[1];
casti_v128( dst2, 5 ) = s.val[2];
casti_v128( dst3, 5 ) = s.val[3];
s = vld4q_u32( src + 384 );
casti_v128( dst0, 6 ) = s.val[0];
casti_v128( dst1, 6 ) = s.val[1];
casti_v128( dst2, 6 ) = s.val[2];
casti_v128( dst3, 6 ) = s.val[3];
s = vld4q_u32( src + 448 );
casti_v128( dst0, 7 ) = s.val[0];
casti_v128( dst1, 7 ) = s.val[1];
casti_v128( dst2, 7 ) = s.val[2];
casti_v128( dst3, 7 ) = s.val[3];
// if ( bit_len <= 1024 ) return;
}
static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src )
{
uint32x4x4_t s = vld4q_u32( src );
casti_v128( dst0, 0 ) = s.val[0];
casti_v128( dst1, 0 ) = s.val[1];
casti_v128( dst2, 0 ) = s.val[2];
casti_v128( dst3, 0 ) = s.val[3];
s = vld4q_u32( src + 64 );
casti_v128( dst0, 1 ) = s.val[0];
casti_v128( dst1, 1 ) = s.val[1];
casti_v128( dst2, 1 ) = s.val[2];
casti_v128( dst3, 1 ) = s.val[3];
s = vld4q_u32( src + 128 );
casti_v128( dst0, 2 ) = s.val[0];
casti_v128( dst1, 2 ) = s.val[1];
casti_v128( dst2, 2 ) = s.val[2];
casti_v128( dst3, 2 ) = s.val[3];
s = vld4q_u32( src + 192 );
casti_v128( dst0, 3 ) = s.val[0];
casti_v128( dst1, 3 ) = s.val[1];
casti_v128( dst2, 3 ) = s.val[2];
casti_v128( dst3, 3 ) = s.val[3];
}
#else // !SSE2 && !NEON
static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
@@ -456,15 +641,13 @@ static inline void v128_bswap32_80( void *d, void *s )
#endif
#if defined(__SSE2__)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
v128u32_t s0 = casti_v128u32( src,0 );
v128u32_t s1 = casti_v128u32( src,1 );
v128u32_t s2 = casti_v128u32( src,2 );
v128u32_t s3 = casti_v128u32( src,3 );
v128u32_t s4 = casti_v128u32( src,4 );
#if defined(__SSSE3__)
@@ -487,79 +670,34 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
#endif
casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
casti_v128u32( d, 3 ) = v128_duplane32( s0, 3 );
casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_v128u32( d, 4 ) = v128_duplane32( s1, 0 );
casti_v128u32( d, 5 ) = v128_duplane32( s1, 1 );
casti_v128u32( d, 6 ) = v128_duplane32( s1, 2 );
casti_v128u32( d, 7 ) = v128_duplane32( s1, 3 );
casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_v128u32( d, 8 ) = v128_duplane32( s2, 0 );
casti_v128u32( d, 9 ) = v128_duplane32( s2, 1 );
casti_v128u32( d,10 ) = v128_duplane32( s2, 2 );
casti_v128u32( d,11 ) = v128_duplane32( s2, 3 );
casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_v128u32( d,12 ) = v128_duplane32( s3, 0 );
casti_v128u32( d,13 ) = v128_duplane32( s3, 1 );
casti_v128u32( d,14 ) = v128_duplane32( s3, 2 );
casti_v128u32( d,15 ) = v128_duplane32( s3, 3 );
casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
casti_v128u32( d,16 ) = v128_duplane32( s4, 0 );
casti_v128u32( d,17 ) = v128_duplane32( s4, 1 );
casti_v128u32( d,18 ) = v128_duplane32( s4, 2 );
casti_v128u32( d,19 ) = v128_duplane32( s4, 3 );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 );
casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 );
casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 );
casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 );
casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 );
casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 );
casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 );
casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 );
casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );
casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );
casti_v128( d,16 ) = vdupq_laneq_u32( s2, 0 );
casti_v128( d,17 ) = vdupq_laneq_u32( s2, 1 );
casti_v128( d,18 ) = vdupq_laneq_u32( s2, 2 );
casti_v128( d,19 ) = vdupq_laneq_u32( s2, 3 );
}
#endif
// 8x32
#if defined(__AVX2__)
#define ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, \
@@ -1544,7 +1682,9 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
//
// 64 bit data
// 2x64 SSE2, NEON
// 2x64
#if defined(__x86_64__) && defined(__SSE2__)
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
@@ -1602,7 +1742,101 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
d1[7] = v128_unpackhi64( s[14], s[15] );
}
/*
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
uint64x2x2_t s;
s.val[0] = casti_v128u64( src0, 0 );
s.val[1] = casti_v128u64( src1, 0 );
vst2q_u64( dst, s );
s.val[0] = casti_v128u64( src0, 1 );
s.val[1] = casti_v128u64( src1, 1 );
vst2q_u64( dst + 32, s );
if ( bit_len <= 256 ) return;
s.val[0] = casti_v128u64( src0, 2 );
s.val[1] = casti_v128u64( src1, 2 );
vst2q_u64( dst + 64, s );
s.val[0] = casti_v128u64( src0, 3 );
s.val[1] = casti_v128u64( src1, 3 );
vst2q_u64( dst + 96, s );
if ( bit_len <= 512 ) return;
s.val[0] = casti_v128u64( src0, 4 );
s.val[1] = casti_v128u64( src1, 4 );
vst2q_u64( dst + 128, s );
if ( bit_len <= 640 ) return;
s.val[0] = casti_v128u64( src0, 5 );
s.val[1] = casti_v128u64( src1, 5 );
vst2q_u64( dst + 160, s );
s.val[0] = casti_v128u64( src0, 6 );
s.val[1] = casti_v128u64( src1, 6 );
vst2q_u64( dst + 192, s );
s.val[0] = casti_v128u64( src0, 7 );
s.val[1] = casti_v128u64( src1, 7 );
vst2q_u64( dst + 224, s );
// if ( bit_len <= 1024 ) return;
}
static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
uint64x2x2_t s = vld2q_u64( src );
casti_v128u64( dst0, 0 ) = s.val[0];
casti_v128u64( dst1, 0 ) = s.val[1];
s = vld2q_u64( src + 32 );
casti_v128u64( dst0, 1 ) = s.val[0];
casti_v128u64( dst1, 1 ) = s.val[1];
if ( bit_len <= 256 ) return;
s = vld2q_u64( src + 64 );
casti_v128u64( dst0, 2 ) = s.val[0];
casti_v128u64( dst1, 2 ) = s.val[1];
s = vld2q_u64( src + 96 );
casti_v128u64( dst0, 3 ) = s.val[0];
casti_v128u64( dst1, 3 ) = s.val[1];
if ( bit_len <= 512 ) return;
s = vld2q_u64( src + 128 );
casti_v128u64( dst0, 4 ) = s.val[0];
casti_v128u64( dst1, 4 ) = s.val[1];
if ( bit_len <= 640 ) return;
s = vld2q_u64( src + 160 );
casti_v128u64( dst0, 5 ) = s.val[0];
casti_v128u64( dst1, 5 ) = s.val[1];
s = vld2q_u64( src + 192 );
casti_v128u64( dst0, 6 ) = s.val[0];
casti_v128u64( dst1, 6 ) = s.val[1];
s = vld2q_u64( src + 224 );
casti_v128u64( dst0, 7 ) = s.val[0];
casti_v128u64( dst1, 7 ) = s.val[1];
// if ( bit_len <= 1024 ) return;
}
#else
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -1621,8 +1855,7 @@ static inline void intrlv_2x64( void *dst, const void *src0,
d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
}
*/
/*
static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
@@ -1642,15 +1875,16 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
}
*/
#endif
static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
v128u64_t s0 = casti_v128u64( src,0 );
v128u64_t s1 = casti_v128u64( src,1 );
v128u64_t s2 = casti_v128u64( src,2 );
v128u64_t s3 = casti_v128u64( src,3 );
v128u64_t s4 = casti_v128u64( src,4 );
#if defined(__SSSE3__)
@@ -1673,41 +1907,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
#endif
#if defined(__SSE2__)
casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );
casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_v128u64( d,2 ) = v128_duplane64( s1, 0 );
casti_v128u64( d,3 ) = v128_duplane64( s1, 1 );
casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_v128u64( d,4 ) = v128_duplane64( s2, 0 );
casti_v128u64( d,5 ) = v128_duplane64( s2, 1 );
casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_v128u64( d,6 ) = v128_duplane64( s3, 0 );
casti_v128u64( d,7 ) = v128_duplane64( s3, 1 );
casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
#elif defined(__ARM_NEON)
casti_v128u64( d,0 ) = vdupq_laneq_u64( (uint64x2_t)s0, 0 );
casti_v128u64( d,1 ) = vdupq_laneq_u64( (uint64x2_t)s0, 1 );
casti_v128u64( d,2 ) = vdupq_laneq_u64( (uint64x2_t)s1, 0 );
casti_v128u64( d,3 ) = vdupq_laneq_u64( (uint64x2_t)s1, 1 );
casti_v128u64( d,4 ) = vdupq_laneq_u64( (uint64x2_t)s2, 0 );
casti_v128u64( d,5 ) = vdupq_laneq_u64( (uint64x2_t)s2, 1 );
casti_v128u64( d,6 ) = vdupq_laneq_u64( (uint64x2_t)s3, 0 );
casti_v128u64( d,7 ) = vdupq_laneq_u64( (uint64x2_t)s3, 1 );
casti_v128u64( d,8 ) = vdupq_laneq_u64( (uint64x2_t)s4, 0 );
casti_v128u64( d,9 ) = vdupq_laneq_u64( (uint64x2_t)s4, 1 );
#endif
casti_v128u64( d,8 ) = v128_duplane64( s4, 0 );
casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
}
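// Note (assumption, not from the source): the 80 byte input to the function
// above is a serialized block header; the five 128-bit source vectors are
// byte swapped and each 64-bit half is broadcast to both lanes so that both
// interleaved lanes start from the same header data.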
static inline void extr_lane_2x64( void *dst, const void *src,
@@ -2436,7 +2649,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );

View File

@@ -32,13 +32,20 @@
// Intrinsics automatically promote from REX to VEX when AVX is available
// but ASM needs to be done manually.
//
// APX supports EGPR, which adds 16 more GPRs and 3-operand instructions.
// This may affect inline ASM that includes instructions superseded by APX
// versions and therefore incompatible with APX.
// As a result GCC-14 disables EGPR by default; it can be enabled with
// "-mapx-inline-asm-use-gpr32".
//TODO
// Some ASM functions may need to be updated to support EGPR with APX.
//
///////////////////////////////////////////////////////////////////////////////
// New architecturally agnostic syntax:
//
// __m128i -> v128_t
// _mm_ -> v128_
// mm128_ -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element size. They have no effect on x86_64.
@@ -145,10 +152,8 @@
typedef union
{
v128_t v128;
__m128i m128;
uint32_t u32[4];
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_ovly m128_ovly
} __attribute__ ((aligned (16))) v128_ovly;
// use for immediate constants, use load1 for mem.
#define v128_64 _mm_set1_epi64x
@@ -167,8 +172,13 @@ typedef union
// necessary the cvt, set, or set1 intrinsics can be used allowing the
// compiler to exploit new features to produce optimum code.
// Currently only used internally and by Luffa.
// It also has implications for APX EGPR feature.
static inline __m128i mm128_mov64_128( const uint64_t n )
#define v128_mov64 _mm_cvtsi64_si128
#define v128_mov32 _mm_cvtsi32_si128
/*
static inline __m128i v128_mov64( const uint64_t n )
{
__m128i a;
#if defined(__AVX__)
@@ -178,10 +188,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#endif
return a;
}
//#define v128_mov64( u64 ) mm128_mov64_128( u64 )
static inline __m128i mm128_mov32_128( const uint32_t n )
static inline __m128i v128_mov32( const uint32_t n )
{
__m128i a;
#if defined(__AVX__)
@@ -191,11 +199,14 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#endif
return a;
}
*/
// broadcast lane 0 to all lanes
#define v128_bcast64(v) _mm_shuffle_epi32( v, 0x44 )
#define v128_bcast32(v) _mm_shuffle_epi32( v, 0x00 )
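// Worked example of the immediates (not from the source): _mm_shuffle_epi32
// copies source dword (imm >> 2*i) & 3 into dest dword i, so 0x44 = 01 00 01 00
// selects dwords {0,1,0,1}, duplicating the low 64-bit lane, while 0x00
// selects dword 0 four times.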
// Not used, test first
/*
#if defined(__AVX2__)
#define v128_bcast16(v) _mm_broadcastw_epi16(v)
@@ -203,9 +214,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#else
#define v128_bcast16(v) \
v128_bcast32( v128_or( v128_sl32( v, 16 ), v ) )
_mm_shuffle_epi32( _mm_shufflelo_epi16( v, 0x00 ), 0x00 )
#endif
*/
// Broadcast lane l to all lanes
#define v128_duplane64( v, l ) \
@@ -221,28 +233,15 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
// Pseudo constants
#define v128_zero _mm_setzero_si128()
#if defined(__SSE4_1__)
// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0)
// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }
#endif
// Bitwise compare, return 1 if all bits are set.
#define v128_cmpeq1(v) _mm_test_all_ones(v)
#define v128_one mm128_mov64_128(1)
//#define v128_one v128_mov64(1)
#define v128_one _mm_cvtsi64_si128( 1 )
// ASM avoids the need to initialize the return variable, suppressing a
// compiler warning.
// The macro hides the function parentheses so it looks like an identifier.
static inline __m128i v128_neg1_fn()
{
__m128i a;
#if defined(__AVX__)
#if defined(__AVX__)
asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
#else
asm( "pcmpeqq %0, %0\n\t" : "=x"(a) );
@@ -273,7 +272,6 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
#define casti_m128i casti_v128 // deprecated
#define casti_v128u64 casti_v128
#define casti_v128u32 casti_v128
#define casti_v128u16 casti_v128
@@ -284,13 +282,14 @@ static inline __m128i v128_neg1_fn()
#define casto_v128(p,o) (((__m128i*)(p))+(o))
#if defined(__SSE4_1__)
#define v128_get64( v, l ) _mm_extract_epi64( v, l )
#define v128_get32( v, l ) _mm_extract_epi32( v, l )
#define v128_get16( v, l ) _mm_extract_epi16( v, l )
#define v128_get8( v, l ) _mm_extract_epi8( v, l )
#define v128_put64( v, u64, l ) _mm_insert_epi64( v, u64, l )
#define v128_put32( v, u32, l ) _mm_insert_epi64( v, u32, l )
#define v128_put32( v, u32, l ) _mm_insert_epi32( v, u32, l )
#define v128_put16( v, u16, l ) _mm_insert_epi16( v, u16, l )
#define v128_put8( v, u8, l ) _mm_insert_epi8( v, u8, l )
@@ -327,7 +326,7 @@ static inline __m128i v128_neg1_fn()
/*
// Copy i32 to element c of dest and copy remaining elements from v.
#define v128_put32( v, i32, c ) \
v128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
v128_xim_32( v, v128_mov32( i32 ), (c)<<4 )
*/
@@ -401,7 +400,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define memcpy_128 v128_memcpy
// Boolean operations
#if defined(VL256)
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.
// ~v1 | v0
#define v128_ornot( v1, v0 ) _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )
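// Illustrative caution (not from the source): because these macros reference
// an argument more than once, a side-effecting argument is evaluated twice,
// e.g. v128_ornot( a, *p++ ) advances p twice. Pass plain variables or use an
// inline function instead.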
@@ -435,13 +437,13 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#else
#define v128_ornot( v1, v0 ) _mm_or_si128( v1, v128_not( v0 ) )
#define v128_ornot( v1, v0 ) _mm_or_si128( v128_not( v1 ), v0 )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_xor3( a, b, c ) _mm_xor_si128( _mm_xor_si128( a, b ), c )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_and3( a, b, c ) _mm_and_si128( _mm_and_si128( a, b ), c )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( _mm_or_si128( a, b ), c )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
@@ -463,17 +465,13 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm128_movmask_64( v ) \
#define v128_movmask64( v ) \
_mm_movemask_pd( (__m128d)(v) )
#define v128_movmask64 mm128_movmask_64
#define mm128_movmask_32( v ) \
#define v128_movmask32( v ) \
_mm_movemask_ps( (__m128)(v) )
#define v128_movmask32 mm128_movmask_32
//
// Bit rotations
// Shuffle 16 bit elements within 64 bit lanes.
#define v128_shuffle16( v, c ) \
_mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )
@@ -483,6 +481,9 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
//
// Bit rotations
// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -608,10 +609,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#endif
// deprecated
#define mm128_rol_32 v128_rol32
// ror( v1 ^ v0, n )
// (v1 ^ v0) >>> n, ARM NEON has optimized version
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
/* not used
@@ -689,7 +687,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
/* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \
v128_shuffle32( v, mm128_movmask_32( vmask ) )
v128_shuffle32( v, v128_movmask32( vmask ) )
*/
#define v128_shuffle8 _mm_shuffle_epi8
@@ -710,15 +708,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered
#define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred
#define v128_rev32(v) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev16(v) v128_shuffle16( v, 0x1b )
// rotate vector elements
#define v128_shuflr32(v) _mm_shuffle_epi32( v, 0x39 )
#define v128_shufll32(v) _mm_shuffle_epi32( v, 0x93 )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )
// Endian byte swap.
#if defined(__SSSE3__)
@@ -734,15 +728,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
// deprecated
#define mm128_bswap_32 v128_bswap32
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) \
#define v128_block_bswap64( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -754,8 +745,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 v128_block_bswap64
#define v128_block_bswap64_1024( d, s ) \
{ \
@@ -779,7 +769,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) \
#define v128_block_bswap32( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -791,11 +781,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 v128_block_bswap32
#define mm128_block_bswap32_128( d, s ) \
#define v128_block_bswap32_128( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -840,7 +829,6 @@ static inline v128_t v128_bswap32( __m128i v )
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}
#define mm128_bswap_32 v128_bswap32
static inline v128_t v128_bswap16( __m128i v )
{
@@ -849,7 +837,7 @@ static inline v128_t v128_bswap16( __m128i v )
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap64( s[0] );
d[1] = v128_bswap64( s[1] );
@@ -860,9 +848,8 @@ static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
d[6] = v128_bswap64( s[6] );
d[7] = v128_bswap64( s[7] );
}
#define v128_block_bswap64_512 mm128_block_bswap_64
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
@@ -882,7 +869,7 @@ static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
d[15] = v128_bswap64( s[15] );
}
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap32( s[0] );
d[1] = v128_bswap32( s[1] );
@@ -893,10 +880,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
d[6] = v128_bswap32( s[6] );
d[7] = v128_bswap32( s[7] );
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 v128_block_bswap32
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
@@ -918,9 +904,6 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
@@ -932,25 +915,27 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#else
#define v128_alignr8( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) )
_mm_or_si128( _mm_slli_si128( hi, 16-(c) ), _mm_srli_si128( lo, c ) )
// The c arg is trivial: the only valid value is 1.
#define v128_alignr64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
_mm_or_si128( _mm_slli_si128( hi, 16-((c)*8) ), _mm_srli_si128( lo, (c)*8 ) )
#define v128_alignr32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
_mm_or_si128( _mm_slli_si128( hi, 16-((c)*4) ), _mm_srli_si128( lo, (c)*4 ) )
#endif
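// Sanity check (assumption, not from the source): with c = 1,
// v128_alignr64( hi, lo, 1 ) = ( hi << 64 ) | ( lo >> 64 ) = { lo[1], hi[0] },
// matching the AVX512VL valignq result of shifting the hi:lo pair right by
// one 64-bit element.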
// blend using vector mask
#if defined(__SSE4_1__)
// Bytewise using sign bit of each byte element of mask
// Bytewise using sign bit of each byte element of mask. Use full bitmask
// for compatibility with SSE2 & NEON.
#define v128_blendv _mm_blendv_epi8
#else
// Bitwise
// Bitwise, use only bytewise for compatibility with SSE4_1.
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
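// Portable contract (assumption, not from the source): with a full 0x00/0xff
// byte mask all three implementations agree, selecting per byte
// r[i] = mask[i] ? v0[i] : v1[i]; SSE4.1 tests only each byte's sign bit
// while SSE2 and NEON use every mask bit.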

View File

@@ -73,10 +73,10 @@ typedef union
#else
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( mm128_mov64_128( i64 ) )
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) )
#define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \
_mm256_castsi128_si256( mm128_mov64_128( i64 ) ), 0x11 )
_mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
#endif
@@ -172,19 +172,24 @@ static inline __m256i mm256_not( const __m256i v )
#else
#define mm256_ornot( v1, v0 ) _mm256_or_si256( v1, mm256_not( v0 ) )
#define mm256_ornot( v1, v0 ) _mm256_or_si256( mm256_not( v1 ), v0 )
// Usage hints to improve performance when ternary logic is not available:
// if overwriting an input arg, put that arg first so the intermediate
// result can be stored in the dest.
// Put an arg with the nearest dependency last so independent args can be
// processed first.
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
_mm256_xor_si256( _mm256_xor_si256( a, b ), c )
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
_mm256_and_si256( _mm256_and_si256( a, b ), c )
#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
_mm256_or_si256( _mm256_or_si256( a, b ), c )
#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
@@ -217,12 +222,11 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_movmask_32( v ) \
_mm256_movemask_ps( _mm256_castsi256_ps( v ) )
//
// Bit rotations.
// shuffle 16 bit elements within 64 bit lanes.
#define mm256_shuffle16( v, c ) \
_mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )
// reverse elements within lanes.
#define mm256_qrev32(v) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
@@ -242,6 +246,9 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
// Bit rotations.
// These should never be called directly by applications.
#define mm256_ror_64_avx2( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \

View File

@@ -125,7 +125,7 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
// use asm to avoid compiler warning for unitialized local
// use asm to avoid compiler warning for uninitialized local
static inline __m512i mm512_neg1_fn()
{
__m512i v;
@@ -185,6 +185,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Ternary logic uses 8 bit truth table to define any 3 input logical
// expression using any number or combinations of AND, OR, XOR, NOT.
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.
// ~v1 | v0
#define mm512_ornot( v1, v0 ) _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf )

View File

@@ -1,182 +0,0 @@
#if !defined(SIMD_64_H__)
#define SIMD_64_H__ 1
#if defined(__x86_64__) && defined(__MMX__)
////////////////////////////////////////////////////////////////
//
// 64 bit MMX vectors.
//
// This code is not used anywhere and likely never will be. Its intent was
// to support 2 way parallel hashing using MMX, or NEON for 32 bit hash
// functions, but it was never implemented.
//
#define v64_t __m64
#define v64u32_t v64_t
#define v64_load _mm_load_si64
#define v64_store _mm_store_si64
#define v64_64(i64) ((__m64)(i64))
#define v64_32 _mm_set1_pi32
#define v64_16 _mm_set1_pi16
#define v64_8 _mm_set1_pi8
#define v64_add32 _mm_add_pi32
#define v64_add16 _mm_add_pi16
#define v64_add8 _mm_add_pi8
#define v64_mul32 _mm_mullo_pi32
#define v64_mul16 _mm_mullo_pi16
// compare
#define v64_cmpeq32 _mm_cmpeq_epi32
#define v64_cmpeq16 _mm_cmpeq_epi16
#define v64_cmpeq8 _mm_cmpeq_epi8
#define v64_cmpgt32 _mm_cmpgt_epi32
#define v64_cmpgt16 _mm_cmpgt_epi16
#define v64_cmpgt8 _mm_cmpgt_epi8
#define v64_cmplt32 _mm_cmplt_epi32
#define v64_cmplt16 _mm_cmplt_epi16
#define v64_cmplt8 _mm_cmplt_epi8
// bit shift
#define v64_sl32 _mm_slli_epi32
#define v64_sl16 _mm_slli_epi16
#define v64_sl8 _mm_slli_epi8
#define v64_sr32 _mm_srli_epi32
#define v64_sr16 _mm_srli_epi16
#define v64_sr8 _mm_srli_epi8
#define v64_sra32 _mm_srai_epi32
#define v64_sra16 _mm_srai_epi16
#define v64_sra8 _mm_srai_epi8
#define v64_alignr8 _mm_alignr_pi8
#define v64_unpacklo32 _mm_unpacklo_pi32
#define v64_unpackhi32 _mm_unpackhi_pi32
#define v64_unpacklo16 _mm_unpacklo_pi16
#define v64_unpackhi16 _mm_unpackhi_pi16
#define v64_unpacklo8 _mm_unpacklo_pi8
#define v64_unpackhi8 _mm_unpackhi_pi8
// Pseudo constants
#define v64_zero _mm_setzero_si64()
#define v64_one_64 _mm_set_pi32( 0UL, 1UL )
#define v64_one_32 v64_32( 1UL )
#define v64_one_16 v64_16( 1U )
#define v64_one_8 v64_8( 1U );
#define v64_neg1 v64_32( 0xFFFFFFFFUL )
#define casti_v64(p,i) (((v64_t*)(p))[(i)])
// Bitwise not: ~(a)
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
#define v64_not( a ) ( (v64_t)( ~( (uint64_t)(a) ) ) )
/*
// Unary negate elements
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
*/
static inline void v64_memset_zero( __m64 *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = v64_zero; }
static inline void v64_memset( __m64 *dst, const __m64 a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
static inline void v64_memcpy( __m64 *dst, const __m64 *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v64_or _mm_or_si64
#define v64_and _mm_and_si64
#define v64_xor _mm_xor_si64
#define v64_andnot _mm_andnot_si64
#define v64_xor3( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )
#define v64_xorandnot( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )
// Rotate bits in packed elements of 64 bit vector
#define v64_rol64( a, n ) \
_mm_or_si64( _mm_slli_si64( a, n ), \
_mm_srli_si64( a, 64-(n) ) )
#define v64_ror64( a, n ) \
_mm_or_si64( _mm_srli_si64( a, n ), \
_mm_slli_si64( a, 64-(n) ) )
#define v64_rol32( a, n ) \
_mm_or_si64( _mm_slli_pi32( a, n ), \
_mm_srli_pi32( a, 32-(n) ) )
#define v64_ror32( a, n ) \
_mm_or_si64( _mm_srli_pi32( a, n ), \
_mm_slli_pi32( a, 32-(n) ) )
#define v64_rol16( a, n ) \
_mm_or_si64( _mm_slli_pi16( a, n ), \
_mm_srli_pi16( a, 16-(n) ) )
#define v64_ror16( a, n ) \
_mm_or_si64( _mm_srli_pi16( a, n ), \
_mm_slli_pi16( a, 16-(n) ) )
// Rotate packed elements across lanes. Useful for byte swap and byte
// rotation.
#if defined(__SSE__)
// Swap hi & lo 32 bits.
#define v64_swap32( a ) _mm_shuffle_pi16( a, 0x4e )
#define v64_shuflr16( a ) _mm_shuffle_pi16( a, 0x39 )
#define v64_shufll16( a ) _mm_shuffle_pi16( a, 0x93 )
// Swap hi & lo 16 bits of each 32 bit element
#define v64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
#endif // SSE
#if defined(__SSSE3__)
// Endian byte swap packed elements
#define v64_bswap32( v ) \
_mm_shuffle_pi8( v, (__m64)0x0405060700010203 )
#define v64_bswap16( v ) \
_mm_shuffle_pi8( v, (__m64)0x0607040502030001 );
// Rotate right by c bytes
static inline v64_t v64_shuflr_x8( __m64 v, const int c )
{ return _mm_alignr_pi8( v, v, c ); }
#else
#define v64_bswap32( v ) \
_mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \
__builtin_bswap32( ((uint32_t*)&v)[0] ) )
#define v64_bswap16( v ) \
_mm_set_pi16( __builtin_bswap16( ((uint16_t*)&v)[3] ), \
__builtin_bswap16( ((uint16_t*)&v)[2] ), \
__builtin_bswap16( ((uint16_t*)&v)[1] ), \
__builtin_bswap16( ((uint16_t*)&v)[0] ) )
#endif // SSSE3
#define v64_blendv( v1, v0, mask ) \
v64_or( v64_and( mask, v1 ), v64_andnot( mask, v0 ) )
#endif // MMX
#endif // SIMD_64_H__

View File

@@ -2,7 +2,7 @@
#define SIMD_INT_H__ 1
//TODO compile time test for byte order
// be64 etc using HW bowap.
// be64 etc using HW bswap.
//
// Endian byte swap
#if defined(__x86_64__)
@@ -19,6 +19,9 @@ static inline uint64_t bswap_64( uint64_t a )
return b;
}
// This produces warnings from clang, but its suggested workaround
// "rev32 %w0, %w1\n\t" produced errors instead. GCC doesn't complain and
// it works as is on both.
static inline uint32_t bswap_32( uint32_t a )
{
uint32_t b;
@@ -94,7 +97,7 @@ static inline uint16_t be16( const uint16_t u16 )
return ( (uint16_t)(p[3]) ) + ( (uint16_t)(p[2]) << 8 );
}
static inline uint32_t le162( const uint16_t u16 )
static inline uint32_t le16( const uint16_t u16 )
{
const uint8_t *p = (uint8_t const *)&u16;
return ( (uint16_t)(p[0]) ) + ( (uint16_t)(p[1]) << 8 );
@@ -108,8 +111,12 @@ static inline uint32_t le162( const uint16_t u16 )
#define rol32 __rold
#define ror32 __rord
/* these don't seem to work
#elif defined(__aarch64__)
// Documentation is vague: ror exists but is ambiguous. The docs say it can
// handle 32 or 64 bit registers; assuming that is architecture specific and
// only 32 bit is available on a 32 bit arch. Rarely used, so not a big issue.
static inline uint64_t ror64( uint64_t a, const int c )
{
uint64_t b;
@@ -125,6 +132,7 @@ static inline uint32_t ror32( uint32_t a, const int c )
return b;
}
#define rol32( a, c ) ror32( a, 32-(c) )
*/
#else

View File

@@ -38,7 +38,9 @@
#define v128u8_load( p ) vld1q_u16( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u16( (uint8_t*)(p), v )
// load & set1 combined, doesn't work
// load & set1 combined. What if source is already loaded?
// Don't use, leave it up to the compiler to optimize.
// Same with vld1q_lane.
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
@@ -61,17 +63,13 @@
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
// returns low half, u64 undocumented, may not exist.
#define v128_mul64 vmulq_u64
// returns low half
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
// Widening multiply, align source elements with Intel
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
{
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
vget_low_u32( vcopyq_laneq_u32( v0, 1, v0, 2 ) ) );
}
// Widening multiply, realign source elements from x86_64 to NEON.
#define v128_mulw32( v1, v0 ) \
vmull_u32( vmovn_u64( v1 ), vmovn_u64( v0 ) )
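// Equivalence note (assumption, not from the source): x86_64 _mm_mul_epu32
// multiplies the low 32 bits of each 64-bit lane; vmovn_u64 extracts exactly
// those low halves, so vmull_u32 yields the same two 64-bit products.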
// compare
#define v128_cmpeq64 vceqq_u64
@@ -95,6 +93,8 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) )
#define v128_cmpeq_zero vceqzq_u64
// Logical bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
@@ -137,14 +137,14 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#if defined(__ARM_FEATURE_SHA3)
#define v128_xor3 veor3q_u32
#else
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
#endif
// v2 & v1 & v0
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
#define v128_and3( v2, v1, v0 ) v128_and( v128_and( v2, v1 ), v0 )
// v2 | v1 | v0
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
#define v128_or3( v2, v1, v0 ) v128_or( v128_or( v2, v1 ), v0 )
// v2 ^ ( ~v1 & v0 )
#if defined(__ARM_FEATURE_SHA3)
@@ -180,6 +180,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
// AES
// consistent with Intel AES intrinsics, break up for optimizing
@@ -239,18 +240,15 @@ typedef union
#define cast_v128u32( p ) (*((uint32x4_t*)(p)))
#define castp_v128u32( p ) ((uint32x4_t*)(p))
#define v128_zero v128_64( 0ull )
#define v128_cmpeq_zero vceqzq_u64
#define v128_neg1 v128_64( 0xffffffffffffffffull )
// set1
#define v128_64 vmovq_n_u64
#define v128_32 vmovq_n_u32
#define v128_16 vmovq_n_u16
#define v128_8 vmovq_n_u8
#define v128_zero v128_64( 0ull )
#define v128_neg1 v128_64( 0xffffffffffffffffull )
#define v64_set32( u32_1, u32_0 ) \
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
@@ -315,7 +313,6 @@ static inline void v128_memset_zero( void *dst, const int n )
memset( dst, 0, n*16 );
}
static inline void v128_memset( void *dst, const void *src, const int n )
{
for( int i = 0; i < n; i++ )
@@ -360,28 +357,23 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
((uint16x8_t)(v)), c )
#define v128_rol16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
((uint16x8_t)(v)), c )
#define v128_ror8( v, c ) \
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )
#define v128_rol8( v, c ) \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )
// ror( v1 ^ v0, n )
// ( v1 ^ v0 ) >>> c
#if defined(__ARM_FEATURE_SHA3)
#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n )
#define v128_ror64xor( v1, v0, c ) vxarq_u64( v1, v0, c )
#else
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
#endif
#define v128_2ror64( v1, v0, c ) \
@@ -414,7 +406,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
v1 = vorrq_u32( v1, t1 ); \
}
#define v128_2rorx32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshlq_n_u32( v0, c ); \
uint32x4_t t1 = vshlq_n_u32( v1, c ); \
@@ -438,7 +430,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
// preferred.
// reverse elements in vector lanes
#define v128_qrev32 vrev64q_u32
@@ -448,9 +439,9 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_lrev16 vrev32q_u16
// aka bswap
#define v128_qrev8 vrev64q_u8
#define v128_lrev8 vrev32q_u8
#define v128_wrev8 vrev16q_u8
// #define v128_qrev8 vrev64q_u8
// #define v128_lrev8 vrev32q_u8
// #define v128_wrev8 vrev16q_u8
// full vector rotation
@@ -460,7 +451,6 @@ static inline uint64x2_t v128_rev64( uint64x2_t v )
#define v128_swap64 v128_rev64 // grandfathered
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
#define v128_rev16(v) v128_rev64( v128_qrev16( v ) )
// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -469,12 +459,6 @@ static inline uint32x4_t v128_shuflr32( uint32x4_t v )
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
static inline uint16x8_t v128_shuflr16( uint16x8_t v )
{ return vextq_u16( v, v, 1 ); }
static inline uint16x8_t v128_shufll16( uint16x8_t v )
{ return vextq_u16( v, v, 7 ); }
// reverse bits in bytes, nothing like it in x86_64
#define v128_bitrev8 vrbitq_u8
@@ -482,9 +466,9 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
#define v128_bswap16(v) (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
#define v128_bswap32(v) (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
#define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v) (uint32x4_t)v128_swap64( v128_bswap64(v) )
#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )
// Usefull for x86_64 but does nothing for ARM
// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -496,7 +480,7 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u32( dst,6 ) = v128_bswap32( casti_v128u32( src,6 ) ); \
casti_v128u32( dst,7 ) = v128_bswap32( casti_v128u32( src,7 ) ); \
}
#define v128_block_bswap32_256( dst, src ) \
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_512( dst, src ) \
{ \
@@ -551,8 +535,9 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Bitwise blend using vector mask
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 )
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v0, v1 )
#endif // __ARM_NEON
#endif // SIMD_NEON_H__

152
simd-utils/simd-sve.h Normal file
View File

@@ -0,0 +1,152 @@
// Placeholder for now.
//
// This file will hold AArch64 SVE code, a replacement for NEON that uses
// vector-length-agnostic instructions. This means the same code can be used
// on CPUs with different SVE vector register lengths. This is not good for
// vectorized hashing.
// Optimum hashing is sensitive to the vector register length, with different
// code used for different register sizes. On x86_64 the vector length is tied
// to the CPU feature, making it simple and efficient to handle different
// lengths, although it results in multiple executables. Theoretically SVE
// could use a single executable for any vector length.
//
// With the SVE vector length only known at run time there is run time
// overhead to test the vector length. Theoretically it could be tested at
// program load time and an appropriate library loaded. However, I don't know
// whether this can be done or, if so, how to do it.
//
// SVE is not expected to be used for 128 bit vectors as it does not provide
// any advantage over NEON. However, it may be implemented for testing purposes
// because CPUs with registers larger than 128 bits are currently very rare
// and are very expensive server class CPUs.
//
// However, 128 bit vectors also need to be supported on 256 bit registers.
// This could be a challenge for unpredicated functions.
//
// N-way parallel hashing could be the best use of SVE, using the same code
// for all vector lengths with the only variable being the number of lanes.
// This will still require run time checking but should be lighter than
// substituting functions.
// The current approach is to hard code the length in these intrinsics, which
// are called by existing length-specific code.
// Define with the sv_ prefix for generic use with the predicate provided by
// the caller, use sv<size>_ with a hard coded predicate, and v<size>_ only if
// and when it's compatible with SSE & NEON.
// Many instructions have no predicate operand, so how is the variable vector
// length handled? How does the CPU know how long the vector is and whether it
// spans multiple registers without the predicate?
// Also, how does the predicate define the vector size? How to tell whether
// inactive high lanes are part of the vector or beyond its range?
//
// Some instructions may have a predicate implied by other arguments.
// TBL, for example, only has shuffle indexes for active lanes.
// However this is dependent on software being aware of the register size.
#if 0
// #if defined USE_SV128
// NEON needs to be disabled
#define PRED128 0xffff
#define PRED256 0xffffffff
// Types should be transparent
#define sv128u32_t svuint32_t
#define sv256u32_t svuint32_t
// load1
// arithmetic
// _z zero inactive elements, _x undefined inactive elements, _m inactive
// elements from first arg. arg order only matters when _m used. Use _x.
#define sv_add32( p, v1, v0 ) svadd_u32_x( p, v1, v0 )
#define sv128_add32( v1, v0 ) svadd_u32_x( PRED128, v1, v0 )
#define sv256_add32( v1, v0 ) svadd_u32_x( PRED256, v1, v0 )
// Add integer to each element
#define sv_addi32( p, v, i ) svadd_n_u32_x( p, v, i )
// compare
#define sv_cmpeq32( p, v1, v0 ) svcmpeq_u32( p, v1, v0 )
#define sv128_cmpeq32( v1, v0 ) svcmpeq_u32( PRED128, v1, v0 )
#define sv256_cmpeq32( v1, v0 ) svcmpeq_u32( PRED256, v1, v0 )
// bit shift
#define sv_sl32( p, v, c ) svlsl_n_u32_x( p, v, c )
#define sv128_sl32( v, c ) svlsl_n_u32_x( PRED128, v, c )
#define sv256_sl32( v, c ) svlsl_n_u32_x( PRED256, v, c )
// logic
#define sv_or( p, v1, v0 ) svorr_u32_x( p, v1, v0 )
#define sv128_or( v1, v0 ) svorr_u32_x( PRED128, v1, v0 )
#define sv256_or( v1, v0 ) svorr_u32_x( PRED256, v1, v0 )
// ext used for alignr, and zip used for unpack have no predicate arg.
// How is vector length determined? How are register sizes handled?
// How are part registers handled?
// alignr (ext)
// unpack
// AES
// AES uses fixed 128 bit vectors, how does this work with larger registers?
// set1
#define sv128_32( n ) svdup_n_u32_x( PRED128, n )
#define sv256_32( n ) svdup_n_u32_x( PRED256, n )
// broadcast
// svdup_lane has no predicate
// constants
// pointer cast
// Bit rotation
// No predication for shift instructions
// Cross lane shuffles
// Very limited shuffling, mostly svtbl which has no predicate and uses
// vector for the index.
// endian byte swap
#define sv128_bswap32(v) svrevb_u32_x( PRED128, v )
// blend
#endif
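// Minimal vector-length-agnostic sketch (an assumption, not part of this
// placeholder): one answer to the predication questions above. svcntw()
// reports the number of 32-bit lanes per register at run time and
// svwhilelt_b32() builds the predicate that masks off lanes past the end of
// the buffer. Compile with e.g. -march=armv8-a+sve; buf and n are
// hypothetical names.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <stdint.h>

static inline void sve_add1_u32( uint32_t *buf, int64_t n )
{
   for ( int64_t i = 0; i < n; i += svcntw() )
   {
      svbool_t   p = svwhilelt_b32( i, n );     // active lanes only
      svuint32_t v = svld1_u32( p, buf + i );   // predicated load
      v = svadd_n_u32_x( p, v, 1 );             // add 1 to each active lane
      svst1_u32( p, buf + i, v );               // predicated store
   }
}
#endif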

View File

@@ -16,14 +16,19 @@
#include "miner.h"
#include "simd-utils.h"
#if defined(__aarch64__) && !defined(__APPLE__)
// Missing on MinGW, MacOS
#if defined(__aarch64__) && !defined(WIN32) && !defined(__APPLE__)
#define ARM_AUXV
#endif
#if defined(ARM_AUXV)
// for arm's "cpuid"
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <sys/prctl.h>
#endif
#ifndef WIN32
#if !(defined(WIN32) || defined(__APPLE__))
// 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input
// 1035g1: /sys/class/hwmon/hwmon1/temp1_input wrong temp
@@ -147,7 +152,7 @@ static inline void linux_cpu_hilo_freq( float *lo, float *hi )
static inline float cpu_temp( int core )
{
#ifdef WIN32
#if defined(WIN32) || defined(__APPLE__)
return 0.;
#else
return linux_cputemp( core );
@@ -156,7 +161,7 @@ static inline float cpu_temp( int core )
static inline uint32_t cpu_clock( int core )
{
#ifdef WIN32
#if defined(WIN32) || defined(__APPLE__)
return 0;
#else
return linux_cpufreq( core );
@@ -169,17 +174,17 @@ static inline int cpu_fanpercent()
}
// CPUID
// x86_64 CPUID
// This list is incomplete, it only contains features of interest to cpuminer.
// Refer to http://en.wikipedia.org/wiki/CPUID for details.
// AVX10 compatibility notes
//
// Notation used: AVX10i.[version]_[vectorwidth]
// AVX10.1_512 is a rebranding of AVX512 and is effectively the AVX* superset
// Display format: AVX10.[version]-[vectorwidth]
// AVX10.1-512 is a rebranding of AVX512 and is effectively the AVX* superset
// with full 512 bit vector support.
// AVX10.2_256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
// AVX10.2-256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
// features applied only to 256 bit and 128 bit vectors.
// Future AVX10 versions will add new instructions and features.
@@ -275,8 +280,8 @@ static inline int cpu_fanpercent()
#define FMA3_mask (FMA3_Flag|AVX_mask)
#define AVX512_mask (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag)
#if defined(__x86_64__)
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
@@ -309,16 +314,65 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
#endif
}
#elif defined(__aarch64__) && !defined(__APPLE__)
#elif defined(ARM_AUXV)
// Always test whether the HWCAP macro is defined in the kernel headers before
// attempting to compile code that uses it. If it isn't defined the feature
// can't be tested and won't be included in the build.
// This can occur when compiling with old kernel headers and a new CPU, and
// could result in a suboptimal build.
// The leaf and subleaf arguments are ignored.
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
output[0] = getauxval(AT_HWCAP);
#if defined(AT_HWCAP)
output[0] = getauxval( AT_HWCAP );
#else
output[0] = 0;
#endif
#if defined(AT_HWCAP2)
output[1] = getauxval( AT_HWCAP2 );
#else
output[1] = 0;
#endif
/*
#define has(CAP, hwcap) !!((hwcap) & HWCAP_##CAP)
#define pr(CAP, hwcap) printf("%10s = %d\n", #CAP, has(CAP, hwcap))
unsigned long hwcaps = getauxval(AT_HWCAP);
printf("HWCAP = 0x%lx\n", hwcaps);
pr(FP, hwcaps);
pr(ASIMD, hwcaps);
pr(EVTSTRM, hwcaps);
pr(AES, hwcaps);
pr(PMULL, hwcaps);
pr(SHA1, hwcaps);
pr(SHA2, hwcaps);
pr(CRC32, hwcaps);
pr(ATOMICS, hwcaps);
pr(FPHP, hwcaps);
pr(ASIMDHP, hwcaps);
pr(CPUID, hwcaps);
pr(ASIMDRDM, hwcaps);
pr(JSCVT, hwcaps);
pr(FCMA, hwcaps);
pr(LRCPC, hwcaps);
pr(DCPOP, hwcaps);
pr(SHA3, hwcaps);
pr(SM3, hwcaps);
pr(SM4, hwcaps);
pr(ASIMDDP, hwcaps);
pr(SHA512, hwcaps);
pr(SVE, hwcaps);
*/
}
#else
#define cpuid(leaf, subleaf, out) out[0] = 0;
#define cpuid( leaf, subleaf, output ) \
output[0] = output[1] = output[2] = output[3] = 0;
#endif
static inline void cpu_getname(char *outbuf, size_t maxsz)
@@ -447,31 +501,20 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#ifdef __ARM_FEATURE_SHA3
#warning "__ARM_FEATURE_SHA3"
#endif
*/
// GCC-14.1: the AVX512 macros are defined even when compiled with only
// -mavx10.1-256, causing compile errors in AVX512 code. Only with
// -mavx10.1-512 does it compile successfully.
// __EVEX512__ is set only when compiled with -mavx10.1-512.
// Adding -fno-evex512 doesn't help.
// Building with -mapxf fails to configure on a CPU without APX because it
// can't run the test program.
/*
#ifdef __AVX10_1__
#warning "__AVX10_1__"
#ifdef __ARM_FEATURE_SHA512
#warning "__ARM_FEATURE_SHA512"
#endif
#ifdef __AVX10_1_256__
#warning "__AVX10_1_256__"
#ifdef __ARM_FEATURE_SVE
#warning "__ARM_FEATURE_SVE"
#endif
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__"
#ifdef __ARM_FEATURE_SVE2
#warning "__ARM_FEATURE_SVE2"
#endif
#ifdef __EVEX512__
#warning "__EVEX512__"
#ifdef __ARM_FEATURE_SME
#warning "__ARM_FEATURE_SME"
#endif
*/
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
// omitted 256 is the default.
// Ex: AVX10.1_512
@@ -482,7 +525,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
// No technical need for this, the code won't run if false.
static inline bool cpu_arch_x86_64()
{
#if defined(__x86_64__)
@@ -515,11 +557,11 @@ static inline bool has_sse()
static inline bool has_sse2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE2_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE2_Flag;
#else
return false;
return false;
#endif
}
@@ -556,39 +598,11 @@ static inline bool has_sse42()
#endif
}
// There's no HWCAP for NEON, assume it's always true.
static inline bool has_neon()
{
#if defined(__aarch64__) && !defined(__APPLE__)
unsigned int cpu_info[4] = { 0 };
return cpu_info[0];
#else
return false;
#endif
}
static inline bool has_aes_ni()
{
#if defined(__x86_64__)
if ( has_sse2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
}
return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
#if defined(KERNEL_HWCAP_AES)
return true;
#else
return false;
#endif
/* unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_AES;
*/ }
return false;
#if defined(__aarch64__)
return true;
#else
return false;
#endif
@@ -616,54 +630,48 @@ static inline bool has_avx2()
#endif
}
static inline bool has_sha()
// SVE vector width is determined at run time.
static inline bool has_sve()
{
#if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
#if defined(KERNEL_HWCAP_SHA2)
return true;
#if defined(__aarch64__) && defined(HWCAP_SVE)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SVE;
#else
return false;
#endif
/* unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA2;
*/ }
return false;
#else
return false;
return false;
#endif
}
static inline bool has_sha512()
static inline bool has_sve2()
{
#if defined(__x86_64__)
if ( has_avx2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EAX_Reg ] & SHA512_Flag;
}
return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA3;
}
return false;
#if defined(__aarch64__) && defined(HWCAP2_SVE2)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SVE2;
#else
return false;
return false;
#endif
}
static inline bool has_sme()
{
#if defined(__aarch64__) && defined(HWCAP2_SME)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SME;
#else
return false;
#endif
}
static inline bool has_sme2()
{
#if defined(__aarch64__) && defined(HWCAP2_SME2)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SME2;
#else
return false;
#endif
}
@@ -723,6 +731,48 @@ static inline bool has_avx512()
#endif
}
static inline bool has_vbmi()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
#else
return false;
#endif
}
static inline bool has_vbmi2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
#else
return false;
#endif
}
static inline bool has_aes()
{
#if defined(__x86_64__)
if ( has_sse2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_AES)
// NEON AES
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_AES;
#else
return false;
#endif
}
static inline bool has_vaes()
{
#if defined(__x86_64__)
@@ -738,25 +788,78 @@ static inline bool has_vaes()
#endif
}
static inline bool has_vbmi()
static inline bool has_sveaes()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
#if defined(__aarch64__) && defined(HWCAP2_SVEAES)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SVEAES;
#else
return false;
#endif
}
static inline bool has_vbmi2()
static inline bool has_sha256()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA2)
// NEON SHA256
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA2;
#else
return false;
return false;
#endif
}
static inline bool has_sha512()
{
#if defined(__x86_64__)
if ( has_avx2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EAX_Reg ] & SHA512_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA512)
// NEON SHA512
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA512;
#else
return false;
#endif
}
// Arm only
static inline bool has_sha3()
{
#if defined(__aarch64__) && defined(HWCAP_SHA3)
// NEON SHA3
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA3;
#else
return false;
#endif
}
static inline bool has_svesha3()
{
#if defined(__aarch64__) && defined(HWCAP2_SVESHA3)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SVESHA3;
#else
return false;
#endif
}
@@ -815,10 +918,8 @@ static inline unsigned int avx10_version()
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask;
}
return 0;
#else
return 0;
#endif
return 0;
}
// also includes 256 & 128
@@ -831,13 +932,11 @@ static inline bool has_avx10_512()
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
}
return false;
#else
return false;
#endif
return false;
}
// Includes 128 but may not include 512
// Includes 128 but might not include 512
static inline bool has_avx10_256()
{
#if defined(__x86_64__)
@@ -847,13 +946,11 @@ static inline bool has_avx10_256()
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
}
return false;
#else
return false;
#endif
return false;
}
// Maximum vector length
// AVX10 vector register length
static inline unsigned int avx10_vector_length()
{
#if defined(__x86_64__)
@@ -864,24 +961,28 @@ static inline unsigned int avx10_vector_length()
return cpu_info[ EBX_Reg ] & AVX10_512_Flag ? 512
: ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
}
return 0;
#else
return 0;
#endif
return 0;
}
// ARM SVE vector register length, converted from bytes to bits.
static inline int sve_vector_length()
{
#if defined(ARM_AUXV)
if ( has_sve() )
return ( prctl( PR_SVE_GET_VL ) & PR_SVE_VL_LEN_MASK ) * 8;
#endif
return 0;
}
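// Hypothetical usage (not from the source):
//   if ( has_sve() )
//      applog( LOG_INFO, "SVE vector length %d bits", sve_vector_length() );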
static inline uint32_t cpuid_get_highest_function_number()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = {0};
cpuid( VENDOR_ID, 0, cpu_info);
return cpu_info[ EAX_Reg ];
#else
return 0;
#endif
return 0;
}
// out of date
@@ -962,9 +1063,7 @@ static inline void cpu_brand_string( char* s )
#elif defined(__arm__) || defined(__aarch64__)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
sprintf( s, "ARM 64 bit CPU, HWCAP %08x", cpu_info[0] );
sprintf( s, "ARM 64 bit CPU" );
#else

8
util.c
View File

@@ -1414,6 +1414,12 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
int n;
fd_set wd;
// Something nasty is going on with Windows on aarch64. This hack prevents
// corruption of the sctx pointer. It only works if placed inside the while loop.
#if defined(__aarch64__) && defined(WIN32) && defined(ARM_WIN_HACK)
printf("");
#endif
FD_ZERO( &wd );
FD_SET( sctx->sock, &wd );
if ( select( (int) ( sctx->sock + 1 ), NULL, &wd, NULL, &timeout ) < 1 )
@@ -2239,7 +2245,7 @@ static bool stratum_benchdata(json_t *result, json_t *params, int thr_id)
#endif
cpu_bestfeature(arch, 16);
if (has_aes_ni()) strcat(arch, " NI");
if (has_aes()) strcat(arch, " NI");
cpu_getmodelid(vendorid, 32);
cpu_getname(cpuname, 80);

View File

@@ -11,16 +11,11 @@
export LOCAL_LIB="$HOME/usr/lib"
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --host=x86_64-w64-mingw32"
#export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
# set correct gcc version
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs"
#export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
export DEFAULT_CFLAGS="-maes -O3 -Wall"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
@@ -40,7 +35,6 @@ cp $MINGW_LIB/zlib1.dll release/
cp $MINGW_LIB/libwinpthread-1.dll release/
cp $GCC_MINGW_LIB/libstdc++-6.dll release/
cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
#cp ./../libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
# Start building...
@@ -126,12 +120,12 @@ CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean
#make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
#make -j 8
#strip -s cpuminer.exe