Compare commits


2 Commits
v24.2 ... v24.4

Author     SHA1        Message  Date
Jay D Dee  47e24b50e8  v24.4    2024-07-01 00:33:19 -04:00
Jay D Dee  c47c4a8885  v24.3    2024-05-28 18:20:19 -04:00
47 changed files with 2463 additions and 3211 deletions

View File

@@ -75,6 +75,20 @@ If not what makes it happen or not happen?
Change Log
----------
v24.4
x86_64: Fixed a bug in the ornot macro for AVX2 that broke some algos in v24.2.
x86_64: Fixed a bug in the alignr macros for SSE2.
ARM: CPU feature reporting enhancements.
Some code cleanup.
v24.3
ARM: CPU feature detection and reporting is now working.
ARM: Verthash is now working.
ARM: Small speedup for yescrypt, yespower & argon2d.
Code cleanup.
v24.2
x86_64: Fixed blake2s for AVX2 & AVX512, and x25x for AVX512, broken in v3.23.4.
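
For context on the ornot fix noted above: an "or-not" helper computes a | ~b, and since AVX2 has no single NOT instruction the complement is normally built with an XOR against all-ones. The sketch below only illustrates the general shape of such a helper; it is not the project's actual (fixed) macro.

#include <immintrin.h>

// Illustrative only -- not cpuminer-opt's actual definition.
static inline __m256i ornot256( const __m256i a, const __m256i b )
{
   const __m256i allones = _mm256_set1_epi64x( -1 );
   return _mm256_or_si256( a, _mm256_xor_si256( b, allones ) );   // a | ~b
}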

View File

@@ -98,7 +98,6 @@ typedef uint32_t set_t;
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
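
The *_OPT constants above are one-hot bit flags that algo registration functions OR together into gate->optimizations (as seen in the diffs below) and that capability reporting tests with set_incl(). A minimal sketch, assuming set_incl() is a plain "all requested bits present" mask test; the real helper lives in the miner's own headers.

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t set_t;

// Assumed definition, for illustration only.
static inline bool set_incl( set_t subset, set_t set )
{
   return ( subset & set ) == subset;
}

// Example: an algo that benefits from SSE2, AVX2, SHA256 and NEON.
// gate->optimizations = SSE2_OPT | AVX2_OPT | SHA256_OPT | NEON_OPT;
// bool algo_has_sha256 = set_incl( SHA256_OPT, gate->optimizations );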

View File

@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
}
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
{
__m512i v[16], m[16];
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}
int blake2b_8way_init( blake2b_8way_ctx *ctx )
int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
{
size_t i;
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
}
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen )
{
__m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_8way_compress( ctx, 0 );
blake2b_8x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
}
}
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
casti_m512i( out, 0 ) = ctx->h[0];
casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
};
*/
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
{
__m256i v[16], m[16];
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
}
int blake2b_4way_init( blake2b_4way_ctx *ctx )
int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
{
size_t i;
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
return 0;
}
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen )
{
__m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_4way_compress( ctx, 0 );
blake2b_4x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
}
}
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
casti_m256i( out, 0 ) = ctx->h[0];
casti_m256i( out, 1 ) = ctx->h[1];

View File

@@ -1,6 +1,6 @@
#pragma once
#ifndef __BLAKE2B_HASH_4WAY_H__
#define __BLAKE2B_HASH_4WAY_H__
#ifndef BLAKE2B_HASH_4WAY_H__
#define BLAKE2B_HASH_4WAY_H__
#include "simd-utils.h"
#include <stddef.h>
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_8way_ctx;
} blake2b_8x64_ctx;
int blake2b_8way_init( blake2b_8way_ctx *ctx );
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final
#endif
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_4way_ctx;
} blake2b_4x64_ctx;
int blake2b_4way_init( blake2b_4way_ctx *ctx );
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final
#endif
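
The compatibility #defines added above let existing call sites keep using the old *_way names while the implementation moves to lane-width naming (8x64 = eight 64-bit lanes). A usage sketch, with the include path assumed:

#include "blake2b-hash-4way.h"   // header path assumed

void blake2b_8way_example( void *out, const void *data, size_t len )
{
   blake2b_8x64_ctx ctx;              // same struct as the old blake2b_8way_ctx
   blake2b_8way_init( &ctx );         // expands to blake2b_8x64_init( &ctx )
   blake2b_8way_update( &ctx, data, len );
   blake2b_8way_final( &ctx, out );   // expands to blake2b_8x64_final( &ctx, out )
}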

View File

@@ -11,8 +11,8 @@
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
//#pragma once
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#ifndef BLAKE2S_HASH_4WAY_H__
#define BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__) || defined(__ARM_NEON)

View File

@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
\
/* AddRoundConstant P1024 */\
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\

View File

@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
#endif
return true;
};

View File

@@ -47,25 +47,19 @@
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || defined(__SSE2__)
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
}
#else
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
#endif
#if defined(VL256)
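
The unified MULT2 above replaces the separate NEON and SSE2 paths with arch-neutral v128_ wrappers. As a rough guide to what those wrappers stand for on x86_64 (the definitions below are assumptions, mirroring the shift/or sequence in the deleted SSE2 fallback):

#include <emmintrin.h>
typedef __m128i v128_t;

// Assumed SSE2-level equivalents, for illustration only.
#define v128_xor( a, b )   _mm_xor_si128( a, b )
#define v128_and( a, b )   _mm_and_si128( a, b )
#define v128_bcast32( a )  _mm_shuffle_epi32( a, 0x00 )           // broadcast lane 0
// Concatenate hi:lo and shift right by n 32-bit lanes, keeping the low 128 bits.
#define v128_alignr32( hi, lo, n ) \
   _mm_or_si128( _mm_srli_si128( lo, (n)*4 ), _mm_slli_si128( hi, 16-(n)*4 ) )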

View File

@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =
#endif // AVX2 else SSE2
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \

View File

@@ -306,7 +306,7 @@ bool register_m7m_algo( algo_gate_t *gate )
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;

View File

@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
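
The casti_m128i / mm128_bswap_32 to casti_v128u32 / v128_bswap32 renames above (and throughout this release) switch to architecture-neutral 128-bit helpers. Purely as an assumption about their shape, they behave roughly like the sketch below; real builds may use PSHUFB or NEON vrev32q_u8 for the byte swap.

#include <emmintrin.h>
#include <stdint.h>

// Index an arbitrary buffer as an array of 128-bit vectors (assumed form).
#define casti_v128u32( p, i )  ( ((__m128i*)(p))[ (i) ] )

// Byte-swap each 32-bit lane (LE <-> BE), SSE2-only sketch.
static inline __m128i v128_bswap32_sketch( __m128i x )
{
   x = _mm_or_si128( _mm_slli_epi16( x, 8 ), _mm_srli_epi16( x, 8 ) );      // swap bytes within 16-bit pairs
   return _mm_or_si128( _mm_slli_epi32( x, 16 ), _mm_srli_epi32( x, 16 ) ); // swap 16-bit halves
}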

View File

@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;
@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
#endif
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;

View File

@@ -319,7 +319,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
for (u = 0; u < 5; u ++)
casti_m128i( dst, u ) = sc->val[u];
casti_v128u32( dst, u ) = sc->val[u];
}
#endif

View File

@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
v128_ovly v;
for ( int l = 0; l < 4; l++ )
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
X[i] = v128_xor( X[i], v.m128 );
X[i] = v128_xor( X[i], v.v128 );
}
xor_salsa8_4way( &X[ 0], &X[16] );
@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
y[0].m128 = X0;
y[1].m128 = X1;
y[2].m128 = X2;
y[3].m128 = X3;
y[0].v128 = X0;
y[1].v128 = X1;
y[2].v128 = X2;
y[3].v128 = X3;
z[0].u32[0] = y[0].u32[0];
z[0].u32[3] = y[1].u32[0];
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
z[3].u32[1] = y[2].u32[3];
z[3].u32[0] = y[3].u32[3];
B[0] = v128_add32( B[0], z[0].m128 );
B[1] = v128_add32( B[1], z[1].m128 );
B[2] = v128_add32( B[2], z[2].m128 );
B[3] = v128_add32( B[3], z[3].m128 );
B[0] = v128_add32( B[0], z[0].v128 );
B[1] = v128_add32( B[1], z[1].v128 );
B[2] = v128_add32( B[2], z[2].v128 );
B[3] = v128_add32( B[3], z[3].v128 );
#endif
@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
/*
v128_ovly ya[4], za[4], yb[4], zb[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
za[3].u32[3] = ya[0].u32[3];
zb[3].u32[3] = yb[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
*/
}
@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
yc[0].m128 = XC[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
yc[1].m128 = XC[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
yc[2].m128 = XC[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
yc[3].m128 = XC[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
yc[0].v128 = XC[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
yc[1].v128 = XC[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
yc[2].v128 = XC[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
yc[3].v128 = XC[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
zb[3].u32[3] = yb[0].u32[3];
zc[3].u32[3] = yc[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XC[0] = zc[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XC[1] = zc[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XC[2] = zc[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XC[0] = zc[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XC[1] = zc[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XC[2] = zc[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
XC[3] = zc[3].v128;
*/
}
@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
xf = (B[15] ^= C[15]);
#define ROL32( a, c ) ror32( a, c )
#define ROL32( a, c ) rol32( a, c )
#define ADD32( a, b ) ( (a)+(b) )
#define XOR( a, b ) ( (a)^(b) )
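
The .m128 to .v128 member renames in this file indicate the v128_ovly overlay union now uses an arch-neutral name for its vector member. Its assumed shape, inferred from the usage above and given for reference only:

#include <stdint.h>
#include <emmintrin.h>
typedef __m128i v128_t;

typedef union           // assumed layout
{
   v128_t   v128;       // whole 128-bit vector (formerly .m128)
   uint32_t u32[4];     // per-lane 32-bit access, as used in the shuffles
} v128_ovly;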

View File

@@ -1481,7 +1481,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif

View File

@@ -74,8 +74,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
memset( pad, 0x36, 64*4 );
for ( i = 0; i < Klen; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->ictx, pad, 64 );
@@ -83,8 +83,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
sha256_4way_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
}
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_m128i( T, k ) = _mm_xor_si128( casti_m128i( T, k ),
casti_m128i( U, k ) );
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
/* Copy as many bytes as necessary into buf. */

View File

@@ -569,8 +569,8 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
__m128i STATE0, STATE1, MSG, TMP;
// Load initial values
TMP = casti_m128i( istate, 0 );
STATE1 = casti_m128i( istate, 1 );
TMP = casti_v128u32( istate, 0 );
STATE1 = casti_v128u32( istate, 1 );
TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH
@@ -578,17 +578,17 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH
// Save current hash
casti_m128i( sstate, 0 ) = STATE0;
casti_m128i( sstate, 1 ) = STATE1;
casti_v128u32( sstate, 0 ) = STATE0;
casti_v128u32( sstate, 1 ) = STATE1;
// Rounds 0 to 3
MSG = casti_m128i( msg, 0 );
MSG = casti_v128u32( msg, 0 );
TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
MSG = _mm_add_epi32( MSG, TMP );
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
MSG = _mm_shuffle_epi32( MSG, 0x0E );
casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_m128i( ostate, 1 ) = STATE1;
casti_v128u32( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_v128u32( ostate, 1 ) = STATE1;
}
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
@@ -601,22 +601,22 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
STATE0_X = casti_m128i( state_mid_X, 0 );
STATE1_X = casti_m128i( state_mid_X, 1 );
STATE0_Y = casti_m128i( state_mid_Y, 0 );
STATE1_Y = casti_m128i( state_mid_Y, 1 );
STATE0_X = casti_v128u32( state_mid_X, 0 );
STATE1_X = casti_v128u32( state_mid_X, 1 );
STATE0_Y = casti_v128u32( state_mid_Y, 0 );
STATE1_Y = casti_v128u32( state_mid_Y, 1 );
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMSG0_X = casti_v128u32( msg_X, 0 );
TMSG0_Y = casti_v128u32( msg_Y, 0 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
// Rounds 4 to 7
TMSG1_X = casti_m128i( msg_X, 1 );
TMSG1_Y = casti_m128i( msg_Y, 1 );
TMSG1_X = casti_v128u32( msg_X, 1 );
TMSG1_Y = casti_v128u32( msg_Y, 1 );
TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
@@ -638,8 +638,8 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );
// Rounds 12 to 15
TMSG3_X = casti_m128i( msg_X, 3 );
TMSG3_Y = casti_m128i( msg_Y, 3 );
TMSG3_X = casti_v128u32( msg_X, 3 );
TMSG3_Y = casti_v128u32( msg_Y, 3 );
TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
@@ -867,20 +867,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
// Add saved state to new state
STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) );
STATE0_X = _mm_add_epi32( STATE0_X, casti_v128u32( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_v128u32( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_v128u32( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_v128u32( state_save_Y, 1 ) );
// Unshuffle & save state
TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA
TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG
STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
casti_v128u32( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_v128u32( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_v128u32( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_v128u32( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
}
#endif // SHA

View File

@@ -8,14 +8,14 @@ void sha256d( void *hash, const void *data, int len )
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;

View File

@@ -500,10 +500,10 @@ bool register_sha256dt_algo( algo_gate_t* gate )
#if defined(SHA256DT_16X32)
gate->scanhash = (void*)&scanhash_sha256dt_16x32;
#elif defined(SHA256DT_X86_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_x86_x2sha;
#elif defined(SHA256DT_NEON_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_neon_x2sha;
#elif defined(SHA256DT_8X32)
gate->scanhash = (void*)&scanhash_sha256dt_8x32;

View File

@@ -6,9 +6,10 @@ bool register_sha256t_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256t_16way;
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_NEON_SHA2)
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_neon_sha2;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
@@ -28,7 +29,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
//#elif defined(SHA256T_SHA)
// gate->optimizations = SHA_OPT;
// gate->optimizations = SHA256_OPT;
// gate->scanhash = (void*)&scanhash_sha256q;
// gate->hash = (void*)&sha256q_hash;
#elif defined(SHA256T_8WAY)

View File

@@ -71,12 +71,13 @@ static const uint64_t K512[80] =
// SHA-512 implemented using SHA512 CPU extension.
// Experimental. Not tested. Not reviewed. Compile tested only.
// Experimental. Not supported. Not tested. Not reviewed. Compile tested only.
// Modelled after noloader sha256 implementation, replacing 4x32 bit
// instructions with equivalent 4x64 bit instructions and increasing rounds
// to 80.
// Needs GCC-14 for compilation.
// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution.
// Modelled after noloader sha256 implementation.
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
@@ -571,6 +572,20 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
#endif
/*
#if defined(__ARM_FEATURE_NEON) && defined(__ARM_FEATURE_SHA512)
uint64x2_t sha512_compile_test( uint64x2_t test )
{
test = vsha512hq_u64( test, test, test );
test = vsha512h2q_u64( test, test, test );
test = vsha512su0q_u64( test, test );
test = vsha512su1q_u64( test, test, test );
return test;
}
#endif
*/
#if defined(SIMD512)

View File

@@ -300,11 +300,12 @@ static inline __m512i v512_mult_x5( const __m512i x )
#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
v512_mult_x3( mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
xa0 = mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ); \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_16 do { \
@@ -905,11 +906,12 @@ static inline __m256i v256_mult_x5( const __m256i x )
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
v256_mult_x3( mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
xa0 = mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ); \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_8 do { \

View File

@@ -62,8 +62,6 @@ union u32 {
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif
//TODO aarch support for widening multiply
#if defined(__SSE2__)
#define vec_and(x,y) ((x)&(y))

View File

@@ -8,15 +8,15 @@ bool register_skein_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined(SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#elif defined(SKEIN_2WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_2x64;
gate->hash = (void*)&skeinhash_2x64;
#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif

View File

@@ -240,10 +240,10 @@ void sm3_8way_close( void *cc, void *dst )
#if defined(__SSE2__)
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \
mm128_rol_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
mm128_rol_32( x, 23 ) ) )
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 9 ), \
v128_rol32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 15 ), \
v128_rol32( x, 23 ) ) )
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
@@ -273,13 +273,13 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm128_bswap_32( block[j] );
W[j] = v128_bswap32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
W[ j-9 ] ),
mm128_rol_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm128_rol_32( W[ j-13 ], 7 ),
v128_rol32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( v128_rol32( W[ j-13 ], 7 ),
W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
@@ -288,19 +288,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
T = _mm_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
mm128_rol_var_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm128_rol_32( B, 9 );
C = v128_rol32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm128_rol_32( F, 19 );
G = v128_rol32( F, 19 );
F = E;
E = P0( TT2 );
}
@@ -308,19 +308,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
mm128_rol_var_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm128_rol_32( B, 9 );
C = v128_rol32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm128_rol_32( F, 19 );
G = v128_rol32( F, 19 );
F = E;
E = P0( TT2 );
}
@@ -408,14 +408,14 @@ void sm3_4way_close( void *cc, void *dst )
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm128_bswap_32(
count[0] = v128_bswap32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
count[1] = v128_bswap32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm128_bswap_32( ctx->digest[i] );
hash[i] = v128_bswap32( ctx->digest[i] );
}
#endif
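
The P0/P1 macros above are the standard SM3 linear permutations applied lane-wise; a scalar reference makes the rotation counts easy to check:

#include <stdint.h>

static inline uint32_t rol32( uint32_t x, unsigned c )   // c in 1..31
{
   return ( x << c ) | ( x >> ( 32 - c ) );
}

static inline uint32_t sm3_p0( uint32_t x ) { return x ^ rol32( x, 9 )  ^ rol32( x, 17 ); }
static inline uint32_t sm3_p1( uint32_t x ) { return x ^ rol32( x, 15 ) ^ rol32( x, 23 ); }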

View File

@@ -137,53 +137,8 @@ void verthash_info_free(verthash_info_t* info)
#define VH_N_INDEXES 4096
#define VH_BYTE_ALIGNMENT 16
static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
{
return (a ^ b) * 0x1000193;
}
#define fnv1a( a, b ) ( ( (a) ^ (b) ) * 0x1000193 )
#if 0
static void rotate_indexes( uint32_t *p )
{
#if defined(__AVX2__)
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
{
__m256i *px = (__m256i*)p + x;
px[0] = mm256_rol_32( px[0], 1 );
px[1] = mm256_rol_32( px[1], 1 );
px[2] = mm256_rol_32( px[2], 1 );
px[3] = mm256_rol_32( px[3], 1 );
px[4] = mm256_rol_32( px[4], 1 );
px[5] = mm256_rol_32( px[5], 1 );
px[6] = mm256_rol_32( px[6], 1 );
px[7] = mm256_rol_32( px[7], 1 );
}
#else
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
{
__m128i *px = (__m128i*)p0_index + x;
px[0] = mm128_rol_32( px[0], 1 );
px[1] = mm128_rol_32( px[1], 1 );
px[2] = mm128_rol_32( px[2], 1 );
px[3] = mm128_rol_32( px[3], 1 );
px[4] = mm128_rol_32( px[4], 1 );
px[5] = mm128_rol_32( px[5], 1 );
px[6] = mm128_rol_32( px[6], 1 );
px[7] = mm128_rol_32( px[7], 1 );
}
#endif
/*
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
*/
}
#endif
// Vectorized and targeted version of fnv1a
#if defined (__AVX2__)
@@ -191,7 +146,7 @@ static void rotate_indexes( uint32_t *p )
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#define MULXOR \
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -229,7 +184,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
MULXOR; \
}
// subsequent passes rotate by r on demand, no need for mass rotate
// subsequent passes rotate by r
#define ROUND_r( r ) \
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
@@ -243,8 +198,8 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
void verthash_hash( const void *blob_bytes, const size_t blob_size,
const void *input, void *output )
{
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (32)));
const uint32_t *blob = (const uint32_t*)blob_bytes;
uint32_t accumulator = 0x811c9dc5;
const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
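
The fnv1a inline function above was collapsed into a macro; its semantics are the standard 32-bit FNV-1a step, seeded with the 0x811c9dc5 offset basis seen in verthash_hash(). A scalar worked example, purely illustrative:

#include <stddef.h>
#include <stdint.h>

static uint32_t fnv1a_words( const uint32_t *w, size_t n )
{
   uint32_t acc = 0x811c9dc5u;               // FNV-1a 32-bit offset basis
   for ( size_t i = 0; i < n; i++ )
      acc = ( acc ^ w[i] ) * 0x1000193u;     // xor, then multiply by the FNV prime
   return acc;
}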

View File

@@ -91,8 +91,8 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
int scanhash_verthash( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -101,9 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
for (int i = 0; i < 20; i++)
edata[i] = bswap_32( pdata[i] );
// v128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
verthash_sha3_512_prehash_72( edata );
do

View File

@@ -204,11 +204,11 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
const __m512i eight = _mm512_set1_epi64( 8 );
const bool bench = opt_benchmark;
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -372,11 +372,11 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );

View File

@@ -318,7 +318,7 @@ bool register_minotaur_algo( algo_gate_t* gate )
gate->hash = (void*)&minotaur_hash;
gate->miner_thread_init = (void*)&initialize_torture_garden;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA256_OPT;
return true;
};

View File

@@ -418,11 +418,11 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
@@ -681,11 +681,11 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -895,11 +895,11 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );

View File

@@ -31,7 +31,7 @@ bool register_x22i_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT
| AVX512_OPT | VAES_OPT | NEON_OPT;
return true;
};
@@ -48,7 +48,7 @@ bool register_x25x_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x25x;
gate->hash = (void*)&x25x_hash;
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT |
AVX512_OPT | VAES_OPT | NEON_OPT;
InitializeSWIFFTX();
return true;

View File

@@ -71,7 +71,7 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce,
bool register_yescryptr8g_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower_r8g;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;

View File

@@ -162,7 +162,7 @@ bool register_yespower_algo( algo_gate_t* gate )
if ( yespower_params.pers )
applog( LOG_NOTICE,"Key= \"%s\"\n", yespower_params.pers );
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -180,7 +180,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
yespower_params.r = 16;
yespower_params.pers = NULL;
yespower_params.perslen = 0;
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -195,7 +195,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -233,7 +233,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -251,7 +251,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;
@@ -269,7 +269,7 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
bool register_yescryptr32_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_yespower;
#if (__SSE2__) || defined(__aarch64__)
gate->hash = (void*)&yespower_hash;

View File

@@ -4,55 +4,45 @@
# during development. However the information contained may provide compilation
# tips to users.
rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
# armv9 needs gcc-13
# -march=armv9-a includes SVE2 but no crypto
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv9-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-aes-sha3
mv cpuminer cpuminer-armv9-crypto-sha3
make clean || echo clean
CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv9-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv9-aes-sha3-sve2
mv cpuminer cpuminer-armv9-crypto
make clean || echo clean
CFLAGS="-O3 -march=armv8.2-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8.2-aes-sha3-sve2
mv cpuminer cpuminer-armv9
# SVE2 available in armv8.5
make clean || echo clean
CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes-sha2-sve2
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4
make clean || echo clean
CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes-sha2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto+sha2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-sha2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto+aes -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes
mv cpuminer cpuminer-armv8.4-crypto-sha3
make clean || echo clean
rm -f config.status

View File

@@ -2,7 +2,7 @@
#
# make clean and rm all the targeted executables.
rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null
rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-aes-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null

configure (vendored)
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.4.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='24.2'
PACKAGE_STRING='cpuminer-opt 24.2'
PACKAGE_VERSION='24.4'
PACKAGE_STRING='cpuminer-opt 24.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 24.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 24.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 24.4:";;
esac
cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 24.2
cpuminer-opt configure 24.4
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 24.2, which was
It was created by cpuminer-opt $as_me 24.4, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='24.2'
VERSION='24.4'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 24.2, which was
This file was extended by cpuminer-opt $as_me 24.4, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 24.2
cpuminer-opt config.status 24.4
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [24.2])
AC_INIT([cpuminer-opt], [24.4])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

configure~

File diff suppressed because it is too large.

View File

@@ -1912,6 +1912,8 @@ static bool wanna_mine(int thr_id)
{
bool state = true;
#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
if (opt_max_temp > 0.0)
{
float temp = cpu_temp(0);
@@ -1921,8 +1923,12 @@ static bool wanna_mine(int thr_id)
applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
state = false;
}
if ( temp > hi_temp ) hi_temp = temp;
}
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
#endif
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
{
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
applog(LOG_NOTICE, "network diff too high, waiting...");
@@ -2828,9 +2834,9 @@ out:
static void show_credits()
{
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" ********** \n");
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
printf(" with AVX512, SHA and VAES extensions by JayDDee.\n");
printf(" with AVX512, SHA, AES and NEON extensions by JayDDee.\n");
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
}
@@ -2840,40 +2846,47 @@ static void show_credits()
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
bool cpu_has_aarch64 = cpu_arch_aarch64();
bool cpu_has_x86_64 = cpu_arch_x86_64();
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes_ni(); // x86_64 or AArch64 AES
bool cpu_has_vaes = has_vaes();
bool cpu_has_sha256 = has_sha(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64
bool sw_has_neon = false; // AArch64
// bool sw_has_sve = false; // AArch64
// bool sw_has_sve2 = false; // AArch64
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool cpu_has_aarch64 = cpu_arch_aarch64();
bool cpu_has_x86_64 = cpu_arch_x86_64();
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_neon = has_neon(); // AArch64
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
bool cpu_has_sve2 = has_sve2(); // AArch64 only
bool cpu_has_sme = has_sme();
bool cpu_has_sme2 = has_sme2();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
bool cpu_has_vaes = has_vaes(); // X86_64 only
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64 version
bool sw_has_neon = false; // AArch64
bool sw_has_sve = false; // AArch64
bool sw_has_sve2 = false; // AArch64
bool sw_has_sme = false;
bool sw_has_sme2 = false;
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_avx10_256 = false;
bool sw_has_avx10_512 = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_sha256 = false; // x86_64 or AArch64 SHA2
bool sw_has_sha512 = false; // x86_64 or AArch64 SHA3
set_t algo_features = algo_gate.optimizations;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
@@ -2881,7 +2894,7 @@ static bool cpu_capability( bool display_only )
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
bool use_sse2;
@@ -2896,7 +2909,6 @@ static bool cpu_capability( bool display_only )
bool use_neon;
bool use_none;
// x86_64
#if defined(__x86_64__)
sw_has_x86_64 = true;
#elif defined(__aarch64__)
@@ -2908,6 +2920,7 @@ static bool cpu_capability( bool display_only )
sw_arm_arch = __ARM_ARCH;
#endif
#endif
// x86_64_only
#if defined(__SSE2__)
sw_has_sse2 = true;
#endif
@@ -2935,7 +2948,7 @@ static bool cpu_capability( bool display_only )
#if defined(__AVX10_1_512__)
sw_has_avx10_512 = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
sw_has_aes = true;
#endif
@@ -2945,18 +2958,25 @@ static bool cpu_capability( bool display_only )
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
sw_has_sha256 = true;
#endif
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA3)
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA512)
sw_has_sha512 = true;
#endif
// AArch64 only
#if defined(__ARM_NEON)
sw_has_neon = true;
#endif
// #if defined(__ARM_FEATURE_SVE)
// sw_has_sve = true;
// #endif
// #if defined(__ARM_FEATURE_SVE2)
// sw_has_sve2 = true;
// #endif
#if defined(__ARM_FEATURE_SVE)
sw_has_sve = true;
#endif
#if defined(__ARM_FEATURE_SVE2)
sw_has_sve2 = true;
#endif
#if defined(__ARM_FEATURE_SME)
sw_has_sme = true;
#endif
#if defined(__ARM_FEATURE_SME2)
sw_has_sme2 = true;
#endif
cpu_brand_string( cpu_brand );
printf( "CPU: %s\n", cpu_brand );
@@ -2986,27 +3006,32 @@ static bool cpu_capability( bool display_only )
printf("CPU features: ");
if ( cpu_has_x86_64 )
{
printf( " x86_64" );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
}
else if ( cpu_has_aarch64 ) printf( " AArch64 NEON" ); // NEON assumed
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
else if ( cpu_has_aarch64 )
{
if ( cpu_has_neon ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
avx10_version(), avx10_vector_length() );
printf("\nSW features: ");
if ( sw_has_x86_64 )
{
printf( " x86_64" );
if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
@@ -3019,12 +3044,12 @@ static bool cpu_capability( bool display_only )
}
else if ( sw_has_aarch64 )
{
printf( " AArch64" );
if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch );
if ( sw_has_neon ) printf( " NEON" );
// if ( sw_has_sve ) printf( " SVE" );
// else if ( sw_has_sve2 ) printf( " SVE2" );
if ( sw_has_sve2 ) printf( " SVE2" );
else if ( sw_has_sve ) printf( " SVE" );
if ( sw_has_sme2 ) printf( " SME2" );
else if ( sw_has_sme ) printf( " SME" );
}
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
@@ -3052,35 +3077,6 @@ static bool cpu_capability( bool display_only )
if ( display_only ) return true;
/*
// Check for CPU and build incompatibilities
if ( !cpu_has_sse2 && !cpu_has_aarch64 )
{
printf( "A CPU with SSE2 is required to use cpuminer-opt\n" );
return false;
}
if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
{
printf( "The SW build requires a CPU with AES and AVX2!\n" );
return false;
}
if ( sw_has_sse42 && !cpu_has_sse42 )
{
printf( "The SW build requires a CPU with SSE4.2!\n" );
return false;
}
if ( sw_has_aes && !cpu_has_aes )
{
printf( "The SW build requires a CPU with AES!\n" );
return false;
}
if ( sw_has_sha && !cpu_has_sha )
{
printf( "The SW build requires a CPU with SHA!\n" );
return false;
}
*/
// Determine mining options
use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
@@ -3096,13 +3092,10 @@ static bool cpu_capability( bool display_only )
|| use_avx2 || use_sha256 || use_vaes || use_sha512 || use_neon );
// Display best options
applog_nl( "Enabled optimizations:" );
if ( use_none ) printf( " none" );
else
if ( !use_none )
{
// if ( cpu_has_aarch64 ) printf( " AArch64");
// else
// printf( " x86_64" );
applog_nl( "Enabled optimizations:" );
if ( use_neon ) printf( " NEON" );
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
@@ -3112,15 +3105,12 @@ static bool cpu_capability( bool display_only )
else if ( use_aes ) printf( " AES" );
if ( use_sha512 ) printf( " SHA512" );
else if ( use_sha256 ) printf( " SHA256" );
if ( use_neon ) printf( " NEON" );
printf( "\n" );
}
printf( "\n" );
return true;
}
void show_version_and_exit(void)
{
printf("\n built on " __DATE__
@@ -3130,7 +3120,6 @@ void show_version_and_exit(void)
" with GCC");
printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#endif
printf(" features:"
#if defined(USE_ASM) && defined(__i386__)
" i386"

View File

@@ -2436,7 +2436,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );

View File

@@ -38,7 +38,6 @@
//
// __m128i -> v128_t
// _mm_ -> v128_
// mm128_ -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element size. They have no effect on x86_64.
@@ -145,10 +144,8 @@
typedef union
{
v128_t v128;
__m128i m128;
uint32_t u32[4];
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_ovly m128_ovly
} __attribute__ ((aligned (16))) v128_ovly;
// use for immediate constants, use load1 for mem.
#define v128_64 _mm_set1_epi64x
@@ -168,7 +165,12 @@ typedef union
// compiler to exploit new features to produce optimum code.
// Currently only used internally and by Luffa.
static inline __m128i mm128_mov64_128( const uint64_t n )
#define v128_mov64 _mm_cvtsi64_si128
#define v128_mov32 _mm_cvtsi32_si128
/*
static inline __m128i v128_mov64( const uint64_t n )
{
__m128i a;
#if defined(__AVX__)
@@ -178,10 +180,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#endif
return a;
}
//#define v128_mov64( u64 ) mm128_mov64_128( u64 )
static inline __m128i mm128_mov32_128( const uint32_t n )
static inline __m128i v128_mov32( const uint32_t n )
{
__m128i a;
#if defined(__AVX__)
@@ -191,11 +191,14 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#endif
return a;
}
*/
// broadcast lane 0 to all lanes
#define v128_bcast64(v) _mm_shuffle_epi32( v, 0x44 )
#define v128_bcast32(v) _mm_shuffle_epi32( v, 0x00 )
// Not used, test first
/*
#if defined(__AVX2__)
#define v128_bcast16(v) _mm_broadcastw_epi16(v)
@@ -203,9 +206,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#else
#define v128_bcast16(v) \
v128_bcast32( v128_or( v128_sl32( v, 16 ), v ) )
_mm_shuffle_epi32( _mm_shufflelo_epi16( v, 0x00 ), 0x00 )
#endif
*/
// Broadcast lane l to all lanes
#define v128_duplane64( v, l ) \
@@ -221,28 +225,15 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
// Pseudo constants
#define v128_zero _mm_setzero_si128()
#if defined(__SSE4_1__)
// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0)
// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }
#endif
// Bitwise compare, return 1 if all bits are set.
#define v128_cmpeq1(v) _mm_test_all_ones(v)
#define v128_one mm128_mov64_128(1)
//#define v128_one v128_mov64(1)
#define v128_one _mm_cvtsi64_si128( 1 )
// ASM avoids the need to initialize the return variable, which would otherwise draw a compiler warning.
// Macro abstracts function parentheses to look like an identifier.
static inline __m128i v128_neg1_fn()
{
__m128i a;
#if defined(__AVX__)
#if defined(__AVX__)
asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
#else
asm( "pcmpeqq %0, %0\n\t" : "=x"(a) );
@@ -273,7 +264,6 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
#define casti_m128i casti_v128 // deprecated
#define casti_v128u64 casti_v128
#define casti_v128u32 casti_v128
#define casti_v128u16 casti_v128
@@ -284,13 +274,14 @@ static inline __m128i v128_neg1_fn()
#define casto_v128(p,o) (((__m128i*)(p))+(o))
#if defined(__SSE4_1__)
#define v128_get64( v, l ) _mm_extract_epi64( v, l )
#define v128_get32( v, l ) _mm_extract_epi32( v, l )
#define v128_get16( v, l ) _mm_extract_epi16( v, l )
#define v128_get8( v, l ) _mm_extract_epi8( v, l )
#define v128_put64( v, u64, l ) _mm_insert_epi64( v, u64, l )
#define v128_put32( v, u32, l ) _mm_insert_epi64( v, u32, l )
#define v128_put32( v, u32, l ) _mm_insert_epi32( v, u32, l )
#define v128_put16( v, u16, l ) _mm_insert_epi16( v, u16, l )
#define v128_put8( v, u8, l ) _mm_insert_epi8( v, u8, l )
@@ -327,7 +318,7 @@ static inline __m128i v128_neg1_fn()
/*
// Copy i32 to element c of dest and copy remaining elements from v.
#define v128_put32( v, i32, c ) \
v128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
v128_xim_32( v, v128_mov32( i32 ), (c)<<4 )
*/
@@ -401,7 +392,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define memcpy_128 v128_memcpy
// Boolean operations
#if defined(VL256)
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.
// ~v1 | v0
#define v128_ornot( v1, v0 ) _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )
@@ -435,7 +429,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#else
#define v128_ornot( v1, v0 ) _mm_or_si128( v1, v128_not( v0 ) )
#define v128_ornot( v1, v0 ) _mm_or_si128( v128_not( v1 ), v0 )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
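The operand order is exactly what the v24.4 ornot fix addresses: the first argument is complemented, not the second. A scalar sketch of the intended semantics (hypothetical helper, not part of the repo):

#include <stdint.h>
#include <stdio.h>

// ~v1 | v0, mirroring v128_ornot and the 0xcf ternary-logic form above.
static uint64_t ornot64( uint64_t v1, uint64_t v0 ) { return ~v1 | v0; }

int main()
{
   uint64_t v1 = 0xff00ff00ff00ff00ULL, v0 = 0x0f0f0f0f0f0f0f0fULL;
   // prints 0fff0fff0fff0fff: ~v1 = 0x00ff00ff00ff00ff, OR'd with v0.
   printf( "%016llx\n", (unsigned long long) ornot64( v1, v0 ) );
   return 0;
}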
@@ -463,17 +457,13 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm128_movmask_64( v ) \
#define v128_movmask64( v ) \
_mm_movemask_pd( (__m128d)(v) )
#define v128_movmask64 mm128_movmask_64
#define mm128_movmask_32( v ) \
#define v128_movmask32( v ) \
_mm_movemask_ps( (__m128)(v) )
#define v128_movmask32 mm128_movmask_32
//
// Bit rotations
// Shuffle 16 bit elements within 64 bit lanes.
#define v128_shuffle16( v, c ) \
_mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )
@@ -483,6 +473,9 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
//
// Bit rotations
// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -608,10 +601,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#endif
// deprecated
#define mm128_rol_32 v128_rol32
// ror( v1 ^ v0, n )
// (v1 ^ v0) >>> n, ARM NEON has optimized version
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
/* not used
@@ -689,7 +679,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
/* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \
v128_shuffle32( v, mm128_movmask_32( vmask ) )
v128_shuffle32( v, v128_movmask32( vmask ) )
*/
#define v128_shuffle8 _mm_shuffle_epi8
@@ -710,15 +700,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered
#define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred
#define v128_rev32(v) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev16(v) v128_shuffle16( v, 0x1b )
// rotate vector elements
#define v128_shuflr32(v) _mm_shuffle_epi32( v, 0x39 )
#define v128_shufll32(v) _mm_shuffle_epi32( v, 0x93 )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )
// Endian byte swap.
#if defined(__SSSE3__)
@@ -734,15 +720,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
// deprecated
#define mm128_bswap_32 v128_bswap32
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 ) )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) \
#define v128_block_bswap64( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -754,8 +737,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 v128_block_bswap64
#define v128_block_bswap64_1024( d, s ) \
{ \
@@ -779,7 +761,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) \
#define v128_block_bswap32( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -791,11 +773,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 v128_block_bswap32
#define mm128_block_bswap32_128( d, s ) \
#define v128_block_bswap32_128( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -840,7 +821,6 @@ static inline v128_t v128_bswap32( __m128i v )
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}
#define mm128_bswap_32 v128_bswap32
static inline v128_t v128_bswap16( __m128i v )
{
@@ -849,7 +829,7 @@ static inline v128_t v128_bswap16( __m128i v )
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap64( s[0] );
d[1] = v128_bswap64( s[1] );
@@ -860,9 +840,8 @@ static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
d[6] = v128_bswap64( s[6] );
d[7] = v128_bswap64( s[7] );
}
#define v128_block_bswap64_512 mm128_block_bswap_64
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
@@ -882,7 +861,7 @@ static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
d[15] = v128_bswap64( s[15] );
}
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap32( s[0] );
d[1] = v128_bswap32( s[1] );
@@ -893,10 +872,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
d[6] = v128_bswap32( s[6] );
d[7] = v128_bswap32( s[7] );
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 v128_block_bswap32
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
@@ -918,9 +896,6 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
@@ -932,25 +907,27 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#else
#define v128_alignr8( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) )
_mm_or_si128( _mm_slli_si128( hi, 16-(c) ), _mm_srli_si128( lo, c ) )
// The c arg is trivial, the only valid value is 1.
#define v128_alignr64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
_mm_or_si128( _mm_slli_si128( hi, 16-((c)*8) ), _mm_srli_si128( lo, (c)*8 ) )
#define v128_alignr32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
_mm_or_si128( _mm_slli_si128( hi, 16-((c)*4) ), _mm_srli_si128( lo, (c)*4 ) )
#endif
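As a sanity check of the corrected SSE2 emulation, a standalone sketch (the macro is copied so the snippet compiles on its own; expected output follows Intel's alignr semantics of concatenating hi:lo and shifting right):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

#define v128_alignr64( hi, lo, c ) \
   _mm_or_si128( _mm_slli_si128( hi, 16-((c)*8) ), _mm_srli_si128( lo, (c)*8 ) )

int main()
{
   // lane 0 listed first: hi = { 0x2222.., 0x1111.. }, lo = { 0x4444.., 0x3333.. }
   __m128i hi = _mm_set_epi64x( 0x1111111111111111ULL, 0x2222222222222222ULL );
   __m128i lo = _mm_set_epi64x( 0x3333333333333333ULL, 0x4444444444444444ULL );
   uint64_t out[2];
   _mm_storeu_si128( (__m128i*)out, v128_alignr64( hi, lo, 1 ) );
   // expected: out[0] = 0x3333... (high lane of lo), out[1] = 0x2222... (low lane of hi)
   printf( "%016llx %016llx\n", (unsigned long long)out[0],
                                (unsigned long long)out[1] );
   return 0;
}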
// blend using vector mask
#if defined(__SSE4_1__)
// Bytewise using sign bit of each byte element of mask
// Bytewise using sign bit of each byte element of mask. Use full bitmask
// for compatibility with SSE2 & NEON.
#define v128_blendv _mm_blendv_epi8
#else
// Bitwise
// Bitwise, use only bytewise masks for compatibility with SSE4_1.
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
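The practical rule is that masks fed to v128_blendv should be full-element masks, e.g. compare results, so the SSE4.1 sign-bit blend, the SSE2 bitwise blend and NEON vbslq all agree. A standalone sketch using plain SSE2 intrinsics that mirrors the bitwise form above (function name is illustrative):

#include <emmintrin.h>

// Select b where a == 0, else keep a. The compare yields 0 / 0xffffffff
// per lane, the only kind of mask that behaves identically everywhere.
static __m128i blend_where_zero( __m128i a, __m128i b )
{
   __m128i mask = _mm_cmpeq_epi32( a, _mm_setzero_si128() );
   return _mm_or_si128( _mm_andnot_si128( mask, a ),
                        _mm_and_si128( mask, b ) );
}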

View File

@@ -73,10 +73,10 @@ typedef union
#else
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( mm128_mov64_128( i64 ) )
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) )
#define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \
_mm256_castsi128_si256( mm128_mov64_128( i64 ) ), 0x11 )
_mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
#endif
@@ -172,7 +172,7 @@ static inline __m256i mm256_not( const __m256i v )
#else
#define mm256_ornot( v1, v0 ) _mm256_or_si256( v1, mm256_not( v0 ) )
#define mm256_ornot( v1, v0 ) _mm256_or_si256( mm256_not( v1 ), v0 )
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
@@ -217,12 +217,11 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_movmask_32( v ) \
_mm256_movemask_ps( _mm256_castsi256_ps( v ) )
//
// Bit rotations.
// shuffle 16 bit elements within 64 bit lanes.
#define mm256_shuffle16( v, c ) \
_mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )
// reverse elements within lanes.
#define mm256_qrev32(v) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
@@ -242,6 +241,9 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
// Bit rotations.
// These should never be called directly by applications.
#define mm256_ror_64_avx2( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \

View File

@@ -185,6 +185,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Ternary logic uses 8 bit truth table to define any 3 input logical
// expression using any number or combinations of AND, OR, XOR, NOT.
// Macros with duplicate references to the same argument are
// not expression safe. Switch to inline function if required.
// ~v1 | v0
#define mm512_ornot( v1, v0 ) _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf )
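One way to read the 0xcf immediate (a sketch of the derivation, plus a scalar model that can be used to check truth-table constants):

#include <stdint.h>

// The result bit of ternarylogic for inputs (a,b,c) is bit (a*4 + b*2 + c)
// of the immediate. Here a = v1 and b = c = v0, so ~v1 | v0 needs bit 0
// (a=0,b=0), bit 3 (a=0,b=1) and bit 7 (a=1,b=1) set, and bit 4 (a=1,b=0)
// clear. 0xcf = 11001111b satisfies that; bits 1,2,5,6 are never selected
// because b and c are always equal in this macro.
static inline int ternlog_bit( int a, int b, int c, uint8_t imm )
{  return ( imm >> ( a*4 + b*2 + c ) ) & 1;  }
// For all a,b in {0,1}: ternlog_bit( a, b, b, 0xcf ) == ( (~a | b) & 1 ).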

View File

@@ -108,8 +108,12 @@ static inline uint32_t le162( const uint16_t u16 )
#define rol32 __rold
#define ror32 __rord
/* these don't seem to work
#elif defined(__aarch64__)
// Documentation is vague: ror exists but is ambiguous. Docs say it can
// operate on 32 or 64 bit registers. Assuming that is architecture specific
// and can only do 32 bit on a 32 bit arch. Rarely used so not a big issue.
static inline uint64_t ror64( uint64_t a, const int c )
{
uint64_t b;
@@ -125,6 +129,7 @@ static inline uint32_t ror32( uint32_t a, const int c )
return b;
}
#define rol32( a, c ) ror32( a, 32-(c) )
*/
#else
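If the inline asm stays disabled, a portable fallback that GCC and Clang typically lower to a single rotate instruction on both x86_64 and AArch64 (illustrative names, assumes 0 < c < width):

#include <stdint.h>

static inline uint64_t ror64_c( uint64_t a, const int c )
{  return ( a >> c ) | ( a << ( 64 - c ) );  }
#define rol64_c( a, c )  ror64_c( a, 64 - (c) )

static inline uint32_t ror32_c( uint32_t a, const int c )
{  return ( a >> c ) | ( a << ( 32 - c ) );  }
#define rol32_c( a, c )  ror32_c( a, 32 - (c) )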

View File

@@ -38,7 +38,9 @@
#define v128u8_load( p ) vld1q_u8( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u8( (uint8_t*)(p), v )
// load & set1 combined, doesn't work
// load & set1 combined. What if source is already loaded?
// Don't use, leave it up to the compiler to optimize.
// Same with vld1q_lane.
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
@@ -61,17 +63,13 @@
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
// returns low half, u64 undocumented, may not exist.
#define v128_mul64 vmulq_u64
// returns low half
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
// Widening multiply, align source elements with Intel
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
{
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
vget_low_u32( vcopyq_laneq_u32( v0, 1, v0, 2 ) ) );
}
// Widening multiply, realign source elements from x86_64 to NEON.
#define v128_mulw32( v1, v0 ) \
vmull_u32( vmovn_u64( v1 ), vmovn_u64( v0 ) )
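How the lanes line up with the x86_64 intrinsic (a sketch of the bookkeeping, little-endian lane numbering):

//   x86_64 : _mm_mul_epu32( a, b )  ->  { (u64)a0 * b0, (u64)a2 * b2 }
//   NEON   : vmull_u32( vmovn_u64( a ), vmovn_u64( b ) )
// vmovn_u64 keeps the low 32 bits of each 64 bit lane, i.e. 32 bit
// elements 0 and 2, so the two widening products land in the same lanes
// as the x86_64 version.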
// compare
#define v128_cmpeq64 vceqq_u64
@@ -315,7 +313,6 @@ static inline void v128_memset_zero( void *dst, const int n )
memset( dst, 0, n*16 );
}
static inline void v128_memset( void *dst, const void *src, const int n )
{
for( int i = 0; i < n; i++ )
@@ -373,7 +370,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
((uint8x16_t)(v)), c )
// ror( v1 ^ v0, n )
// ( v1 ^ v0 ) >>> n
#if defined(__ARM_FEATURE_SHA3)
#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n )
@@ -438,7 +435,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
// preferred.
// reverse elements in vector lanes
#define v128_qrev32 vrev64q_u32
@@ -460,7 +456,6 @@ static inline uint64x2_t v128_rev64( uint64x2_t v )
#define v128_swap64 v128_rev64 // grandfathered
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
#define v128_rev16(v) v128_rev64( v128_qrev16( v ) )
// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -469,12 +464,6 @@ static inline uint32x4_t v128_shuflr32( uint32x4_t v )
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
static inline uint16x8_t v128_shuflr16( uint16x8_t v )
{ return vextq_u16( v, v, 1 ); }
static inline uint16x8_t v128_shufll16( uint16x8_t v )
{ return vextq_u16( v, v, 7 ); }
// reverse bits in bytes, nothing like it in x86_64
#define v128_bitrev8 vrbitq_u8
@@ -496,7 +485,7 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u32( dst,6 ) = v128_bswap32( casti_v128u32( src,6 ) ); \
casti_v128u32( dst,7 ) = v128_bswap32( casti_v128u32( src,7 ) ); \
}
#define v128_block_bswap32_256( dst, src ) \
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_512( dst, src ) \
{ \
@@ -551,7 +540,8 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Bitwise blend using vector mask
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 )
#endif // __ARM_NEON

View File

@@ -16,11 +16,11 @@
#include "miner.h"
#include "simd-utils.h"
#if defined(__aarch64__) && !defined(__APPLE__)
#if defined(__aarch64__)
// for arm's "cpuid"
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <sys/prctl.h>
#endif
#ifndef WIN32
@@ -309,12 +309,59 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
#endif
}
#elif defined(__aarch64__) && !defined(__APPLE__)
#elif defined(__aarch64__)
// Always test whether the HWCAP macro is defined by the kernel headers
// before attempting to compile with it. If it isn't defined the feature
// can't be tested and won't be included in the build.
// This can occur when compiling with old kernel headers on a new CPU and
// can result in a suboptimal build.
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
#if defined(AT_HWCAP)
output[0] = getauxval(AT_HWCAP);
#else
output[0] = 0;
#endif
#if defined(AT_HWCAP2)
output[1] = getauxval(AT_HWCAP2);
#else
output[1] = 0;
#endif
/*
#define has(CAP, hwcap) !!((hwcap) & HWCAP_##CAP)
#define pr(CAP, hwcap) printf("%10s = %d\n", #CAP, has(CAP, hwcap))
unsigned long hwcaps = getauxval(AT_HWCAP);
printf("HWCAP = 0x%lx\n", hwcaps);
pr(FP, hwcaps);
pr(ASIMD, hwcaps);
pr(EVTSTRM, hwcaps);
pr(AES, hwcaps);
pr(PMULL, hwcaps);
pr(SHA1, hwcaps);
pr(SHA2, hwcaps);
pr(CRC32, hwcaps);
pr(ATOMICS, hwcaps);
pr(FPHP, hwcaps);
pr(ASIMDHP, hwcaps);
pr(CPUID, hwcaps);
pr(ASIMDRDM, hwcaps);
pr(JSCVT, hwcaps);
pr(FCMA, hwcaps);
pr(LRCPC, hwcaps);
pr(DCPOP, hwcaps);
pr(SHA3, hwcaps);
pr(SM3, hwcaps);
pr(SM4, hwcaps);
pr(ASIMDDP, hwcaps);
pr(SHA512, hwcaps);
pr(SVE, hwcaps);
*/
}
#else
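The convention the helpers below rely on: output[0] carries the AT_HWCAP bits and output[1] the AT_HWCAP2 bits. A hypothetical caller, guarded the same way in case old kernel headers don't define the flag:

#if defined(__aarch64__) && defined(HWCAP_ASIMD)
static inline bool has_asimd_example()   // illustrative only
{
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_ASIMD;
}
#endif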
@@ -447,6 +494,18 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#ifdef __ARM_FEATURE_SHA3
#warning "__ARM_FEATURE_SHA3"
#endif
#ifdef __ARM_FEATURE_SHA512
#warning "__ARM_FEATURE_SHA512"
#endif
#ifdef __ARM_FEATURE_SVE
#warning "__ARM_FEATURE_SVE"
#endif
#ifdef __ARM_FEATURE_SVE2
#warning "__ARM_FEATURE_SVE2"
#endif
#ifdef __ARM_FEATURE_SME
#warning "__ARM_FEATURE_SME"
#endif
*/
// GCC-14.1: the AVX512 macros are defined even when compiled with only
@@ -454,8 +513,8 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
// -mavx10.1-512 does it compile successfully.
// __EVEX512__ is set only when compiled with -mavx10.1-512.
// Adding -fno-evex512 doesn't help.
// Building with -mapxf fails to configure on a CPU without APX because it can
// run the test program.
// Building with -mapxf fails on a CPU without APX because configure can't
// run its test program.
/*
#ifdef __AVX10_1__
#warning "__AVX10_1__"
@@ -482,7 +541,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
// No technical need for this, the code won't run if false.
static inline bool cpu_arch_x86_64()
{
#if defined(__x86_64__)
@@ -515,11 +573,11 @@ static inline bool has_sse()
static inline bool has_sse2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE2_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE2_Flag;
#else
return false;
return false;
#endif
}
@@ -556,39 +614,11 @@ static inline bool has_sse42()
#endif
}
// There's no HWCAP for NEON, assume it's always true.
static inline bool has_neon()
{
#if defined(__aarch64__) && !defined(__APPLE__)
unsigned int cpu_info[4] = { 0 };
return cpu_info[0];
#else
return false;
#endif
}
static inline bool has_aes_ni()
{
#if defined(__x86_64__)
if ( has_sse2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
}
return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
#if defined(KERNEL_HWCAP_AES)
return true;
#else
return false;
#endif
/* unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_AES;
*/ }
return false;
#if defined(__aarch64__)
return true;
#else
return false;
#endif
@@ -616,54 +646,48 @@ static inline bool has_avx2()
#endif
}
static inline bool has_sha()
// Also ensure kernel supports feature
static inline bool has_sve()
{
#if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
#if defined(KERNEL_HWCAP_SHA2)
return true;
#if defined(__aarch64__) && defined(HWCAP_SVE)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SVE;
#else
return false;
#endif
/* unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA2;
*/ }
return false;
#else
return false;
return false;
#endif
}
static inline bool has_sha512()
static inline bool has_sve2()
{
#if defined(__x86_64__)
if ( has_avx2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EAX_Reg ] & SHA512_Flag;
}
return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA3;
}
return false;
#if defined(__aarch64__) && defined(HWCAP2_SVE2)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SVE2;
#else
return false;
return false;
#endif
}
static inline bool has_sme()
{
#if defined(__aarch64__) && defined(HWCAP2_SME)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SME;
#else
return false;
#endif
}
static inline bool has_sme2()
{
#if defined(__aarch64__) && defined(HWCAP2_SME2)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SME2;
#else
return false;
#endif
}
@@ -723,6 +747,47 @@ static inline bool has_avx512()
#endif
}
static inline bool has_vbmi()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
#else
return false;
#endif
}
static inline bool has_vbmi2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
#else
return false;
#endif
}
static inline bool has_aes()
{
#if defined(__x86_64__)
if ( has_sse2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_AES)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_AES;
#else
return false;
#endif
}
static inline bool has_vaes()
{
#if defined(__x86_64__)
@@ -738,25 +803,75 @@ static inline bool has_vaes()
#endif
}
static inline bool has_vbmi()
static inline bool has_sveaes()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
#if defined(__aarch64__) && defined(HWCAP2_SVEAES)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SVEAES;
#else
return false;
#endif
}
static inline bool has_vbmi2()
static inline bool has_sha256()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA2)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA2;
#else
return false;
return false;
#endif
}
static inline bool has_sha512()
{
#if defined(__x86_64__)
if ( has_avx2() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EAX_Reg ] & SHA512_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA512)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA512;
#else
return false;
#endif
}
// Arm only
static inline bool has_sha3()
{
#if defined(__aarch64__) && defined(HWCAP_SHA3)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[0] & HWCAP_SHA3;
#else
return false;
#endif
}
static inline bool has_svesha3()
{
#if defined(__aarch64__) && defined(HWCAP2_SVESHA3)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
return cpu_info[1] & HWCAP2_SVESHA3;
#else
return false;
#endif
}
@@ -815,10 +930,8 @@ static inline unsigned int avx10_version()
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask;
}
return 0;
#else
return 0;
#endif
return 0;
}
// also includes 256 & 128
@@ -831,10 +944,8 @@ static inline bool has_avx10_512()
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
}
return false;
#else
return false;
#endif
return false;
}
// Includes 128 but may not include 512
@@ -847,13 +958,11 @@ static inline bool has_avx10_256()
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
}
return false;
#else
return false;
#endif
return false;
}
// Maximum vector length
// AVX10 vector register length
static inline unsigned int avx10_vector_length()
{
#if defined(__x86_64__)
@@ -864,24 +973,28 @@ static inline unsigned int avx10_vector_length()
return cpu_info[ EBX_Reg ] & AVX10_512_Flag ? 512
: ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
}
return 0;
#else
return 0;
#endif
return 0;
}
// ARM SVE vector register length
static inline int sve_vector_length()
{
#if defined(__aarch64__)
if ( has_sve() )
return ( prctl( PR_SVE_GET_VL ) & PR_SVE_VL_LEN_MASK ) * 8;
#endif
return 0;
}
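For reference, prctl( PR_SVE_GET_VL ) returns the current vector length in bytes in its low bits, with control flags above them, so masking with PR_SVE_VL_LEN_MASK and scaling by 8 yields bits. A hypothetical call site:

   if ( has_sve() )
      printf( "SVE vector length: %d bits\n", sve_vector_length() );   // e.g. 128, 256 or 512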
static inline uint32_t cpuid_get_highest_function_number()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = {0};
cpuid( VENDOR_ID, 0, cpu_info);
return cpu_info[ EAX_Reg ];
#else
return 0;
#endif
return 0;
}
// out of date
@@ -962,9 +1075,7 @@ static inline void cpu_brand_string( char* s )
#elif defined(__arm__) || defined(__aarch64__)
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
sprintf( s, "ARM 64 bit CPU, HWCAP %08x", cpu_info[0] );
sprintf( s, "ARM 64 bit CPU" );
#else

util.c
View File

@@ -2239,7 +2239,7 @@ static bool stratum_benchdata(json_t *result, json_t *params, int thr_id)
#endif
cpu_bestfeature(arch, 16);
if (has_aes_ni()) strcat(arch, " NI");
if (has_aes()) strcat(arch, " NI");
cpu_getmodelid(vendorid, 32);
cpu_getname(cpuname, 80);