v24.5

v24.4
2025-09-17 23:44:27 +00:00 · 2024-09-13 14:14:57 -04:00 · 2024-07-01 00:33:19 -04:00
31 changed files with 475 additions and 386 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -166,7 +166,6 @@ cpuminer_SOURCES = \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite-hash-2way.c \
  algo/shavite/shavite-hash-4way.c \
-  algo/shavite/shavite.c \
  algo/simd/nist.c \
  algo/simd/vector.c \
  algo/simd/sph_simd.c \
--- a/13
+++ b/13
@@ -75,6 +75,19 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v24.5
+
+Fixed MinGW compile error after MSYS2 upgrade to GCC-14.2.
+#427: GBT: Improved handling of new work.
+Removed shavite3 algo.
+
+v24.4
+
+x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
+x86_64: fixed a bug in alignr macros for SSE2.
+ARM: CPU feature reporting enhancements.
+Some code cleanup.
+
 v24.3

 ARM: CPU feature detection and reporting is now working.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
-   gate->resync_threads          = (void*)&do_nothing;
-   gate->do_this_thread          = (void*)&return_true;
+//   gate->resync_threads          = (void*)&do_nothing;
+//   gate->do_this_thread          = (void*)&return_true;
   gate->longpoll_rpc_call       = (void*)&std_longpoll_rpc_call;
   gate->get_work_data_size      = (void*)&std_get_work_data_size;
   gate->optimizations           = EMPTY_SET;
@@ -340,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_SHA256T:      rc = register_sha256t_algo       ( gate ); break;
    case ALGO_SHA3D:        rc = register_sha3d_algo         ( gate ); break;
    case ALGO_SHA512256D:   rc = register_sha512256d_algo    ( gate ); break;
-    case ALGO_SHAVITE3:     rc = register_shavite_algo       ( gate ); break;
    case ALGO_SKEIN:        rc = register_skein_algo         ( gate ); break;
    case ALGO_SKEIN2:       rc = register_skein2_algo        ( gate ); break;
    case ALGO_SKUNK:        rc = register_skunk_algo         ( gate ); break;
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -165,10 +165,10 @@ char* ( *malloc_txs_request )   ( struct work* );
 void ( *set_work_data_endian )  ( struct work* );

 // Diverge mining threads
-bool ( *do_this_thread )        ( int );
+//bool ( *do_this_thread )        ( int );

 // After do_this_thread
-void ( *resync_threads )        ( int, struct work* );
+//void ( *resync_threads )        ( int, struct work* );

 json_t* ( *longpoll_rpc_call )  ( CURL*, int*, char* );

--- a/algo/groestl/groestl512-intr-4way.h
+++ b/algo/groestl/groestl512-intr-4way.h
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
  { \
    /* AddRoundConstant P1024 */\
    xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
-             casti_m128i( round_const_p, round_counter ) ) ); \
+             casti_v128u32( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK0 ); \
    xmm9  = _mm512_shuffle_epi8( xmm9,  SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    \
     /* AddRoundConstant P1024 */\
    xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
-             casti_m128i( round_const_p, round_counter+1 ) ) ); \
+             casti_v128u32( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
    xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
    xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
-                 casti_m128i( round_const_q, round_counter ) ) ); \
+                 casti_v128u32( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK1 );\
    xmm9  = _mm512_shuffle_epi8( xmm9,  SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
    xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
    xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
-             casti_m128i( round_const_q, round_counter+1 ) ) ); \
+             casti_v128u32( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
  { \
    /* AddRoundConstant P1024 */\
    xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
-             casti_m128i( round_const_p, round_counter ) ) ); \
+             casti_v128u32( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8  = _mm256_shuffle_epi8( xmm8,  SUBSH_MASK0_2WAY ); \
    xmm9  = _mm256_shuffle_epi8( xmm9,  SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    \
     /* AddRoundConstant P1024 */\
    xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
-             casti_m128i( round_const_p, round_counter+1 ) ) ); \
+             casti_v128u32( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
    xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
    xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
    xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
-                 casti_m128i( round_const_q, round_counter ) ) ); \
+                 casti_v128u32( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8  = _mm256_shuffle_epi8( xmm8,  SUBSH_MASK1_2WAY );\
    xmm9  = _mm256_shuffle_epi8( xmm9,  SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
    xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
    xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
-             casti_m128i( round_const_q, round_counter+1 ) ) ); \
+             casti_v128u32( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
    xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -47,25 +47,19 @@
  a1 = _mm_alignr_epi8( b, a1, 4 ); \
 }

-#elif defined(__ARM_NEON)
+
+#elif defined(__ARM_NEON) || defined(__SSE2__)

 // { a1_0, 0, a1_0, a1_0 }
 #define MULT2( a0, a1 ) \
 { \
-  v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
+  v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
  a0 = v128_alignr32( a1, b, 1 ); \
  a1 = v128_alignr32( b, a1, 1 ); \
 }

-#else   // assume SSE2
-
-#define MULT2( a0, a1 ) \
-{ \
-  v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
-  a0 = v128_or( _mm_srli_si128(  b, 4 ), _mm_slli_si128( a1, 12 ) ); \
-  a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128(  b, 12 ) ); \
-} 
-
+#else
+  #warning "Unknown or unsupported CPU architecture."
 #endif

 #if defined(VL256)
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
   int thr_id = mythr->id;  // thr_id arg is deprecated

   // we need bigendian data...
-   casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
-   casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
-   casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
-   casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
+   casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
+   casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
+   casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
+   casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
+   casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
+   casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
+   casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
+   casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
   intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
        edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );

@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   int thr_id = mythr->id;  // thr_id arg is deprecated

   // we need bigendian data...
-   casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
-   casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
-   casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
-   casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
+   casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
+   casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
+   casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
+   casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
+   casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
+   casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
+   casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
+   casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
   intrlv_8x32( vdata, edata, edata, edata, edata,
                       edata, edata, edata, edata, 1024 );

--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -319,7 +319,7 @@ void ripemd160_4way_close( ripemd160_4way_context  *sc, void *dst )
    sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
    ripemd160_4way_round( sc );
    for (u = 0; u < 5; u ++)
-        casti_m128i( dst, u ) = sc->val[u];
+        casti_v128u32( dst, u ) = sc->val[u];
 }

 #endif
--- a/algo/sha/hmac-sha256-hash-4way.c
+++ b/algo/sha/hmac-sha256-hash-4way.c
@@ -74,8 +74,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
 	memset( pad, 0x36, 64*4 );

   for ( i = 0; i < Klen; i++ )
-		casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
-                                             casti_m128i( K, i ) );
+		casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
+                                               casti_v128u32( K, i ) );

   sha256_4way_update( &ctx->ictx, pad, 64 );

@@ -83,8 +83,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
 	sha256_4way_init( &ctx->octx );
 	memset( pad, 0x5c, 64*4 );
 	for ( i = 0; i < Klen/4; i++ )
-		casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
-                                             casti_m128i( K, i ) );
+		casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
+                                               casti_v128u32( K, i ) );
 	sha256_4way_update( &ctx->octx, pad, 64 );
 }

@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,

 			/* ... xor U_j ... */
 			for ( k = 0; k < 8; k++ )
-				casti_m128i( T, k ) = _mm_xor_si128( casti_m128i( T, k ),
-                                                 casti_m128i( U, k ) );
+				casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
+                                                   casti_v128u32( U, k ) );
 		}

 		/* Copy as many bytes as necessary into buf. */
--- a/algo/sha/sha256-hash.c
+++ b/algo/sha/sha256-hash.c
@@ -569,8 +569,8 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
   __m128i STATE0, STATE1, MSG, TMP;

   // Load initial values
-   TMP    = casti_m128i( istate, 0 );
-   STATE1 = casti_m128i( istate, 1 );
+   TMP    = casti_v128u32( istate, 0 );
+   STATE1 = casti_v128u32( istate, 1 );

   TMP    = _mm_shuffle_epi32( TMP, 0xB1 );       // CDAB
   STATE1 = _mm_shuffle_epi32( STATE1, 0x1B );    // EFGH
@@ -578,17 +578,17 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
   STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH

   // Save current hash
-   casti_m128i( sstate, 0 ) = STATE0;
-   casti_m128i( sstate, 1 ) = STATE1;
+   casti_v128u32( sstate, 0 ) = STATE0;
+   casti_v128u32( sstate, 1 ) = STATE1;

   // Rounds 0 to 3
-   MSG = casti_m128i( msg, 0 );
+   MSG = casti_v128u32( msg, 0 );
   TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
   MSG = _mm_add_epi32( MSG, TMP );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
-   casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
-   casti_m128i( ostate, 1 ) = STATE1;
+   casti_v128u32( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
+   casti_v128u32( ostate, 1 ) = STATE1;
 }

 void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
@@ -601,22 +601,22 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
    __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
    __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;

-    STATE0_X = casti_m128i( state_mid_X, 0 );
-    STATE1_X = casti_m128i( state_mid_X, 1 );
-    STATE0_Y = casti_m128i( state_mid_Y, 0 );
-    STATE1_Y = casti_m128i( state_mid_Y, 1 );
+    STATE0_X = casti_v128u32( state_mid_X, 0 );
+    STATE1_X = casti_v128u32( state_mid_X, 1 );
+    STATE0_Y = casti_v128u32( state_mid_Y, 0 );
+    STATE1_Y = casti_v128u32( state_mid_Y, 1 );

    // Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
-    TMSG0_X = casti_m128i( msg_X, 0 );
-    TMSG0_Y = casti_m128i( msg_Y, 0 );
+    TMSG0_X = casti_v128u32( msg_X, 0 );
+    TMSG0_Y = casti_v128u32( msg_Y, 0 );
    TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
    TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
    STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
    STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );

    // Rounds 4 to 7
-    TMSG1_X = casti_m128i( msg_X, 1 );
-    TMSG1_Y = casti_m128i( msg_Y, 1 );
+    TMSG1_X = casti_v128u32( msg_X, 1 );
+    TMSG1_Y = casti_v128u32( msg_Y, 1 );
    TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
    MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
@@ -638,8 +638,8 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );

    // Rounds 12 to 15
-    TMSG3_X = casti_m128i( msg_X, 3 );
-    TMSG3_Y = casti_m128i( msg_Y, 3 );
+    TMSG3_X = casti_v128u32( msg_X, 3 );
+    TMSG3_Y = casti_v128u32( msg_Y, 3 );
    TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
    MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
@@ -867,20 +867,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );

    // Add saved state to new state
-    STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) );
-    STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) );
-    STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) );
-    STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) );
+    STATE0_X = _mm_add_epi32( STATE0_X, casti_v128u32( state_save_X, 0 ) );
+    STATE1_X = _mm_add_epi32( STATE1_X, casti_v128u32( state_save_X, 1 ) );
+    STATE0_Y = _mm_add_epi32( STATE0_Y, casti_v128u32( state_save_Y, 0 ) );
+    STATE1_Y = _mm_add_epi32( STATE1_Y, casti_v128u32( state_save_Y, 1 ) );

    // Unshuffle & save state    
    TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B );                        // FEBA
    TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
    STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 );                     // DCHG
    STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
-    casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
-    casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
-    casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 );    // ABEF
-    casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
+    casti_v128u32( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
+    casti_v128u32( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
+    casti_v128u32( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 );    // ABEF
+    casti_v128u32( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
 }

 #endif     // SHA
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -300,11 +300,12 @@ static inline __m512i v512_mult_x5( const __m512i x )

 #define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
-   xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
-           v512_mult_x3( mm512_xor3( xa0, xc, \
-              v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
-           xb3, xb2 ) ); \
-   xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
+   xa0 = mm512_xor3( xa0, xc, \
+                     v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ); \
+   xb0 = mm512_rol_32( xb0, 1 ); \
+   xa0 = mm512_xor3( xm, xb1, \
+                     mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
+   xb0 = mm512_xnor( xa0, xb0 ); \
 } while (0)

 #define PERM_STEP_0_16  do { \
@@ -905,11 +906,12 @@ static inline __m256i v256_mult_x5( const __m256i x )

 #define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
-   xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
-           v256_mult_x3( mm256_xor3( xa0, xc, \
-              v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
-           xb3, xb2 ) ); \
-   xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
+   xa0 = mm256_xor3( xa0, xc, \
+                     v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ); \
+   xb0 = mm256_rol_32( xb0, 1 ); \
+   xa0 = mm256_xor3( xm, xb1, \
+                     mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
+   xb0 = mm256_xnor( xa0, xb0 ); \
 } while (0)

 #define PERM_STEP_0_8   do { \
--- a/algo/simd/vector.h
+++ b/algo/simd/vector.h
@@ -62,8 +62,6 @@ union u32 {
 #define v32_andn(x,y) ((v32) vec_andn((x), (y)))
 #endif

-//TODO  aarch support for widening multiply
-
 #if defined(__SSE2__)

 #define vec_and(x,y) ((x)&(y))
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -204,11 +204,11 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
   const __m512i eight = _mm512_set1_epi64( 8 );
   const bool bench = opt_benchmark;

-   edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
-   edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
-   edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
-   edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
-   edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
+   edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
+   edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
+   edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
+   edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
+   edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );

   mm512_intrlv80_8x64( vdata, edata );
   *noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -372,11 +372,11 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
   const __m256i four = _mm256_set1_epi64x( 4 );
   const bool bench = opt_benchmark;

-   edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
-   edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
-   edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
-   edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
-   edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
+   edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
+   edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
+   edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
+   edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
+   edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );

   mm256_intrlv80_4x64( vdata, edata );

--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -418,11 +418,11 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;

   // convert LE32 to LE64
-   edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
-   edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
-   edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
-   edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
-   edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
+   edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
+   edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
+   edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
+   edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
+   edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );

   mm512_intrlv80_8x64( vdata, edata );
   blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
@@ -681,11 +681,11 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;

   // convert LE32 to LE64
-   edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
-   edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
-   edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
-   edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
-   edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
+   edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
+   edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
+   edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
+   edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
+   edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );

   mm512_intrlv80_8x64( vdata, edata );
   *noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -895,11 +895,11 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;

   // convert LE32 to LE64
-   edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
-   edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
-   edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
-   edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
-   edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
+   edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
+   edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
+   edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
+   edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
+   edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );

   mm256_intrlv80_4x64( vdata, edata );
   *noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );
--- a/api.c
+++ b/api.c
@@ -531,7 +531,7 @@ static void api()
 	time_t bindstart;
 	struct sockaddr_in serv;
 	struct sockaddr_in cli;
-	socklen_t clisiz;
+	uint32_t clisiz;
 	bool addrok = false;
 	long long counter;
 	char *result;
--- a/armbuild-all.sh
+++ b/armbuild-all.sh
@@ -4,57 +4,45 @@
 # during develpment. However the information contained may provide compilation
 # tips to users.

-rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.5-aes-sha3-sve2 cpuminer-armv8.4-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
+rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null

 # armv9 needs gcc-13
+# -march=armv9-a includes SVE2 but no crypto
+# -march=armv9-a+crypto adds AES & SHA2 but not SHA512

 make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes -Wall -flax-vector-conversions" ./configure  --with-curl
+CFLAGS="-O3 -march=armv9-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure  --with-curl
 make -j $(nproc)
 strip -s cpuminer
-mv cpuminer cpuminer-armv9-aes-sha3
+mv cpuminer cpuminer-armv9-crypto-sha3

-#make clean || echo clean
-#CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure  --with-curl
-#make -j $(nproc)
-#strip -s cpuminer
-#mv cpuminer cpuminer-armv9-aes-sha3-sve2
+make clean || echo clean
+CFLAGS="-O3 -march=armv9-a+crypto -Wall -flax-vector-conversions" ./configure  --with-curl
+make -j $(nproc)
+strip -s cpuminer
+mv cpuminer cpuminer-armv9-crypto
+
+make clean || echo clean
+CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure  --with-curl
+make -j $(nproc)
+strip -s cpuminer
+mv cpuminer cpuminer-armv9

 # SVE2 available in armv8.5
-#make clean || echo clean
-#CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure  --with-curl
-#make -j $(nproc)
-#strip -s cpuminer
-#mv cpuminer cpuminer-armv8.5-aes-sha3-sve2
+make clean || echo clean
+CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure  --with-curl
+make -j $(nproc)
+strip -s cpuminer
+mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2

 # SHA3 available in armv8.4
 make clean || echo clean
-CFLAGS="-O3 -march=armv8.4-a+crypto+sha3+aes -Wall -flax-vector-conversions" ./configure  --with-curl
+CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure  --with-curl
 make -j $(nproc)
 strip -s cpuminer
-mv cpuminer cpuminer-armv8.4-aes-sha3
-
-make clean || echo clean
-CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure  --with-curl
-make -j $(nproc)
-strip -s cpuminer
-mv cpuminer cpuminer-armv8-aes-sha2
-
-make clean || echo clean
-rm -f config.status
-CFLAGS="-O3 -march=armv8-a+crypto+sha2 -Wall -flax-vector-conversions" ./configure  --with-curl      
-make -j $(nproc)
-strip -s cpuminer
-mv cpuminer cpuminer-armv8-sha2
-
-make clean || echo clean
-rm -f config.status
-CFLAGS="-O3 -march=armv8-a+crypto+aes -Wall -flax-vector-conversions" ./configure  --with-curl      
-make -j $(nproc)
-strip -s cpuminer
-mv cpuminer cpuminer-armv8-aes
+mv cpuminer cpuminer-armv8.4-crypto-sha3

 make clean || echo clean
 rm -f config.status
--- a/clean-all.sh
+++ b/clean-all.sh
@@ -2,7 +2,7 @@
 #
 # make clean and rm all the targetted executables.

-rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.4-aes-sha3 cpuminer-armv8.5-aes-sha3-sve2  cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null
+rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2  cpuminer-armv8-crypto cpuminer-armv8 > /dev/null

 rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null

--- a/28
+++ b/28
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.3.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.5.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='24.3'
-PACKAGE_STRING='cpuminer-opt 24.3'
+PACKAGE_VERSION='24.5'
+PACKAGE_STRING='cpuminer-opt 24.5'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 24.3 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 24.5 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1432,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 24.3:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 24.5:";;
   esac
  cat <<\_ACEOF

@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 24.3
+cpuminer-opt configure 24.5
 generated by GNU Autoconf 2.71

 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 24.3, which was
+It was created by cpuminer-opt $as_me 24.5, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='24.3'
+ VERSION='24.5'


 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -5810,11 +5810,11 @@ if test x$ac_prog_cxx_stdcxx = xno
 then :
  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
 printf %s "checking for $CXX option to enable C++11 features... " >&6; }
-if test ${ac_cv_prog_cxx_cxx11+y}
+if test ${ac_cv_prog_cxx_11+y}
 then :
  printf %s "(cached) " >&6
 else $as_nop
-  ac_cv_prog_cxx_cxx11=no
+  ac_cv_prog_cxx_11=no
 ac_save_CXX=$CXX
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
@@ -5856,11 +5856,11 @@ if test x$ac_prog_cxx_stdcxx = xno
 then :
  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
 printf %s "checking for $CXX option to enable C++98 features... " >&6; }
-if test ${ac_cv_prog_cxx_cxx98+y}
+if test ${ac_cv_prog_cxx_98+y}
 then :
  printf %s "(cached) " >&6
 else $as_nop
-  ac_cv_prog_cxx_cxx98=no
+  ac_cv_prog_cxx_98=no
 ac_save_CXX=$CXX
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 24.3, which was
+This file was extended by cpuminer-opt $as_me 24.5, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 24.3
+cpuminer-opt config.status 24.5
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [24.3])
+AC_INIT([cpuminer-opt], [24.5])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.4.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='24.2'
-PACKAGE_STRING='cpuminer-opt 24.2'
+PACKAGE_VERSION='24.4'
+PACKAGE_STRING='cpuminer-opt 24.4'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 24.4 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1432,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 24.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 24.4:";;
   esac
  cat <<\_ACEOF

@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 24.2
+cpuminer-opt configure 24.4
 generated by GNU Autoconf 2.71

 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 24.2, which was
+It was created by cpuminer-opt $as_me 24.4, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='24.2'
+ VERSION='24.4'


 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 24.2, which was
+This file was extended by cpuminer-opt $as_me 24.4, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 24.2
+cpuminer-opt config.status 24.4
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"

--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1591,13 +1591,13 @@ start:
         last_targetdiff = net_diff;

         applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
-                                work->height, work->tx_count, net_diff,
-                                work->data[ algo_gate.ntime_index ] );
+                             work->height, work->tx_count, net_diff,
+                             bswap_32( work->data[ algo_gate.ntime_index ] ) );
      }
-      else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
+      else if ( memcmp( work->data, g_work.data, algo_gate.work_cmp_size ) )
         applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
-                                work->height, work->tx_count, net_diff,
-                                work->data[ algo_gate.ntime_index ] );
+                             work->height, work->tx_count, net_diff,
+                             bswap_32( work->data[ algo_gate.ntime_index ] ) );
      else
        new_work = false;

@@ -1912,6 +1912,8 @@ static bool wanna_mine(int thr_id)
 {
 	bool state = true;

+#if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
+  
 	if (opt_max_temp > 0.0)
   {
 		float temp = cpu_temp(0);
@@ -1921,8 +1923,12 @@ static bool wanna_mine(int thr_id)
           applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
         state = false;
 		}
+      if ( temp > hi_temp ) hi_temp = temp;
 	}
-	if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
+
+#endif
+
+   if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
   {
 		if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
 			applog(LOG_NOTICE, "network diff too high, waiting...");
@@ -2133,7 +2139,7 @@ static void *miner_thread( void *userdata )
 //   uint32_t end_nonce = opt_benchmark
 //                      ? ( 0xffffffffU / opt_n_threads ) * (thr_id + 1) - 0x20
 //                      : 0;
-   uint32_t end_nonce = 0xffffffffU / opt_n_threads  * (thr_id + 1) - 0x20;
+   uint32_t end_nonce = 0xffffffffU / opt_n_threads  * (thr_id + 1) - opt_n_threads;

   memset( &work, 0, sizeof(work) );
 
@@ -2200,58 +2206,58 @@ static void *miner_thread( void *userdata )
 //       int64_t max64 = 1000;
       int nonce_found = 0;

-//       if ( likely( algo_gate.do_this_thread( thr_id ) ) )
-//       {
-          if ( have_stratum ) 
+       if ( have_stratum ) 
+       {
+          while ( unlikely( stratum_down ) )
+             sleep( 1 );
+          if ( unlikely( ( *nonceptr >= end_nonce )
+                        && !work_restart[thr_id].restart ) )
          {
-             while ( unlikely( stratum_down ) )
-                sleep( 1 );
-             if ( unlikely( ( *nonceptr >= end_nonce )
-                         && !work_restart[thr_id].restart ) )
+             if ( opt_extranonce )
+                stratum_gen_work( &stratum, &g_work );
+             else
             {
-                if ( opt_extranonce )
-                   stratum_gen_work( &stratum, &g_work );
-                else
+                if ( !thr_id )
                {
-                   if ( !thr_id )
-                   {
-                      applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" );
-                      applog( LOG_WARNING, "waiting for new work...");
-                   }
-                   while ( !work_restart[thr_id].restart )
-                      sleep ( 1 );
+                   applog( LOG_WARNING, "Nonce range exhausted, extranonce not subscribed." );
+                   applog( LOG_WARNING, "Waiting for new work...");
                }
+                while ( !work_restart[thr_id].restart )
+                   sleep ( 1 );
             }
          }
-          else if ( !opt_benchmark ) // GBT or getwork
+       }
+       else if ( !opt_benchmark ) // GBT or getwork
+       {
+         // max64 is used to set end_nonce to match the scantime.
+         // It also factors the nonce range to end the scan when nonces are
+         // exhausted. In either case needing new work can be assumed.
+         // Only problem is every thread will call get_work.
+         // First thread resets scantime blocking all subsequent threads
+         // from fetching new work.
+
+          pthread_rwlock_wrlock( &g_work_lock );
+          const time_t now = time(NULL);
+          if ( ( ( now - g_work_time ) >= opt_scantime )
+             || ( *nonceptr >= end_nonce ) )
          {
-             pthread_rwlock_wrlock( &g_work_lock );
-
-             if ( ( ( time(NULL) - g_work_time ) >= opt_scantime )
-               || ( *nonceptr >= end_nonce ) )
+             if ( unlikely( !get_work( mythr, &g_work ) ) )
             {
-                if ( unlikely( !get_work( mythr, &g_work ) ) )
-                {
-                   pthread_rwlock_unlock( &g_work_lock );
-		             applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
-		             goto out;
-	             }
-                g_work_time = time(NULL);
-//                restart_threads();
-             }
-
-             pthread_rwlock_unlock( &g_work_lock );
+                pthread_rwlock_unlock( &g_work_lock );
+                applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
+		          goto out;
+	          }
+             g_work_time = now;
          }
-
-          pthread_rwlock_rdlock( &g_work_lock );
-
-          algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce );
-          work_restart[thr_id].restart = 0;
-
          pthread_rwlock_unlock( &g_work_lock );
+       }

-//       } // do_this_thread
-//       algo_gate.resync_threads( thr_id, &work );
+       pthread_rwlock_rdlock( &g_work_lock );
+
+       algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce );
+       work_restart[thr_id].restart = 0;
+
+       pthread_rwlock_unlock( &g_work_lock );

       // conditional mining
       if ( unlikely( !wanna_mine( thr_id ) ) )
@@ -2309,12 +2315,6 @@ static void *miner_thread( void *userdata )
       gettimeofday( (struct timeval *) &tv_start, NULL );

       // Scan for nonce
-//       nonce_found = scanhash_sha256dt_ref( &work, max_nonce, &hashes_done,
-//                                         mythr );
-//       nonce_found = scanhash_sha256dt_4x32( &work, max_nonce, &hashes_done,
-//                                         mythr );
-
-
       nonce_found = algo_gate.scanhash( &work, max_nonce, &hashes_done,
                                         mythr );

@@ -2336,8 +2336,8 @@ static void *miner_thread( void *userdata )
       // If unsubmiited nonce(s) found, submit now. 
       if ( unlikely( nonce_found && !opt_benchmark ) )
       {  
-//          applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
-//                               algo_names[ opt_algo ] );
+          applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
+                               algo_names[ opt_algo ] );
          if ( !submit_work( mythr, &work ) )
          {
             applog( LOG_WARNING, "Failed to submit share." );
@@ -2828,9 +2828,9 @@ out:

 static void show_credits()
 {
-   printf("\n         **********  "PACKAGE_NAME" "PACKAGE_VERSION"  *********** \n");
+   printf("\n         **********  "PACKAGE_NAME" "PACKAGE_VERSION"  ********** \n");
   printf("     A CPU miner with multi algo support and optimized for CPUs\n");
-   printf("     with AVX512, SHA and VAES extensions by JayDDee.\n");
+   printf("     with AVX512, SHA, AES and NEON extensions by JayDDee.\n");
   printf("     BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
 }

@@ -2847,21 +2847,26 @@ static bool cpu_capability( bool display_only )
     bool cpu_has_sse41    = has_sse41();    // X86_64 only
     bool cpu_has_sse42    = has_sse42();
     bool cpu_has_avx      = has_avx();
-//     bool cpu_has_sve      = has_sve();      // aarch64 only
-//     bool cpu_has_sve2     = has_sve2();
-     bool cpu_has_avx2     = has_avx2();
+     bool cpu_has_neon     = has_neon();     // AArch64 
+     bool cpu_has_sve      = has_sve();      // aarch64 only, insignificant
+     bool cpu_has_sve2     = has_sve2();     // AArch64 only
+     bool cpu_has_sme      = has_sme();
+     bool cpu_has_sme2     = has_sme2();  
+     bool cpu_has_avx2     = has_avx2(); 
     bool cpu_has_avx512   = has_avx512();
     bool cpu_has_avx10    = has_avx10();
     bool cpu_has_aes      = has_aes();      // x86_64 or AArch64
-     bool cpu_has_vaes     = has_vaes();
+     bool cpu_has_vaes     = has_vaes();     // X86_64 only
     bool cpu_has_sha256   = has_sha256();   // x86_64 or AArch64
     bool cpu_has_sha512   = has_sha512();
     bool sw_has_x86_64    = false;
     bool sw_has_aarch64   = false;
-     int  sw_arm_arch      = 0;            // AArch64
+     int  sw_arm_arch      = 0;            // AArch64 version
     bool sw_has_neon      = false;        // AArch64
-//     bool sw_has_sve       = false;        // AArch64
-//     bool sw_has_sve2      = false;        // AArch64
+     bool sw_has_sve       = false;        // AArch64
+     bool sw_has_sve2      = false;        // AArch64
+     bool sw_has_sme       = false;  
+     bool sw_has_sme2      = false; 
     bool sw_has_sse2      = false;        // x86_64
     bool sw_has_ssse3     = false;        // x86_64
     bool sw_has_sse41     = false;        // x86_64
@@ -2873,8 +2878,8 @@ static bool cpu_capability( bool display_only )
     bool sw_has_avx10_512 = false;
     bool sw_has_aes       = false;
     bool sw_has_vaes      = false;
-     bool sw_has_sha256    = false;        // x86_64 or AArch64 SHA2
-     bool sw_has_sha512    = false;        // x86_64 or AArch64 SHA3
+     bool sw_has_sha256    = false;        // x86_64 or AArch64
+     bool sw_has_sha512    = false;        // x86_64 or AArch64
     set_t algo_features   = algo_gate.optimizations;
     bool algo_has_sse2    = set_incl( SSE2_OPT,    algo_features );
     bool algo_has_sse42   = set_incl( SSE42_OPT,   algo_features );
@@ -2954,12 +2959,18 @@ static bool cpu_capability( bool display_only )
     #if defined(__ARM_NEON)
         sw_has_neon = true;
     #endif
-//     #if defined(__ARM_FEATURE_SVE)
-//         sw_has_sve = true;
-//     #endif
-//     #if defined(__ARM_FEATURE_SVE2)
-//         sw_has_sve2 = true;
-//     #endif
+     #if defined(__ARM_FEATURE_SVE)
+         sw_has_sve = true;
+     #endif
+     #if defined(__ARM_FEATURE_SVE2)
+         sw_has_sve2 = true;
+     #endif
+     #if defined(__ARM_FEATURE_SME)
+         sw_has_sme = true;
+     #endif
+     #if defined(__ARM_FEATURE_SME2)
+         sw_has_sme2 = true;
+     #endif

     cpu_brand_string( cpu_brand );
     printf( "CPU: %s\n", cpu_brand );
@@ -2989,7 +3000,6 @@ static bool cpu_capability( bool display_only )
     printf("CPU features: ");
     if ( cpu_has_x86_64  )
     {
-                                     printf( " x86_64"  );
       if      ( cpu_has_avx512 )    printf( " AVX512"  );
       else if ( cpu_has_avx2   )    printf( " AVX2  "  );
       else if ( cpu_has_avx    )    printf( " AVX   "  );
@@ -3000,9 +3010,11 @@ static bool cpu_capability( bool display_only )
     }
     else if   ( cpu_has_aarch64 )
     {
-                                     printf( " AArch64 NEON" ); // NEON assumed
-//       if      ( cpu_has_sve2   )    printf( " SVE2  "  );
-//       else if ( cpu_has_sve    )    printf( " SVE   "  );
+       if      ( cpu_has_neon   )    printf( "       NEON" );
+       if      ( cpu_has_sve2   )    printf( " SVE2-%d", sve_vector_length() );
+       else if ( cpu_has_sve    )    printf( " SVE"     );
+       if      ( cpu_has_sme2   )    printf( " SME2"    );
+       else if ( cpu_has_sme    )    printf( " SME"     );
     }
     if        ( cpu_has_vaes   )    printf( " VAES"    );
     else if   ( cpu_has_aes    )    printf( "  AES"    );
@@ -3014,7 +3026,6 @@ static bool cpu_capability( bool display_only )
     printf("\nSW features:  ");
     if ( sw_has_x86_64 )
     {                     
-                                     printf( " x86_64"  );
        if      ( sw_has_avx512  )   printf( " AVX512"  );
        else if ( sw_has_avx2    )   printf( " AVX2  "  );
        else if ( sw_has_avx     )   printf( " AVX   "  );
@@ -3027,12 +3038,12 @@ static bool cpu_capability( bool display_only )
     }
     else if    ( sw_has_aarch64 ) 
     {
-                                     printf( " AArch64" );
        if      ( sw_arm_arch    )   printf( " armv%d", sw_arm_arch );
        if      ( sw_has_neon    )   printf( " NEON"    );
-//        if      ( sw_has_sve2    )   printf( " SVE2"    );
-//        else if ( sw_has_sve     )   printf( " SVE"     );
-
+        if      ( sw_has_sve2    )   printf( " SVE2"    );
+        else if ( sw_has_sve     )   printf( " SVE"     );
+        if      ( sw_has_sme2    )   printf( " SME2"    );
+        else if ( sw_has_sme     )   printf( " SME"     );
     }
     if         ( sw_has_vaes    )   printf( " VAES"    );
     else if    ( sw_has_aes     )   printf( "  AES"    );
@@ -3075,13 +3086,9 @@ static bool cpu_capability( bool display_only )
             || use_avx2 || use_sha256 || use_vaes || use_sha512 || use_neon );

     // Display best options
-     applog_nl( "Enabled optimizations:" );
-     if         ( use_none   ) printf( " none" );
-     else
+     if ( !use_none )
     {
-//        if ( cpu_has_aarch64 ) printf( " AArch64");
-//        else
-//                               printf( " x86_64" );
+        applog_nl( "Enabled optimizations:" );
        if      ( use_neon   ) printf( " NEON"   );
        if      ( use_avx512 ) printf( " AVX512" );
        else if ( use_avx2   ) printf( " AVX2"   );
@@ -3092,14 +3099,12 @@ static bool cpu_capability( bool display_only )
        else if ( use_aes    ) printf( " AES"    );
        if      ( use_sha512 ) printf( " SHA512" );
        else if ( use_sha256 ) printf( " SHA256" );
+        printf( "\n" );
     }
-     printf( "\n" );

     return true;
 }

-
-
 void show_version_and_exit(void)
 {
        printf("\n built on " __DATE__
@@ -3109,7 +3114,6 @@ void show_version_and_exit(void)
         " with GCC");
        printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
 #endif
-
        printf(" features:"
 #if defined(USE_ASM) && defined(__i386__)
                " i386"
--- a/miner.h
+++ b/miner.h
@@ -644,7 +644,6 @@ enum algos {
        ALGO_SHA256T,
        ALGO_SHA3D,
        ALGO_SHA512256D,
-        ALGO_SHAVITE3,    
        ALGO_SKEIN,       
        ALGO_SKEIN2,      
        ALGO_SKUNK,
@@ -740,7 +739,6 @@ static const char* const algo_names[] = {
        "sha256t",
        "sha3d",
        "sha512256d",
-        "shavite3",
        "skein",
        "skein2",
        "skunk",
@@ -904,7 +902,6 @@ Options:\n\
                          sha256t       Triple SHA-256, Onecoin (OC)\n\
                          sha3d         Double Keccak256 (BSHA3)\n\
                          sha512256d    Double SHA-512 (Radiant)\n\
-                          shavite3      Shavite3\n\
                          skein         Skein+Sha (Skeincoin)\n\
                          skein2        Double Skein (Woodcoin)\n\
                          skunk         Signatum (SIGT)\n\
--- a/simd-utils.h
+++ b/simd-utils.h
@@ -141,9 +141,40 @@
 #include <stdint.h>
 #include <stddef.h>

-// SIMD512: Use 512, 256 & 128 bit vectors, excludes AVX512VBMI
-// VL256: Include AVX512VL instructions on 256 & 128 bit vectors
-// VBMI: Include AVX512VBMI instructions on all vectors.
+// GCC-14.1: the AVX512 macros are defined even when compiled with only
+// -mavx10.1-256, causing compile errors in AVX512 code. Only with
+// -mavx10.1-512 does it compile successfully.
+// __EVEX512__ is set only when compiled with -mavx10.1-512.
+// Adding -fno-evex512 doesn't help.
+// Building with -mapxf fails on a CPU without APX because configure can't
+// run its test program.
+/*
+// Test for macros
+#ifdef __AVX10__
+#warning "__AVX10__"
+#endif
+#ifdef __AVX10_1__
+#warning "__AVX10_1__"
+#endif
+#ifdef __AVX10_1_256__
+#warning "__AVX10_1_256__"
+#endif
+#ifdef __AVX10_1_512__
+#warning "__AVX10_1_512__"
+#endif
+#ifdef __EVEX512__
+#warning "__EVEX512__"
+#endif
+*/
+
+// AVX10 complicates vector support by adding AVX512 features to CPUs without 512 bit
+// vector support. AVX10.1 is just a renaming of AVX512 and is only available for
+// Intel P-core only CPUs. AVX10.2 adds support for E-cores that don't support 512 bit
+// vectors. The following macros simplify things.
+// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and must be
+//          tested separately.
+// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
+// VBMI: Include AVX512VBMI instructions for supported vector lengths.

 // AVX10 can exist without support for 512 bit vectors.
 #if defined(__AVX10_1_512__)
@@ -153,8 +184,9 @@
 #endif

 // AVX512VL instructions applied to 256 & 128 bit vectors is supported with 
-// either AVX512VL or any version of AVX10.
-#if defined(__AVX10_1__)
+// either AVX512VL or AVX10. Support for CPUs without 512 bit vectors is available 
+// with AVX10.2.
+#if defined(__AVX10_2__) || defined(__AVX10_1_512__)
  #define VL256 1
 #elif defined(__AVX512VL__)
  #define VL256 1
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -2436,7 +2436,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
 static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
 {
  const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
-                                             0x0405060700010203 );
+                                            0x0405060700010203 );
  const __m512i c1 = v512_64( 1 );
  v128_t s0 = casti_v128( src,0 );
  v128_t s1 = casti_v128( src,1 );
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -32,6 +32,14 @@
 // Intrinsics automatically promote from REX to VEX when AVX is available
 // but ASM needs to be done manually.
 //
+// APX supports EGPR which adds 16 more GPRs and 3 operand instructions.
+// This may affect ASM that includes instructions that are superseded by APX
+// versions and are therefore incompatible with APX.
+// As a result GCC-14 disables EGPR by default and can be enabled with
+// "-mapx-inline-asm-use-gpr32"
+//TODO
+// Some ASM functions may need to be updated to support EGPR with APX.
+//
 ///////////////////////////////////////////////////////////////////////////////

 // New architecturally agnostic syntax: 
@@ -164,7 +172,12 @@ typedef union
 // necessary the cvt, set, or set1 intrinsics can be used allowing the
 // compiler to exploit new features to produce optimum code.
 // Currently only used internally and by Luffa.
+// It also has implications for APX EGPR feature.

+#define v128_mov64       _mm_cvtsi64_si128
+#define v128_mov32       _mm_cvtsi32_si128
+
+/*
 static inline __m128i v128_mov64( const uint64_t n )
 {
  __m128i a;
@@ -186,11 +199,14 @@ static inline __m128i v128_mov32( const uint32_t n )
 #endif
  return a;
 }
+*/

 // broadcast lane 0 to all lanes
 #define v128_bcast64(v)                 _mm_shuffle_epi32( v, 0x44 )
 #define v128_bcast32(v)                 _mm_shuffle_epi32( v, 0x00 )

+// Not used, test first
+/*
 #if defined(__AVX2__)

 #define v128_bcast16(v)                 _mm_broadcastw_epi16(v)
@@ -198,9 +214,10 @@ static inline __m128i v128_mov32( const uint32_t n )
 #else

 #define v128_bcast16(v) \
-   v128_bcast32( v128_or( v128_sl32( v, 16 ), v ) )
+   _mm_shuffle_epi32( _mm_shufflelo_epi16( v, 0x00 ), 0x00 )

 #endif
+*/

 // Broadcast lane l to all lanes
 #define v128_duplane64( v, l ) \
@@ -216,28 +233,15 @@ static inline __m128i v128_mov32( const uint32_t n )
 // Pseudo constants
 #define v128_zero                       _mm_setzero_si128()

-#if defined(__SSE4_1__)
-
-// Bitwise AND, return 1 if result is all bits clear.
-#define v128_and_eq0(v1, v0)            _mm_testz_si128(v1, v0)
-
-// v128_is_zero?
-static inline int v128_cmpeq0( v128_t v )
-{  return v128_and_eq0( v, v ); }
-
-#endif
-
-// Bitwise compare return 1 if all bits set.
-#define v128_cmpeq1(v)                   _mm_test_all ones(v)
-
-#define v128_one                         v128_mov64(1)
+//#define v128_one                         v128_mov64(1)
+#define v128_one                        _mm_cvtsi64_si128( 1 )

 // ASM avoids the need to initialize return variable to avoid compiler warning.
 // Macro abstracts function parentheses to look like an identifier.
 static inline __m128i v128_neg1_fn()
 {
   __m128i a;
-#if defined(__AVX__) 
+#if defined(__AVX__)
   asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
 #else
   asm( "pcmpeqq %0, %0\n\t" : "=x"(a) );
@@ -268,7 +272,6 @@ static inline __m128i v128_neg1_fn()
 // p = any aligned pointer, i = scaled array index
 // returns value p[i]
 #define casti_v128(p,i)    (((__m128i*)(p))[(i)])
-#define casti_m128i        casti_v128     // deprecated
 #define casti_v128u64      casti_v128
 #define casti_v128u32      casti_v128
 #define casti_v128u16      casti_v128
@@ -279,13 +282,14 @@ static inline __m128i v128_neg1_fn()
 #define casto_v128(p,o) (((__m128i*)(p))+(o))

 #if defined(__SSE4_1__)
+
 #define v128_get64( v, l )         _mm_extract_epi64( v, l )
 #define v128_get32( v, l )         _mm_extract_epi32( v, l )
 #define v128_get16( v, l )         _mm_extract_epi16( v, l )
 #define v128_get8(  v, l )         _mm_extract_epi8(  v, l )

 #define v128_put64( v, u64, l )    _mm_insert_epi64( v, u64, l )
-#define v128_put32( v, u32, l )    _mm_insert_epi64( v, u32, l )
+#define v128_put32( v, u32, l )    _mm_insert_epi32( v, u32, l )
 #define v128_put16( v, u16, l )    _mm_insert_epi16( v, u16, l )
 #define v128_put8(  v, u8,  l )    _mm_insert_epi8(  v, u8,  l )

@@ -396,7 +400,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 {   for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
 #define  memcpy_128           v128_memcpy  

+// Boolean operations
 #if defined(VL256)
+// Macros with duplicate references to the same argument are
+// not expression safe. Switch to inline function if required.

 // ~v1 | v0
 #define v128_ornot( v1, v0 )      _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )
@@ -430,7 +437,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

 #else

-#define v128_ornot( v1, v0 )      _mm_or_si128( v1, v128_not( v0 ) )
+#define v128_ornot( v1, v0 )      _mm_or_si128( v128_not( v1 ), v0 )

 #define v128_xor3( a, b, c )      _mm_xor_si128( a, _mm_xor_si128( b, c ) )

@@ -464,9 +471,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_movmask32( v ) \
   _mm_movemask_ps( (__m128)(v) )

-//
-// Bit rotations
-
+// Shuffle 16 bit elements within 64 bit lanes.
 #define v128_shuffle16( v, c ) \
       _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )

@@ -476,6 +481,9 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_qrev16(v)      v128_shuffle16( v, 0x1b )
 #define v128_lrev16(v)      v128_shuffle16( v, 0xb1 )

+//
+// Bit rotations
+
 // Internal use only, should never be callled from application code.
 #define v128_ror64_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -601,7 +609,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

 #endif

-// ror( v1 ^ v0, n )
+// (v1 ^ v0) >>> n, ARM NEON has optimized version
 #define v128_ror64xor( v1, v0, n )  v128_ror64( v128_xor( v1, v0 ), n ) 

 /* not used
@@ -700,15 +708,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_swap64(v)      _mm_shuffle_epi32( v, 0x4e )  // grandfathered 
 #define v128_rev64(v)       _mm_shuffle_epi32( v, 0x4e )  // preferred
 #define v128_rev32(v)       _mm_shuffle_epi32( v, 0x1b )
-#define v128_rev16(v)       v128_shuffle16( v, 0x1b )

 // rotate vector elements
 #define v128_shuflr32(v)    _mm_shuffle_epi32( v, 0x39 )
 #define v128_shufll32(v)    _mm_shuffle_epi32( v, 0x93 )

-#define v128_shuflr16(v)    v128_shuffle16( v, 0x39 )
-#define v128_shufll16(v)    v128_shuffle16( v, 0x93 )
-
 // Endian byte swap.

 #if defined(__SSSE3__)
@@ -911,25 +915,27 @@ static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
 #else

 #define v128_alignr8( hi, lo, c ) \
-   _mm_or_si128( _mm_slli_si128( hi, c ), _mm_srli_si128( lo, c ) )
+   _mm_or_si128( _mm_slli_si128( hi, 16-(c) ), _mm_srli_si128( lo, c ) )

+// c arg is trivial: the only valid value is 1
 #define v128_alignr64( hi, lo, c ) \
-   _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
+   _mm_or_si128( _mm_slli_si128( hi, 16-((c)*8) ), _mm_srli_si128( lo, (c)*8 ) )

 #define v128_alignr32( hi, lo, c ) \
-   _mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
+   _mm_or_si128( _mm_slli_si128( hi, 16-((c)*4) ), _mm_srli_si128( lo, (c)*4 ) )

 #endif

 // blend using vector mask
 #if defined(__SSE4_1__)

-// Bytewise using sign bit of each byte element of mask
+// Bytewise using sign bit of each byte element of mask. Use full bitmask
+// for compatibility with SSE2 & NEON.
 #define v128_blendv                    _mm_blendv_epi8

 #else

-// Bitwise
+// Bitwise, use only byte wise for compatibility with SSE4_1.
 #define v128_blendv( v1, v0, mask ) \
   v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )

--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -172,7 +172,7 @@ static inline __m256i mm256_not( const __m256i v )
    
 #else

-#define mm256_ornot( v1, v0 )      _mm256_or_si256( v1, mm256_not( v0 ) )
+#define mm256_ornot( v1, v0 )      _mm256_or_si256( mm256_not( v1 ), v0 )

 #define mm256_xor3( a, b, c ) \
  _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
@@ -217,12 +217,11 @@ static inline __m256i mm256_not( const __m256i v )
 #define mm256_movmask_32( v ) \
   _mm256_movemask_ps( _mm256_castsi256_ps( v ) )

-//
-//           Bit rotations.
-
+// shuffle 16 bit elements within 64 bit lanes.
 #define mm256_shuffle16( v, c ) \
   _mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )

+// reverse elements within lanes.
 #define mm256_qrev32(v)    _mm256_shuffle_epi32( v, 0xb1 )
 #define mm256_swap64_32    mm256_qrev32       // grandfathered

@@ -242,6 +241,9 @@ static inline __m256i mm256_not( const __m256i v )
   _mm256_shuffle_epi8( v, mm256_bcast_m128( \
                         v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )

+//
+//           Bit rotations.
+
 // These should never be called directly by applications.
 #define mm256_ror_64_avx2( v, c ) \
   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -125,7 +125,7 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
 // Pseudo constants.
 #define m512_zero       _mm512_setzero_si512()

-// use asm to avoid compiler warning for unitialized local
+// use asm to avoid compiler warning for uninitialized local
 static inline __m512i mm512_neg1_fn()
 {
   __m512i v;
@@ -185,6 +185,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Ternary logic uses 8 bit truth table to define any 3 input logical
 // expression using any number or combinations of AND, OR, XOR, NOT.
+// Macros with duplicate references to the same argument are
+// not expression safe. Switch to inline function if required.

 // ~v1 | v0
 #define mm512_ornot( v1, v0 )      _mm512_ternarylogic_epi64( v1, v0, v0, 0xcf )
--- a/simd-utils/simd-64.h
+++ b/simd-utils/simd-64.h
@@ -10,7 +10,18 @@
 // This code is not used anywhere annd likely never will. It's intent was
 // to support 2 way parallel hashing using  MMX, or NEON for 32 bit hash
 // functions, but hasn't been implementedwas never implemented.
-// 
+//
+// MMX is being deprecated by compilers, all intrinsics will be converted to use SSE
+// registers and instructions. MMX will still be available using ASM.
+// For backward compatibility it's likely the compiler won't allow mixing explicit SSE
+// with promoted MMX. It is therefore preferable to implement all 64 bit vector code
+// using explicit SSE with the upper 64 bits being ignored.
+// Using SSE for 64 bit vectors will complicate loading arrays from memory which will
+// always load 128 bits. Odd indexes will need to be extracted from the upper 64 bits
+// of the even index SSE register. 
+// In most cases the existing 4x32 SSE code can be used with 2 lanes being ignored
+// making this file obsolete.
+

 #define v64_t                        __m64
 #define v64u32_t                     v64_t
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -456,7 +456,6 @@ static inline uint64x2_t v128_rev64( uint64x2_t v )
 #define v128_swap64     v128_rev64   // grandfathered

 #define v128_rev32(v)        v128_rev64( v128_qrev32( v ) )
-#define v128_rev16(v)        v128_rev64( v128_qrev16( v ) )

 // shuffle-rotate vector elements
 static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -465,12 +464,6 @@ static inline uint32x4_t v128_shuflr32( uint32x4_t v )
 static inline uint32x4_t v128_shufll32( uint32x4_t v )
 {   return vextq_u32( v, v, 3 ); }

-static inline uint16x8_t v128_shuflr16( uint16x8_t v )
-{   return vextq_u16( v, v, 1 ); }
-
-static inline uint16x8_t v128_shufll16( uint16x8_t v )
-{   return vextq_u16( v, v, 7 ); }
-
 // reverse bits in bytes, nothing like it in x86_64
 #define v128_bitrev8           vrbitq_u8

@@ -547,7 +540,8 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
   casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
 }

-// Bitwise blend using vector mask
+// Bitwise blend using vector mask, use only bytewise for compatibility
+// with x86_64.
 #define v128_blendv( v1, v0, mask )    vbslq_u32( mask, v1, v0 )

 #endif   // __ARM_NEON
--- a/simd-utils/simd-sve.h
+++ b/simd-utils/simd-sve.h
@@ -0,0 +1,25 @@
+// Placeholder for now.
+//
+// This file will hold AArch64 SVE code, a replacement for NEON that uses vector length
+// agnostic instructions. This means the same code can be used on CPUs with different
+// SVE vector register lengths. This is not good for vectorized hashing.
+// Optimum hash is sensitive to the vector register length with different code
+// used for different register sizes. On X86_64 the vector length is tied to the CPU
+// feature making it simple and efficient to handle different lengths although it
+// results in multiple executables. Theoretically SVE could use a single executable for
+// any vector length.
+//
+// With the SVE vector length only known at run time it results in run time overhead
+// to test the vector length. Theoretically it could be tested at program loading and
+// appropriate libraries loaded. However I don't know if this can be done and if so
+// how to do it.
+//
+// SVE is not expected to be used for 128 bit vectors as it does not provide any
+// advantages over NEON. However, it may be implemented for testing purposes
+// because CPUs with registers larger than 128 bits are currently very rare and very
+// expensive server class CPUs.
+//
+// N-way parallel hashing could be the best use of SVE, using the same code for all
+// vector lengths with the only variable being the number of lanes. This will still
+// require run time checking but should be lighter than substituting functions.
+
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -20,7 +20,7 @@
 // for arm's "cpuid"
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
-
+#include <sys/prctl.h>
 #endif

 #ifndef WIN32
@@ -169,17 +169,17 @@ static inline int cpu_fanpercent()
 }


-// CPUID
+// x86_64 CPUID

 // This list is incomplete, it only contains features of interest to cpuminer.
 // refer to http://en.wikipedia.org/wiki/CPUID for details.

 // AVX10 compatibility notes
 //
-// Notation used: AVX10i.[version]_[vectorwidth]
-// AVX10.1_512 is a rebranding of AVX512 and is effectively the AVX* superset
+// Display format: AVX10.[version]-[vectorwidth]
+// AVX10.1-512 is a rebranding of AVX512 and is effectively the AVX* superset
 // with full 512 bit vector support.
-// AVX10.2_256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
+// AVX10.2-256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
 // features applied only to 256 bit and 128 bit vectors.
 // Future AVX10 versions will add new instructions and features.

@@ -311,12 +311,25 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,

 #elif defined(__aarch64__)

+// Always test if HWCAP variable is defined in the kernel before attempting
+// to compile it. If not defined the feature can't be tested and won't be
+// included in the compile.
+// This can occur if compiling with an old kernel and a new CPU and could
+// result in a suboptimal build.
+
 static inline void cpuid( unsigned int leaf, unsigned int subleaf,
                          unsigned int output[4] )
 {
-   
-    output[0] = getauxval(AT_HWCAP);
-    output[1] = getauxval(AT_HWCAP2);
+#if defined(AT_HWCAP)
+    output[0] = getauxval( AT_HWCAP );
+#else
+    output[0] = 0;
+#endif
+#if defined(AT_HWCAP2)
+    output[1] = getauxval( AT_HWCAP2 );
+#else
+    output[1] = 0;
+#endif    

 /*    
 #define has(CAP, hwcap) !!((hwcap) & HWCAP_##CAP)
@@ -351,7 +364,6 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
 */    
 }   

-
 #else
 #define cpuid(leaf, subleaf, out) out[0] = 0;
 #endif
@@ -482,31 +494,20 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
 #ifdef __ARM_FEATURE_SHA3
 #warning "__ARM_FEATURE_SHA3"
 #endif
-*/
-
-// GCC-14.1: the AVX512 macros are defined even when compiled with only
-// -mavx10.1-256, causing compile errors in AVX512 code. Only with
-// -mavx10.1-512 does it compile successfully.
-// __EVEX512__ is set only when compiled with -mavx10.1-512.
-// Adding -fno-evex512 doesn't help.
-// Building with -mapxf fails to configure on a CPU without APX because it can
-// run the test program.
-/*
-#ifdef __AVX10_1__
-#warning "__AVX10_1__"
+#ifdef __ARM_FEATURE_SHA512
+#warning "__ARM_FEATURE_SHA512"
 #endif
-#ifdef __AVX10_1_256__
-#warning "__AVX10_1_256__"
+#ifdef __ARM_FEATURE_SVE
+#warning "__ARM_FEATURE_SVE"
 #endif
-#ifdef __AVX10_1_512__
-#warning "__AVX10_1_512__"
+#ifdef __ARM_FEATURE_SVE2
+#warning "__ARM_FEATURE_SVE2"
 #endif
-#ifdef __EVEX512__
-#warning "__EVEX512__"
+#ifdef __ARM_FEATURE_SME
+#warning "__ARM_FEATURE_SME"
 #endif
 */

-
 // Typical display format: AVX10.[version]_[vectorlength], if vector length is
 // omitted 256 is the default.
 //    Ex: AVX10.1_512
@@ -590,18 +591,15 @@ static inline bool has_sse42()
 #endif
 }

-/* doesn't work
+// There's no HWCAP for NEON, assume it's always true.
 static inline bool has_neon()
 {
 #if defined(__aarch64__)
-   unsigned int cpu_info[4] = { 0 };
-   cpuid( 0, 0, cpu_info );
-   return cpu_info[0] & HWCAP_NEON;
+   return true;
 #else
   return false;
 #endif
 }
-*/

 static inline bool has_avx()
 {
@@ -625,9 +623,10 @@ static inline bool has_avx2()
 #endif
 }

+// SVE vector width is determined at run time.
 static inline bool has_sve()
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) && defined(HWCAP_SVE)
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_SVE;
@@ -638,7 +637,7 @@ static inline bool has_sve()

 static inline bool has_sve2()
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) && defined(HWCAP2_SVE2)
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[1] & HWCAP2_SVE2;
@@ -647,6 +646,28 @@ static inline bool has_sve2()
 #endif
 }

+// True when the SME flag is set in the HWCAP2 word fetched by cpuid().
+static inline bool has_sme()
+{
+#if defined(__aarch64__) && defined(HWCAP2_SME)
+   unsigned int hwcap[4] = { 0 };
+   cpuid( 0, 0, hwcap );
+   if ( hwcap[1] & HWCAP2_SME ) return true;
+#endif
+   return false;
+}
+
+// HWCAP2_SME2 lies above bit 31 but cpuid() stores AT_HWCAP2 in a 32 bit
+// unsigned int, losing the flag. Read the full width value directly instead.
+static inline bool has_sme2()
+{
+#if defined(__aarch64__) && defined(AT_HWCAP2) && defined(HWCAP2_SME2)
+   return ( getauxval( AT_HWCAP2 ) & HWCAP2_SME2 ) != 0;
+#else
+   return false;
+#endif
+}
+
 static inline bool has_avx512f()
 {
 #if defined(__x86_64__)
@@ -706,9 +727,9 @@ static inline bool has_avx512()
 static inline bool has_vbmi()
 {
 #if defined(__x86_64__)
-    unsigned int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, 0, cpu_info );
-    return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( EXTENDED_FEATURES, 0, cpu_info );
+   return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
 #else
   return false;
 #endif
@@ -717,11 +738,11 @@ static inline bool has_vbmi()
 static inline bool has_vbmi2()
 {
 #if defined(__x86_64__)
-    unsigned int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, 0, cpu_info );
-    return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( EXTENDED_FEATURES, 0, cpu_info );
+   return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
 #else
-    return false;
+   return false;
 #endif
 }

@@ -735,7 +756,8 @@ static inline bool has_aes()
      return cpu_info[ ECX_Reg ] & AES_NI_Flag;
   }
   return false;
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) && defined(HWCAP_AES)
+   // NEON AES
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_AES;
@@ -761,7 +783,7 @@ static inline bool has_vaes()

 static inline bool has_sveaes()
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) && defined(HWCAP2_SVEAES)
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[1] & HWCAP2_SVEAES;
@@ -780,7 +802,8 @@ static inline bool has_sha256()
      return cpu_info[ EBX_Reg ] & SHA_Flag;
   }
   return false;
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) && defined(HWCAP_SHA2)
+   // NEON SHA256
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_SHA2;
@@ -799,7 +822,8 @@ static inline bool has_sha512()
      return cpu_info[ EAX_Reg ] & SHA512_Flag;
   }
   return false;
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) && defined(HWCAP_SHA512)
+   // NEON SHA512
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_SHA512;
@@ -811,7 +835,8 @@ static inline bool has_sha512()
 // Arm only
 static inline bool has_sha3()
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) && defined(HWCAP_SHA3)
+   // NEON SHA3
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_SHA3;
@@ -822,7 +847,7 @@ static inline bool has_sha3()

 static inline bool has_svesha3()
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) && defined(HWCAP2_SVESHA3)
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[1] & HWCAP2_SVESHA3;
@@ -886,10 +911,8 @@ static inline unsigned int avx10_version()
       cpuid( AVX10_FEATURES, 0, cpu_info );
       return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask;
    }
-    return 0;
-#else
-    return 0;
 #endif
+    return 0;
 }

 // also includes 256 & 128
@@ -902,13 +925,11 @@ static inline bool has_avx10_512()
       cpuid( AVX10_FEATURES, 0, cpu_info );
       return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
    }
-    return false;
-#else
-    return false;
 #endif
+    return false;
 }

-// Includes 128 but may not include 512
+// Includes 128 but might not include 512
 static inline bool has_avx10_256()
 {
 #if defined(__x86_64__)
@@ -918,13 +939,11 @@ static inline bool has_avx10_256()
       cpuid( AVX10_FEATURES, 0, cpu_info );
       return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
    }
-    return false;
-#else
-    return false;
 #endif
+    return false;
 }

-// Maximum vector length
+// AVX10 vector register length
 static inline unsigned int avx10_vector_length()
 {
 #if defined(__x86_64__)
@@ -935,24 +954,28 @@ static inline unsigned int avx10_vector_length()
       return cpu_info[ EBX_Reg ] & AVX10_512_Flag ? 512
          : ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
    }
-    return 0;
-#else
-    return 0;
 #endif
+    return 0;
 }

+// ARM SVE vector register length in bits; 0 without SVE or if the prctl query fails.
+static inline int sve_vector_length()
+{
+#if defined(__aarch64__) && defined(PR_SVE_GET_VL)
+   const int vl = has_sve() ? prctl( PR_SVE_GET_VL ) : -1;
+   if ( vl >= 0 ) return ( vl & PR_SVE_VL_LEN_MASK ) * 8;   // bytes -> bits
+#endif
+   return 0;
+}

 static inline uint32_t cpuid_get_highest_function_number()
 {
 #if defined(__x86_64__)
- 
  unsigned int cpu_info[4] = {0};
  cpuid( VENDOR_ID, 0, cpu_info);
  return cpu_info[ EAX_Reg ];
-
-#else
-  return 0;  
 #endif
+  return 0;
 }

 // out of date
Author	SHA1	Message	Date
Jay D Dee	8e91bfbe19	v24.5	2024-09-13 14:14:57 -04:00
Jay D Dee	47e24b50e8	v24.4	2024-07-01 00:33:19 -04:00