v24.7

v24.6
v24.5
2025-09-17 23:44:27 +00:00 · 2024-12-16 19:17:19 -05:00 · 2024-12-08 11:14:08 -05:00 · 2024-09-13 14:14:57 -04:00
25 changed files with 3730 additions and 2253 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -166,7 +166,6 @@ cpuminer_SOURCES = \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite-hash-2way.c \
  algo/shavite/shavite-hash-4way.c \
-  algo/shavite/shavite.c \
  algo/simd/nist.c \
  algo/simd/vector.c \
  algo/simd/sph_simd.c \
@@ -293,10 +292,6 @@ cpuminer_LDADD	= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
 cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
 cpuminer_CFLAGS   = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)

-if HAVE_WINDOWS
-cpuminer_CFLAGS += -Wl,--stack,10485760
-endif
-
 if HAVE_WINDOWS
 # use to profile an object
 # gprof_cflags = -pg -g3
--- a/18
+++ b/18
@@ -75,6 +75,24 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v24.7
+
+ARM: compile works for Windows using MSys2 & MingW, see wiki for details.
+
+v24.6
+
+ARM: Fixed scryptn2, x16*, broken in v24.2. 
+ARM: Small improvement to interleaving.
+Eliminated some potential compile errors in code that was dependent on 
+compiler optimisations.
+x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
+
+v24.5
+
+Fix MinGW compile error after MSys2 upgrade to GCC-14.2. 
+#427: GBT: Improved handling of new work.
+Removed shavite3 algo.
+
 v24.4

 x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
-   gate->resync_threads          = (void*)&do_nothing;
-   gate->do_this_thread          = (void*)&return_true;
+//   gate->resync_threads          = (void*)&do_nothing;
+//   gate->do_this_thread          = (void*)&return_true;
   gate->longpoll_rpc_call       = (void*)&std_longpoll_rpc_call;
   gate->get_work_data_size      = (void*)&std_get_work_data_size;
   gate->optimizations           = EMPTY_SET;
@@ -340,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_SHA256T:      rc = register_sha256t_algo       ( gate ); break;
    case ALGO_SHA3D:        rc = register_sha3d_algo         ( gate ); break;
    case ALGO_SHA512256D:   rc = register_sha512256d_algo    ( gate ); break;
-    case ALGO_SHAVITE3:     rc = register_shavite_algo       ( gate ); break;
    case ALGO_SKEIN:        rc = register_skein_algo         ( gate ); break;
    case ALGO_SKEIN2:       rc = register_skein2_algo        ( gate ); break;
    case ALGO_SKUNK:        rc = register_skunk_algo         ( gate ); break;
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -165,10 +165,10 @@ char* ( *malloc_txs_request )   ( struct work* );
 void ( *set_work_data_endian )  ( struct work* );

 // Diverge mining threads
-bool ( *do_this_thread )        ( int );
+//bool ( *do_this_thread )        ( int );

 // After do_this_thread
-void ( *resync_threads )        ( int, struct work* );
+//void ( *resync_threads )        ( int, struct work* );

 json_t* ( *longpoll_rpc_call )  ( CURL*, int*, char* );

--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -387,7 +387,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
 // Hamsi 8 way AVX512 

 // Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
-// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
+// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
 // had a 3% faster overall hashrate than than using movepi. 

 #define INPUT_BIG8 \
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
  tb = mm512_xoror( b, d, a ); \
  a = _mm512_xor_si512( a, c ); \
  b = mm512_xoror( td, tb, a ); \
-  td = mm512_xorand( a, td, tb ); \
+  d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
  a = c; \
-  c = mm512_xor3( tb, b, td ); \
-  d = mm512_not( td ); \
+  c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
 }

-
 /*
 #define SBOX8( a, b, c, d ) \
 do { \
@@ -1155,11 +1153,99 @@ do { \
  b = mm256_xoror( td, tb, a ); \
  d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
  a = c; \
-  c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
+  c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
 }

 #else

+#define INPUT_BIG_sub( db_i ) \
+{ \
+     const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
+     m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
+     m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
+     m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
+     m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
+     m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
+     m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
+     m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
+     m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
+     tp += 8; \
+}
+
+#define INPUT_BIG \
+{ \
+  const __m256i db = *buf; \
+  const __m256i zero = m256_zero; \
+  const uint64_t *tp = (const uint64_t*)T512;  \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
+  INPUT_BIG_sub( db ); \
+}
+
+#if 0
+// dependent on the compiler unrolling the loop
 #define INPUT_BIG \
 do { \
  __m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
     tp += 8; \
  } \
 } while (0)
+#endif

 // v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
 #define SBOX( a, b, c, d ) \
@@ -1219,7 +1306,7 @@ do { \
 do { \
   a = mm256_rol_32( a, 13 ); \
   c = mm256_rol_32( c,  3 ); \
-   b = mm256_xor3( a, b, c ); \
+   b = mm256_xor3( b, a, c ); \
   d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
   b = mm256_rol_32( b, 1 ); \
   d = mm256_rol_32( d, 7 ); \
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
   sc->h[6] = c6; \
   sc->h[7] = c7;

+#define INPUT_2x64_sub( db_i ) \
+{ \
+     const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
+     m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
+     m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
+     m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
+     m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
+     m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
+     m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
+     m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
+     m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
+     tp += 8; \
+}
+
+#define INPUT_2x64 \
+{ \
+  const v128u64_t db = *buf; \
+  const v128u64_t zero = v128_zero; \
+  const uint64_t *tp = (const uint64_t*)T512;  \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
+  INPUT_2x64_sub( v128_sl64( db,63 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,62 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,61 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,60 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,59 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,58 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,57 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,56 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,55 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,54 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,53 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,52 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,51 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,50 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,49 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,48 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,47 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,46 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,45 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,44 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,43 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,42 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,41 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,40 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,39 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,38 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,37 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,36 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,35 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,34 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,33 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,32 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,31 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,30 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,29 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,28 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,27 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,26 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,25 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,24 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,23 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,22 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,21 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,20 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,19 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,18 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,17 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,16 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,15 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,14 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,13 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,12 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,11 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,10 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
+  INPUT_2x64_sub( db ); \
+}
+
+#if 0
+// Dependent on the compiler unrolling the loop.
 #define INPUT_2x64 \
 { \
  v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
     tp += 8; \
  } \
 }
+#endif

 // v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
 #define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
 { \
   a = v128_rol32( a, 13 ); \
   c = v128_rol32( c,  3 ); \
-   b = v128_xor3( a, b, c ); \
+   b = v128_xor3( c, a, b ); \
   d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
   b = v128_rol32( b, 1 ); \
   d = v128_rol32( d, 7 ); \
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -231,7 +231,7 @@ static void FFT64( void *a )
   //  Unrolled decimation in frequency (DIF) radix-2 NTT.
   //  Output data is in revbin_permuted order.

-  static const int w[] = {0, 2, 4, 6};
+//  static const int w[] = {0, 2, 4, 6};

 #define BUTTERFLY_0( i,j ) \
 do { \
@@ -240,25 +240,25 @@ do { \
    X(i) = v128_sub16( X(i), v ); \
 } while(0)

-#define BUTTERFLY_N( i,j,n ) \
+#define BUTTERFLY_N( i, j, w_n ) \
 do { \
    v128u16_t v = X(j); \
    X(j) = v128_add16( X(i), X(j) ); \
-    X(i) = v128_sl16( v128_sub16( X(i), v ), w[n] ); \
+    X(i) = v128_sl16( v128_sub16( X(i), v ), w_n ); \
 } while(0)

  BUTTERFLY_0( 0, 4 );
-  BUTTERFLY_N( 1, 5, 1 );
-  BUTTERFLY_N( 2, 6, 2 );
-  BUTTERFLY_N( 3, 7, 3 );
+  BUTTERFLY_N( 1, 5, 2 );
+  BUTTERFLY_N( 2, 6, 4 );
+  BUTTERFLY_N( 3, 7, 6 );

  DO_REDUCE( 2 );
  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
-  BUTTERFLY_N( 1, 3, 2 );
-  BUTTERFLY_N( 5, 7, 2 );
+  BUTTERFLY_N( 1, 3, 4 );
+  BUTTERFLY_N( 5, 7, 4 );

  DO_REDUCE( 1 );

@@ -329,10 +329,10 @@ do { \
 } while(0)


-#define BUTTERFLY_N( i,j,n ) \
+#define BUTTERFLY_N( i, j, w_n ) \
 do { \
   v128u16_t u = X(j); \
-   X(i) = v128_sl16( X(i), w[n] ); \
+   X(i) = v128_sl16( X(i), w_n ); \
   X(j) = v128_sub16( X(j), X(i) ); \
   X(i) = v128_add16( u, X(i) ); \
 } while(0)
@@ -353,15 +353,15 @@ do { \

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
-  BUTTERFLY_N( 1, 3, 2 );
-  BUTTERFLY_N( 5, 7, 2 );
+  BUTTERFLY_N( 1, 3, 4 );
+  BUTTERFLY_N( 5, 7, 4 );

  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 4 );
-  BUTTERFLY_N( 1, 5, 1 );
-  BUTTERFLY_N( 2, 6, 2 );
-  BUTTERFLY_N( 3, 7, 3 );
+  BUTTERFLY_N( 1, 5, 2 );
+  BUTTERFLY_N( 2, 6, 4 );
+  BUTTERFLY_N( 3, 7, 6 );

  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
@@ -853,7 +853,7 @@ static void fft64_2way( void *a )
   //  Unrolled decimation in frequency (DIF) radix-2 NTT.
   //  Output data is in revbin_permuted order.

-  static const int w[] = {0, 2, 4, 6};
+//  static const int w[] = {0, 2, 4, 6};
 //   __m256i *Twiddle = (__m256i*)FFT64_Twiddle;


@@ -864,25 +864,25 @@ do { \
    X(i) = _mm256_sub_epi16( X(i), v ); \
 } while(0)

-#define BUTTERFLY_N( i,j,n ) \
+#define BUTTERFLY_N( i, j, w_n ) \
 do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
-    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
+    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w_n ); \
 } while(0)

  BUTTERFLY_0( 0, 4 );
-  BUTTERFLY_N( 1, 5, 1 );
-  BUTTERFLY_N( 2, 6, 2 );
-  BUTTERFLY_N( 3, 7, 3 );
+  BUTTERFLY_N( 1, 5, 2 );
+  BUTTERFLY_N( 2, 6, 4 );
+  BUTTERFLY_N( 3, 7, 6 );

  DO_REDUCE( 2 );
  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
-  BUTTERFLY_N( 1, 3, 2 );
-  BUTTERFLY_N( 5, 7, 2 );
+  BUTTERFLY_N( 1, 3, 4 );
+  BUTTERFLY_N( 5, 7, 4 );

  DO_REDUCE( 1 );

@@ -953,10 +953,10 @@ do { \
 } while(0)


-#define BUTTERFLY_N( i,j,n ) \
+#define BUTTERFLY_N( i, j, w_n ) \
 do { \
   __m256i u = X(j); \
-   X(i) = _mm256_slli_epi16( X(i), w[n] ); \
+   X(i) = _mm256_slli_epi16( X(i), w_n ); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
 } while(0)
@@ -977,15 +977,15 @@ do { \

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
-  BUTTERFLY_N( 1, 3, 2 );
-  BUTTERFLY_N( 5, 7, 2 );
+  BUTTERFLY_N( 1, 3, 4 );
+  BUTTERFLY_N( 5, 7, 4 );

  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 4 );
-  BUTTERFLY_N( 1, 5, 1 );
-  BUTTERFLY_N( 2, 6, 2 );
-  BUTTERFLY_N( 3, 7, 3 );
+  BUTTERFLY_N( 1, 5, 2 );
+  BUTTERFLY_N( 2, 6, 4 );
+  BUTTERFLY_N( 3, 7, 6 );

  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
@@ -1709,11 +1709,11 @@ do { \
    X(i) = _mm512_sub_epi16( X(i), v ); \
 } while(0)

-#define BUTTERFLY_N( i, j, w ) \
+#define BUTTERFLY_N( i, j, w_n ) \
 do { \
    __m512i v = X(j); \
    X(j) = _mm512_add_epi16( X(i), X(j) ); \
-    X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
+    X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w_n ); \
 } while(0)

  BUTTERFLY_0( 0, 4 );
@@ -1792,10 +1792,10 @@ do { \
 } while(0)


-#define BUTTERFLY_N( i, j, w ) \
+#define BUTTERFLY_N( i, j, w_n ) \
 do { \
   __m512i u = X(j); \
-   X(i) = _mm512_slli_epi16( X(i), w ); \
+   X(i) = _mm512_slli_epi16( X(i), w_n ); \
   X(j) = _mm512_sub_epi16( X(j), X(i) ); \
   X(i) = _mm512_add_epi16( u, X(i) ); \
 } while(0)
--- a/api.c
+++ b/api.c
@@ -531,7 +531,7 @@ static void api()
 	time_t bindstart;
 	struct sockaddr_in serv;
 	struct sockaddr_in cli;
-	socklen_t clisiz;
+	uint32_t clisiz;
 	bool addrok = false;
 	long long counter;
 	char *result;
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,7 +4,7 @@
 # during develpment. However the information contained may provide compilation
 # tips to users.

-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2  > /dev/null
+rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2  > /dev/null

 # AVX512 SHA VAES: Intel Core Icelake, Rocketlake
 make distclean || echo clean
@@ -18,28 +18,55 @@ strip -s cpuminer
 mv cpuminer cpuminer-avx512-sha-vaes

 # Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12
-make clean || echo clean
-rm -f config.status
-CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
-make -j 8
-strip -s cpuminer
-mv cpuminer cpuminer-alderlake
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
+#make -j 8
+#strip -s cpuminer
+#mv cpuminer cpuminer-alderlake

-# Intel Core Arrowlake: AVX2 SHA512 VAES, needs gcc-14
+# Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14
+# Arrowlake-s includes SHA512, Arrowlake does not?
 #make clean || echo clean
 #rm -f config.status
 #CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
 #make -j 8
 #strip -s cpuminer
-#mv cpuminer cpuminer-arrowlake
+#mv cpuminer cpuminer-arrowlake-s
+
+# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
+# Apparently Granitrapids will not include AVX10, SHA512 or APX,
+# wait for Diamondrapids & gcc-15.
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
+#make -j 8
+#strip -s cpuminer
+#mv cpuminer cpuminer-graniterapids
+
+# Force AVX10-256
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=arrowlake-s -mavx10.1-256 -Wall" ./configure --with-curl
+#make -j 8
+#strip -s cpuminer
+#mv cpuminer cpuminer-avx10-256
+
+# Force SHA512 AVX10-512
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1-512 -Wall" ./configure --with-curl
+#make -j 8
+#strip -s cpuminer
+#mv cpuminer cpuminer-avx10-512

 # Zen5: AVX512 SHA VAES, requires gcc-14.
 #make clean || echo clean
 #rm -f config.status
-#CFLAGS="-O3 -march=znver5" ./configure --with-curl
+#CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
 #make -j $(nproc)
 #strip -s cpuminer
-#mv cpuminer cpuminer-zen4
+#mv cpuminer cpuminer-zen5

 # Zen4: AVX512 SHA VAES
 make clean || echo clean
@@ -70,7 +97,7 @@ make -j $(nproc)
 strip -s cpuminer
 mv cpuminer cpuminer-avx512

-# AVX2 SHA VAES: generic
+# AVX2 SHA VAES: generic, zen3, alderlake...arrowlake
 make clean || echo done
 rm -f config.status
 # vaes doesn't include aes
--- a/clean-all.sh
+++ b/clean-all.sh
@@ -2,7 +2,7 @@
 #
 # make clean and rm all the targetted executables.

-rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-aes-sha3-sve2  cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
+rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2  cpuminer-armv8-crypto cpuminer-armv8 > /dev/null

 rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null

--- a/230
+++ b/230
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.4.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.7.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='24.4'
-PACKAGE_STRING='cpuminer-opt 24.4'
+PACKAGE_VERSION='24.7'
+PACKAGE_STRING='cpuminer-opt 24.7'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 24.4 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 24.7 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1432,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 24.4:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 24.7:";;
   esac
  cat <<\_ACEOF

@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 24.4
+cpuminer-opt configure 24.7
 generated by GNU Autoconf 2.71

 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 24.4, which was
+It was created by cpuminer-opt $as_me 24.7, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='24.4'
+ VERSION='24.7'


 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6502,34 +6502,33 @@ then :
 fi


-MINGW_TARGET=`$CC -dumpmachine 2>&1`
-case $MINGW_TARGET in
-  arm*-*-*)
-    have_arm=true
-    ;;
-  i*86-*-mingw*)
-    have_x86=true
-    have_win32=true
-    CFLAGS="-Icompat/pthreads $CFLAGS"
-    PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
-    WS2_LIBS="-lws2_32"
-    ;;
-  x86_64-*-mingw*|amd64-*-mingw*)
-    have_x86_64=true
-    have_win32=true
-    CFLAGS="-Icompat/pthreads $CFLAGS"
-    PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
-    # SHOULD BE AT END! after -lcrypto #
-    WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
-    ;;
+case $target in
  i*86-*-*)
    have_x86=true
    ;;
  x86_64-*-*|amd64-*-*)
    have_x86_64=true
    ;;
+  arm*-*-*)
+    have_arm=true
+    ;;
+  powerpc*-*-*)
+    have_ppc=true
+    ;;
 esac

+PTHREAD_FLAGS="-pthread"
+WS2_LIBS=""
+
+case $target in
+  *-*-mingw*)
+    have_win32=true
+    PTHREAD_FLAGS=""
+    WS2_LIBS="-lws2_32"
+    ;;
+esac
+
+
 # Check whether --enable-assembly was given.
 if test ${enable_assembly+y}
 then :
@@ -6705,51 +6704,7 @@ else $as_nop
 fi


-# GC2 for GNU static
-if test "x$have_win32" = "xtrue" ; then
-   # MinGW
-   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
-printf %s "checking for pthread_create in -lpthread... " >&6; }
-if test ${ac_cv_lib_pthread_pthread_create+y}
-then :
-  printf %s "(cached) " >&6
-else $as_nop
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-lpthread  $LIBS"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-char pthread_create ();
-int
-main (void)
-{
-return pthread_create ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"
-then :
-  ac_cv_lib_pthread_pthread_create=yes
-else $as_nop
-  ac_cv_lib_pthread_pthread_create=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.beam \
-    conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
-fi
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5
-printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
-if test "x$ac_cv_lib_pthread_pthread_create" = xyes
-then :
-  PTHREAD_LIBS="-lpthreadGC2"
-fi
-
-else
-   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
 printf %s "checking for pthread_create in -lpthread... " >&6; }
 if test ${ac_cv_lib_pthread_pthread_create+y}
 then :
@@ -6787,11 +6742,134 @@ printf "%s\n" "$ac_cv_lib_pthread_pthread_create" >&6; }
 if test "x$ac_cv_lib_pthread_pthread_create" = xyes
 then :
  PTHREAD_LIBS="-lpthread"
+else $as_nop
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC2" >&5
+printf %s "checking for pthread_create in -lpthreadGC2... " >&6; }
+if test ${ac_cv_lib_pthreadGC2_pthread_create+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lpthreadGC2  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+char pthread_create ();
+int
+main (void)
+{
+return pthread_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  ac_cv_lib_pthreadGC2_pthread_create=yes
+else $as_nop
+  ac_cv_lib_pthreadGC2_pthread_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC2_pthread_create" >&5
+printf "%s\n" "$ac_cv_lib_pthreadGC2_pthread_create" >&6; }
+if test "x$ac_cv_lib_pthreadGC2_pthread_create" = xyes
+then :
+  PTHREAD_LIBS="-lpthreadGC2"
+else $as_nop
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC1" >&5
+printf %s "checking for pthread_create in -lpthreadGC1... " >&6; }
+if test ${ac_cv_lib_pthreadGC1_pthread_create+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lpthreadGC1  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+char pthread_create ();
+int
+main (void)
+{
+return pthread_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  ac_cv_lib_pthreadGC1_pthread_create=yes
+else $as_nop
+  ac_cv_lib_pthreadGC1_pthread_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC1_pthread_create" >&5
+printf "%s\n" "$ac_cv_lib_pthreadGC1_pthread_create" >&6; }
+if test "x$ac_cv_lib_pthreadGC1_pthread_create" = xyes
+then :
+  PTHREAD_LIBS="-lpthreadGC1"
+else $as_nop
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC" >&5
+printf %s "checking for pthread_create in -lpthreadGC... " >&6; }
+if test ${ac_cv_lib_pthreadGC_pthread_create+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lpthreadGC  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+char pthread_create ();
+int
+main (void)
+{
+return pthread_create ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  ac_cv_lib_pthreadGC_pthread_create=yes
+else $as_nop
+  ac_cv_lib_pthreadGC_pthread_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC_pthread_create" >&5
+printf "%s\n" "$ac_cv_lib_pthreadGC_pthread_create" >&6; }
+if test "x$ac_cv_lib_pthreadGC_pthread_create" = xyes
+then :
+  PTHREAD_LIBS="-lpthreadGC"
+
+fi
+
 fi

 fi

-LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
+fi
+
+
+#LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
 # PTHREAD_LIBS="$PTHREAD_LIBS"

 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5
@@ -7508,7 +7586,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 24.4, which was
+This file was extended by cpuminer-opt $as_me 24.7, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7654,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 24.4
+cpuminer-opt config.status 24.7
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [24.4])
+AC_INIT([cpuminer-opt], [24.7])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
@@ -41,34 +41,33 @@ AC_CHECK_DECLS([be32dec, le32dec, be32enc, le32enc, le16dec, le16enc], [], [],
 AC_FUNC_ALLOCA
 AC_CHECK_FUNCS([getopt_long])

-MINGW_TARGET=`$CC -dumpmachine 2>&1`
-case $MINGW_TARGET in
-  arm*-*-*)
-    have_arm=true
-    ;;
-  i*86-*-mingw*)
-    have_x86=true
-    have_win32=true
-    CFLAGS="-Icompat/pthreads $CFLAGS"
-    PTHREAD_LDFLAGS="-Lcompat/pthreads/x86"
-    WS2_LIBS="-lws2_32"
-    ;;
-  x86_64-*-mingw*|amd64-*-mingw*)
-    have_x86_64=true
-    have_win32=true
-    CFLAGS="-Icompat/pthreads $CFLAGS"
-    PTHREAD_LDFLAGS="-Lcompat/pthreads/x64"
-    # SHOULD BE AT END! after -lcrypto #
-    WS2_LIBS="-L/mingw/x86_64-w64-mingw32/lib -lws2_32"
-    ;;
+case $target in
  i*86-*-*)
    have_x86=true
    ;;
  x86_64-*-*|amd64-*-*)
    have_x86_64=true
    ;;
+  arm*-*-*)
+    have_arm=true
+    ;;
+  powerpc*-*-*)
+    have_ppc=true
+    ;;
 esac

+PTHREAD_FLAGS="-pthread"
+WS2_LIBS=""
+
+case $target in
+  *-*-mingw*)
+    have_win32=true
+    PTHREAD_FLAGS=""
+    WS2_LIBS="-lws2_32"
+    ;;
+esac
+
+
 AC_ARG_ENABLE([assembly],
  AS_HELP_STRING([--disable-assembly], [disable assembly-language routines]))
 if test x$enable_assembly != xno; then
@@ -113,15 +112,13 @@ fi

 AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)

-# GC2 for GNU static
-if test "x$have_win32" = "xtrue" ; then
-   # MinGW
-   AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",[])
-else
-   AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",[])
-fi
+AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
+  AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
+    AC_CHECK_LIB([pthreadGC1], [pthread_create], PTHREAD_LIBS="-lpthreadGC1",
+      AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
+))))

-LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
+#LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
 # PTHREAD_LIBS="$PTHREAD_LIBS"

 AC_MSG_CHECKING(whether __uint128_t is supported)
--- a/4350
+++ b/4350
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1591,13 +1591,13 @@ start:
         last_targetdiff = net_diff;

         applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
-                                work->height, work->tx_count, net_diff,
-                                work->data[ algo_gate.ntime_index ] );
+                             work->height, work->tx_count, net_diff,
+                             bswap_32( work->data[ algo_gate.ntime_index ] ) );
      }
-      else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
+      else if ( memcmp( work->data, g_work.data, algo_gate.work_cmp_size ) )
         applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
-                                work->height, work->tx_count, net_diff,
-                                work->data[ algo_gate.ntime_index ] );
+                             work->height, work->tx_count, net_diff,
+                             bswap_32( work->data[ algo_gate.ntime_index ] ) );
      else
        new_work = false;

@@ -2139,7 +2139,7 @@ static void *miner_thread( void *userdata )
 //   uint32_t end_nonce = opt_benchmark
 //                      ? ( 0xffffffffU / opt_n_threads ) * (thr_id + 1) - 0x20
 //                      : 0;
-   uint32_t end_nonce = 0xffffffffU / opt_n_threads  * (thr_id + 1) - 0x20;
+   uint32_t end_nonce = 0xffffffffU / opt_n_threads  * (thr_id + 1) - opt_n_threads;

   memset( &work, 0, sizeof(work) );
 
@@ -2206,58 +2206,58 @@ static void *miner_thread( void *userdata )
 //       int64_t max64 = 1000;
       int nonce_found = 0;

-//       if ( likely( algo_gate.do_this_thread( thr_id ) ) )
-//       {
-          if ( have_stratum ) 
+       if ( have_stratum ) 
+       {
+          while ( unlikely( stratum_down ) )
+             sleep( 1 );
+          if ( unlikely( ( *nonceptr >= end_nonce )
+                        && !work_restart[thr_id].restart ) )
          {
-             while ( unlikely( stratum_down ) )
-                sleep( 1 );
-             if ( unlikely( ( *nonceptr >= end_nonce )
-                         && !work_restart[thr_id].restart ) )
+             if ( opt_extranonce )
+                stratum_gen_work( &stratum, &g_work );
+             else
             {
-                if ( opt_extranonce )
-                   stratum_gen_work( &stratum, &g_work );
-                else
+                if ( !thr_id )
                {
-                   if ( !thr_id )
-                   {
-                      applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" );
-                      applog( LOG_WARNING, "waiting for new work...");
-                   }
-                   while ( !work_restart[thr_id].restart )
-                      sleep ( 1 );
+                   applog( LOG_WARNING, "Nonce range exhausted, extranonce not subscribed." );
+                   applog( LOG_WARNING, "Waiting for new work...");
                }
+                while ( !work_restart[thr_id].restart )
+                   sleep ( 1 );
             }
          }
-          else if ( !opt_benchmark ) // GBT or getwork
+       }
+       else if ( !opt_benchmark ) // GBT or getwork
+       {
+         // max64 is used to set end_nonce to match the scantime.
+         // It also factors the nonce range to end the scan when nonces are
+         // exhausted. In either case needing new work can be assumed.
+         // Only problem is every thread will call get_work.
+         // First thread resets scantime blocking all subsequent threads
+         // from fetching new work.
+
+          pthread_rwlock_wrlock( &g_work_lock );
+          const time_t now = time(NULL);
+          if ( ( ( now - g_work_time ) >= opt_scantime )
+             || ( *nonceptr >= end_nonce ) )
          {
-             pthread_rwlock_wrlock( &g_work_lock );
-
-             if ( ( ( time(NULL) - g_work_time ) >= opt_scantime )
-               || ( *nonceptr >= end_nonce ) )
+             if ( unlikely( !get_work( mythr, &g_work ) ) )
             {
-                if ( unlikely( !get_work( mythr, &g_work ) ) )
-                {
-                   pthread_rwlock_unlock( &g_work_lock );
-		             applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
-		             goto out;
-	             }
-                g_work_time = time(NULL);
-//                restart_threads();
-             }
-
-             pthread_rwlock_unlock( &g_work_lock );
+                pthread_rwlock_unlock( &g_work_lock );
+                applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
+		          goto out;
+	          }
+             g_work_time = now;
          }
-
-          pthread_rwlock_rdlock( &g_work_lock );
-
-          algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce );
-          work_restart[thr_id].restart = 0;
-
          pthread_rwlock_unlock( &g_work_lock );
+       }

-//       } // do_this_thread
-//       algo_gate.resync_threads( thr_id, &work );
+       pthread_rwlock_rdlock( &g_work_lock );
+
+       algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce );
+       work_restart[thr_id].restart = 0;
+
+       pthread_rwlock_unlock( &g_work_lock );

       // conditional mining
       if ( unlikely( !wanna_mine( thr_id ) ) )
@@ -2315,12 +2315,6 @@ static void *miner_thread( void *userdata )
       gettimeofday( (struct timeval *) &tv_start, NULL );

       // Scan for nonce
-//       nonce_found = scanhash_sha256dt_ref( &work, max_nonce, &hashes_done,
-//                                         mythr );
-//       nonce_found = scanhash_sha256dt_4x32( &work, max_nonce, &hashes_done,
-//                                         mythr );
-
-
       nonce_found = algo_gate.scanhash( &work, max_nonce, &hashes_done,
                                         mythr );

@@ -2342,8 +2336,8 @@ static void *miner_thread( void *userdata )
       // If unsubmiited nonce(s) found, submit now. 
       if ( unlikely( nonce_found && !opt_benchmark ) )
       {  
-//          applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
-//                               algo_names[ opt_algo ] );
+          applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
+                               algo_names[ opt_algo ] );
          if ( !submit_work( mythr, &work ) )
          {
             applog( LOG_WARNING, "Failed to submit share." );
@@ -2846,8 +2840,6 @@ static void show_credits()
 static bool cpu_capability( bool display_only )
 {
     char cpu_brand[0x40];
-     bool cpu_has_aarch64  = cpu_arch_aarch64();
-     bool cpu_has_x86_64   = cpu_arch_x86_64();
     bool cpu_has_sse2     = has_sse2();     // X86_64 only
     bool cpu_has_ssse3    = has_ssse3();    // X86_64 only
     bool cpu_has_sse41    = has_sse41();    // X86_64 only
@@ -2920,7 +2912,7 @@ static bool cpu_capability( bool display_only )
           sw_arm_arch = __ARM_ARCH;
         #endif
     #endif
-     // x86_64_only
+     // x86_64 only
     #if defined(__SSE2__)
         sw_has_sse2 = true;
     #endif
@@ -3004,57 +2996,57 @@ static bool cpu_capability( bool display_only )
     #endif

     printf("CPU features: ");
-     if ( cpu_has_x86_64  )
+     if ( cpu_arch_x86_64()  )
     {
-       if      ( cpu_has_avx512 )    printf( " AVX512"  );
-       else if ( cpu_has_avx2   )    printf( " AVX2  "  );
-       else if ( cpu_has_avx    )    printf( " AVX   "  );
-       else if ( cpu_has_sse42  )    printf( " SSE4.2"  );
-       else if ( cpu_has_sse41  )    printf( " SSE4.1"  );
-       else if ( cpu_has_ssse3  )    printf( " SSSE3 "  );
-       else if ( cpu_has_sse2   )    printf( " SSE2  "  );
+       if      ( cpu_has_avx10  )    printf( " AVX10.%d-%d", avx10_version(),
+                                                       avx10_vector_length() );
+       if      ( cpu_has_avx512 )    printf( " AVX512" );
+       else if ( cpu_has_avx2   )    printf( " AVX2  " );
+       else if ( cpu_has_avx    )    printf( " AVX   " );
+       else if ( cpu_has_sse42  )    printf( " SSE4.2" );
+       else if ( cpu_has_sse41  )    printf( " SSE4.1" );
+       else if ( cpu_has_ssse3  )    printf( " SSSE3 " );
+       else if ( cpu_has_sse2   )    printf( " SSE2  " );
     }
-     else if   ( cpu_has_aarch64 )
+     else if   ( cpu_arch_aarch64() )
     {
       if      ( cpu_has_neon   )    printf( "       NEON" );
       if      ( cpu_has_sve2   )    printf( " SVE2-%d", sve_vector_length() );
-       else if ( cpu_has_sve    )    printf( " SVE"     );
-       if      ( cpu_has_sme2   )    printf( " SME2"    );
-       else if ( cpu_has_sme    )    printf( " SME"     );
-     }
-     if        ( cpu_has_vaes   )    printf( " VAES"    );
-     else if   ( cpu_has_aes    )    printf( "  AES"    );
-     if        ( cpu_has_sha512 )    printf( " SHA512"  );
-     else if   ( cpu_has_sha256 )    printf( " SHA256"  );
-     if        ( cpu_has_avx10  )    printf( " AVX10.%d-%d",
-                                      avx10_version(), avx10_vector_length() );
+       else if ( cpu_has_sve    )    printf( " SVE"    );
+       if      ( cpu_has_sme2   )    printf( " SME2"   );
+       else if ( cpu_has_sme    )    printf( " SME"    );
+     }     
+     if        ( cpu_has_vaes   )    printf( " VAES"   );
+     else if   ( cpu_has_aes    )    printf( "  AES"   );
+     if        ( cpu_has_sha512 )    printf( " SHA512" );
+     else if   ( cpu_has_sha256 )    printf( " SHA256" );

     printf("\nSW features:  ");
     if ( sw_has_x86_64 )
     {                     
-        if      ( sw_has_avx512  )   printf( " AVX512"  );
-        else if ( sw_has_avx2    )   printf( " AVX2  "  );
-        else if ( sw_has_avx     )   printf( " AVX   "  );
-        else if ( sw_has_sse42   )   printf( " SSE4.2"  );
-        else if ( sw_has_sse41   )   printf( " SSE4.1"  );
-        else if ( sw_has_ssse3   )   printf( " SSSE3 "  );
-        else if ( sw_has_sse2    )   printf( " SSE2  "  );
        if      ( sw_has_avx10_512 ) printf( " AVX10-512" );
        else if ( sw_has_avx10_256 ) printf( " AVX10-256" );
+        else if ( sw_has_avx512    ) printf( " AVX512" );
+        else if ( sw_has_avx2      ) printf( " AVX2  " );
+        else if ( sw_has_avx       ) printf( " AVX   " );
+        else if ( sw_has_sse42     ) printf( " SSE4.2" );
+        else if ( sw_has_sse41     ) printf( " SSE4.1" );
+        else if ( sw_has_ssse3     ) printf( " SSSE3 " );
+        else if ( sw_has_sse2      ) printf( " SSE2  " );
     }
     else if    ( sw_has_aarch64 ) 
     {
        if      ( sw_arm_arch    )   printf( " armv%d", sw_arm_arch );
-        if      ( sw_has_neon    )   printf( " NEON"    );
-        if      ( sw_has_sve2    )   printf( " SVE2"    );
-        else if ( sw_has_sve     )   printf( " SVE"     );
-        if      ( sw_has_sme2    )   printf( " SME2"    );
-        else if ( sw_has_sme     )   printf( " SME"     );
+        if      ( sw_has_neon    )   printf( " NEON"   );
+        if      ( sw_has_sve2    )   printf( " SVE2"   );
+        else if ( sw_has_sve     )   printf( " SVE"    );
+        if      ( sw_has_sme2    )   printf( " SME2"   );
+        else if ( sw_has_sme     )   printf( " SME"    );
     }
-     if         ( sw_has_vaes    )   printf( " VAES"    );
-     else if    ( sw_has_aes     )   printf( "  AES"    );
-     if         ( sw_has_sha512  )   printf( " SHA512"  );
-     else if    ( sw_has_sha256  )   printf( " SHA256"  );
+     if         ( sw_has_vaes    )   printf( " VAES"   );
+     else if    ( sw_has_aes     )   printf( "  AES"   );
+     if         ( sw_has_sha512  )   printf( " SHA512" );
+     else if    ( sw_has_sha256  )   printf( " SHA256" );

     if ( !display_only )
     {
--- a/miner.h
+++ b/miner.h
@@ -191,7 +191,7 @@ static inline uint32_t swab32(uint32_t x)
   return __builtin_bswap32(x);
 #else
   return ( ( (x) << 24 ) & 0xff000000u ) | ( ( (x) <<  8 ) & 0x00ff0000u )
-        | ( ( (x) >>  8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu )
+        | ( ( (x) >>  8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu );


 //   return bswap_32(v);
@@ -644,7 +644,6 @@ enum algos {
        ALGO_SHA256T,
        ALGO_SHA3D,
        ALGO_SHA512256D,
-        ALGO_SHAVITE3,    
        ALGO_SKEIN,       
        ALGO_SKEIN2,      
        ALGO_SKUNK,
@@ -740,7 +739,6 @@ static const char* const algo_names[] = {
        "sha256t",
        "sha3d",
        "sha512256d",
-        "shavite3",
        "skein",
        "skein2",
        "skunk",
@@ -904,7 +902,6 @@ Options:\n\
                          sha256t       Triple SHA-256, Onecoin (OC)\n\
                          sha3d         Double Keccak256 (BSHA3)\n\
                          sha512256d    Double SHA-512 (Radiant)\n\
-                          shavite3      Shavite3\n\
                          skein         Skein+Sha (Skeincoin)\n\
                          skein2        Double Skein (Woodcoin)\n\
                          skunk         Signatum (SIGT)\n\
--- a/simd-utils.h
+++ b/simd-utils.h
@@ -141,28 +141,56 @@
 #include <stdint.h>
 #include <stddef.h>

-// SIMD512: Use 512, 256 & 128 bit vectors, excludes AVX512VBMI
-// VL256: Include AVX512VL instructions on 256 & 128 bit vectors
-// VBMI: Include AVX512VBMI instructions on all vectors.
+// AVX512 macros are not a reliable indicator of 512 bit vector capability
+// because they get defined with AVX10_1_256 which doesn't support 512 bit.
+// EVEX512 is also unreliable as it can also be defined when 512b is not
+// available.
+// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present.
+// Use AVX512 macros only whithout AVX10.

-// AVX10 can exist without support for 512 bit vectors.
-#if defined(__AVX10_1_512__)
-  #define SIMD512 1
-#elif !defined(__AVX10_1__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-   #define SIMD512 1
+/*
+// Test for macros
+#ifdef __AVX10_1__
+#warning "__AVX10_1__"
 #endif
+#ifdef __AVX10_1_256__
+#warning "__AVX10_1_256__"
+#endif
+#ifdef __AVX10_1_512__
+#warning "__AVX10_1_512__"
+#endif
+#ifdef __EVEX256__
+#warning "__EVEX256__"
+#endif
+#ifdef __EVEX512__
+#warning "__EVEX512__"
+#endif
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#warning "AVX512"
+#endif
+*/
+
+// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and
+// must be tested seperately. 
+// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
+// VBMI: Include AVX512VBMI instructions for supported vector lengths.

-// AVX512VL instructions applied to 256 & 128 bit vectors is supported with 
-// either AVX512VL or any version of AVX10.
 #if defined(__AVX10_1__)
-  #define VL256 1
-#elif defined(__AVX512VL__)
-  #define VL256 1
-#endif

-// VBMI does not exist on early versions of AVX512
-#if defined(__AVX10_1__) || defined(__AVX512VBMI__)
+  #define VL256 1
  #define VBMI 1
+  #if defined(__AVX10_1_512__)
+    #define SIMD512 1
+  #endif
+
+#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+  #define VL256 1
+  #define SIMD512 1
+  #if defined(__AVX512VBMI__)
+    #define VBMI 1
+  #endif
+
 #endif

 /*
@@ -201,10 +229,6 @@
 // x86_64 AVX512 512 bit vectors
 #include "simd-utils/simd-512.h"

-// move up after cleaning
-// CPU architectire abstraction
-//#include "simd-utils/simd-portable.h"
-
 // aarch64 neon 128 bit vectors
 #include "simd-utils/simd-neon.h"

--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -86,7 +86,7 @@ static inline void extr_lane_2x32( void *dst, const void *src,

 // 4x32

-#if ( defined(__x86_64__) && defined(__SSE2__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) )
+#if defined(__x86_64__) && defined(__SSE2__) 

 #define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
 { \
@@ -174,6 +174,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0,
   STOR_DEST_4x32( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 );
 }

+
 static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
                           void *dst3, const void *src, const int bit_len )
 {
@@ -235,6 +236,190 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
   STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 );
 }

+#elif defined(__aarch64__) && defined(__ARM_NEON)
+
+static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
+                      const void *src2, const void *src3, const int bit_len )
+{
+   uint32x4x4_t s;
+
+   s.val[0] = casti_v128u32( src0, 0 );
+   s.val[1] = casti_v128u32( src1, 0 );
+   s.val[2] = casti_v128u32( src2, 0 );
+   s.val[3] = casti_v128u32( src3, 0 );
+   vst4q_u32( dst, s );
+
+   s.val[0] = casti_v128u32( src0, 1 );
+   s.val[1] = casti_v128u32( src1, 1 );
+   s.val[2] = casti_v128u32( src2, 1 );
+   s.val[3] = casti_v128u32( src3, 1 );
+   vst4q_u32( dst + 64, s );
+   
+   if ( bit_len <= 256 ) return;
+
+   s.val[0] = casti_v128u32( src0, 2 );
+   s.val[1] = casti_v128u32( src1, 2 );
+   s.val[2] = casti_v128u32( src2, 2 );
+   s.val[3] = casti_v128u32( src3, 2 );
+   vst4q_u32( dst + 128, s );
+
+   s.val[0] = casti_v128u32( src0, 3 );
+   s.val[1] = casti_v128u32( src1, 3 );
+   s.val[2] = casti_v128u32( src2, 3 );
+   s.val[3] = casti_v128u32( src3, 3 );
+   vst4q_u32( dst + 192, s );
+
+   if ( bit_len <= 512 ) return;
+
+   s.val[0] = casti_v128u32( src0, 4 );
+   s.val[1] = casti_v128u32( src1, 4 );
+   s.val[2] = casti_v128u32( src2, 4 );
+   s.val[3] = casti_v128u32( src3, 4 );
+   vst4q_u32( dst + 256, s );
+
+   if ( bit_len <= 640 ) return;
+
+   s.val[0] = casti_v128u32( src0, 5 );
+   s.val[1] = casti_v128u32( src1, 5 );
+   s.val[2] = casti_v128u32( src2, 5 );
+   s.val[3] = casti_v128u32( src3, 5 );
+   vst4q_u32( dst + 320, s );
+
+   s.val[0] = casti_v128u32( src0, 6 );
+   s.val[1] = casti_v128u32( src1, 6 );
+   s.val[2] = casti_v128u32( src2, 6 );
+   s.val[3] = casti_v128u32( src3, 6 );
+   vst4q_u32( dst + 384, s );
+
+   s.val[0] = casti_v128u32( src0, 7 );
+   s.val[1] = casti_v128u32( src1, 7 );
+   s.val[2] = casti_v128u32( src2, 7 );
+   s.val[3] = casti_v128u32( src3, 7 );
+   vst4q_u32( dst + 448, s );
+
+// if ( bit_len <= 1024 return;
+}
+
+static inline void intrlv_4x32_512( void *dst, const void *src0,
+                     const void *src1, const void *src2, const void *src3 )
+{
+   uint32x4x4_t s;
+
+   s.val[0] = casti_v128u32( src0, 0 );
+   s.val[1] = casti_v128u32( src1, 0 );
+   s.val[2] = casti_v128u32( src2, 0 );
+   s.val[3] = casti_v128u32( src3, 0 );
+   vst4q_u32( dst, s );
+
+   s.val[0] = casti_v128u32( src0, 1 );
+   s.val[1] = casti_v128u32( src1, 1 );
+   s.val[2] = casti_v128u32( src2, 1 );
+   s.val[3] = casti_v128u32( src3, 1 );
+   vst4q_u32( dst + 64, s );
+
+   s.val[0] = casti_v128u32( src0, 2 );
+   s.val[1] = casti_v128u32( src1, 2 );
+   s.val[2] = casti_v128u32( src2, 2 );
+   s.val[3] = casti_v128u32( src3, 2 );
+   vst4q_u32( dst + 128, s );
+
+   s.val[0] = casti_v128u32( src0, 3 );
+   s.val[1] = casti_v128u32( src1, 3 );
+   s.val[2] = casti_v128u32( src2, 3 );
+   s.val[3] = casti_v128u32( src3, 3 );
+   vst4q_u32( dst + 192, s );
+}
+
+static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
+                           void *dst3, const void *src, int bit_len )
+{
+   uint32x4x4_t s = vld4q_u32( src );
+
+   casti_v128( dst0, 0 ) = s.val[0];
+   casti_v128( dst1, 0 ) = s.val[1];
+   casti_v128( dst2, 0 ) = s.val[2];
+   casti_v128( dst3, 0 ) = s.val[3];
+
+   s = vld4q_u32( src + 64 );
+   casti_v128( dst0, 1 ) = s.val[0];
+   casti_v128( dst1, 1 ) = s.val[1];
+   casti_v128( dst2, 1 ) = s.val[2];
+   casti_v128( dst3, 1 ) = s.val[3];
+
+   if ( bit_len <= 256 ) return;
+
+   s = vld4q_u32( src + 128 );
+   casti_v128( dst0, 2 ) = s.val[0];
+   casti_v128( dst1, 2 ) = s.val[1];
+   casti_v128( dst2, 2 ) = s.val[2];
+   casti_v128( dst3, 2 ) = s.val[3];
+
+   s = vld4q_u32( src + 192 );
+   casti_v128( dst0, 3 ) = s.val[0];
+   casti_v128( dst1, 3 ) = s.val[1];
+   casti_v128( dst2, 3 ) = s.val[2];
+   casti_v128( dst3, 3 ) = s.val[3];
+
+   if ( bit_len <= 512 ) return;
+
+   s = vld4q_u32( src + 256 );
+   casti_v128( dst0, 4 ) = s.val[0];
+   casti_v128( dst1, 4 ) = s.val[1];
+   casti_v128( dst2, 4 ) = s.val[2];
+   casti_v128( dst3, 4 ) = s.val[3];
+
+   if ( bit_len <= 640 ) return;
+
+   s = vld4q_u32( src + 320 );
+   casti_v128( dst0, 5 ) = s.val[0];
+   casti_v128( dst1, 5 ) = s.val[1];
+   casti_v128( dst2, 5 ) = s.val[2];
+   casti_v128( dst3, 5 ) = s.val[3];
+
+   s = vld4q_u32( src + 384 );
+   casti_v128( dst0, 6 ) = s.val[0];
+   casti_v128( dst1, 6 ) = s.val[1];
+   casti_v128( dst2, 6 ) = s.val[2];
+   casti_v128( dst3, 6 ) = s.val[3];
+
+   s = vld4q_u32( src + 448 );
+   casti_v128( dst0, 6 ) = s.val[0];
+   casti_v128( dst1, 6 ) = s.val[1];
+   casti_v128( dst2, 6 ) = s.val[2];
+   casti_v128( dst3, 6 ) = s.val[3];
+
+//   if ( bit_len <= 1024 ) return;
+}
+
+static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
+                           void *dst3, const void *src )
+{
+   uint32x4x4_t s = vld4q_u32( src );
+
+   casti_v128( dst0, 0 ) = s.val[0];
+   casti_v128( dst1, 0 ) = s.val[1];
+   casti_v128( dst2, 0 ) = s.val[2];
+   casti_v128( dst3, 0 ) = s.val[3];
+
+   s = vld4q_u32( src + 64 );
+   casti_v128( dst0, 1 ) = s.val[0];
+   casti_v128( dst1, 1 ) = s.val[1];
+   casti_v128( dst2, 1 ) = s.val[2];
+   casti_v128( dst3, 1 ) = s.val[3];
+
+   s = vld4q_u32( src + 128 );
+   casti_v128( dst0, 2 ) = s.val[0];
+   casti_v128( dst1, 2 ) = s.val[1];
+   casti_v128( dst2, 2 ) = s.val[2];
+   casti_v128( dst3, 2 ) = s.val[3];
+
+   s = vld4q_u32( src + 192 );
+   casti_v128( dst0, 3 ) = s.val[0];
+   casti_v128( dst1, 3 ) = s.val[1];
+   casti_v128( dst2, 3 ) = s.val[2];
+   casti_v128( dst3, 3 ) = s.val[3];
+}
+
 #else  // !SSE2 && !NEON

 static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
@@ -456,15 +641,13 @@ static inline void v128_bswap32_80( void *d, void *s )

 #endif

-#if defined(__SSE2__)
-
 static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
 {
-  v128_t s0 = casti_v128( src,0 );
-  v128_t s1 = casti_v128( src,1 );
-  v128_t s2 = casti_v128( src,2 );
-  v128_t s3 = casti_v128( src,3 );
-  v128_t s4 = casti_v128( src,4 );
+  v128u32_t s0 = casti_v128u32( src,0 );
+  v128u32_t s1 = casti_v128u32( src,1 );
+  v128u32_t s2 = casti_v128u32( src,2 );
+  v128u32_t s3 = casti_v128u32( src,3 );
+  v128u32_t s4 = casti_v128u32( src,4 );

 #if defined(__SSSE3__)

@@ -487,79 +670,34 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )

 #endif

-  casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
-  casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
-  casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
-  casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
+  casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
+  casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
+  casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
+  casti_v128u32( d, 3 ) = v128_duplane32( s0, 3 );

-  casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
-  casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
-  casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
-  casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
+  casti_v128u32( d, 4 ) = v128_duplane32( s1, 0 );
+  casti_v128u32( d, 5 ) = v128_duplane32( s1, 1 );
+  casti_v128u32( d, 6 ) = v128_duplane32( s1, 2 );
+  casti_v128u32( d, 7 ) = v128_duplane32( s1, 3 );

-  casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
-  casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
-  casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
-  casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
+  casti_v128u32( d, 8 ) = v128_duplane32( s2, 0 );
+  casti_v128u32( d, 9 ) = v128_duplane32( s2, 1 );
+  casti_v128u32( d,10 ) = v128_duplane32( s2, 2 );
+  casti_v128u32( d,11 ) = v128_duplane32( s2, 3 );

-  casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
-  casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
-  casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
-  casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
+  casti_v128u32( d,12 ) = v128_duplane32( s3, 0 );
+  casti_v128u32( d,13 ) = v128_duplane32( s3, 1 );
+  casti_v128u32( d,14 ) = v128_duplane32( s3, 2 );
+  casti_v128u32( d,15 ) = v128_duplane32( s3, 3 );

-  casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
-  casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
-  casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
-  casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
+  casti_v128u32( d,16 ) = v128_duplane32( s2, 0 );
+  casti_v128u32( d,17 ) = v128_duplane32( s2, 1 );
+  casti_v128u32( d,18 ) = v128_duplane32( s2, 2 );
+  casti_v128u32( d,19 ) = v128_duplane32( s2, 3 );
 }

-#elif defined(__aarch64__) && defined(__ARM_NEON)
-
-static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
-{
-  v128_t s0 = casti_v128( src,0 );
-  v128_t s1 = casti_v128( src,1 );
-  v128_t s2 = casti_v128( src,2 );
-  v128_t s3 = casti_v128( src,3 );
-  v128_t s4 = casti_v128( src,4 );
-
-  s0 = v128_bswap32( s0 );
-  s1 = v128_bswap32( s1 );
-  s2 = v128_bswap32( s2 );
-  s3 = v128_bswap32( s3 );
-  s4 = v128_bswap32( s4 );
-
-  casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 );
-  casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 );
-  casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 );
-  casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 );
-
-  casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 );
-  casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 );
-  casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 );
-  casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 );
-
-  casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
-  casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
-  casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
-  casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );
-
-  casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
-  casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
-  casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
-  casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );
-
-  casti_v128( d,16 ) = vdupq_laneq_u32( s2, 0 );
-  casti_v128( d,17 ) = vdupq_laneq_u32( s2, 1 );
-  casti_v128( d,18 ) = vdupq_laneq_u32( s2, 2 );
-  casti_v128( d,19 ) = vdupq_laneq_u32( s2, 3 );
-}
-
-#endif
-
 // 8x32

-
 #if defined(__AVX2__)

 #define ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, \
@@ -1544,7 +1682,9 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
 //
 //     64 bit data

-// 2x64    SSE2, NEON
+// 2x64    
+
+#if defined(__x86_64__) && defined(__SSE2__)

 static inline void intrlv_2x64( void *dst, const void *src0,
                                const void *src1, const int bit_len )
@@ -1602,7 +1742,101 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
   d1[7] = v128_unpackhi64( s[14], s[15] );
 }

-/*
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+
+static inline void intrlv_2x64( void *dst, const void *src0,
+                                const void *src1, const int bit_len )
+{
+   uint64x2x2_t s;
+
+   s.val[0] = casti_v128u64( src0, 0 );
+   s.val[1] = casti_v128u64( src1, 0 );
+   vst2q_u64( dst, s );
+
+   s.val[0] = casti_v128u64( src0, 1 );
+   s.val[1] = casti_v128u64( src1, 1 );
+   vst2q_u64( dst + 32, s );
+   
+   if ( bit_len <= 256 ) return;
+
+   s.val[0] = casti_v128u64( src0, 2 );
+   s.val[1] = casti_v128u64( src1, 2 );
+   vst2q_u64( dst + 64, s );
+
+   s.val[0] = casti_v128u64( src0, 3 );
+   s.val[1] = casti_v128u64( src1, 3 );
+   vst2q_u64( dst + 96, s );
+
+   if ( bit_len <= 512 ) return;
+
+   s.val[0] = casti_v128u64( src0, 4 );
+   s.val[1] = casti_v128u64( src1, 4 );
+   vst2q_u64( dst + 128, s );
+
+   if ( bit_len <= 640 ) return;
+
+   s.val[0] = casti_v128u64( src0, 5 );
+   s.val[1] = casti_v128u64( src1, 5 );
+   vst2q_u64( dst + 160, s );
+
+   s.val[0] = casti_v128u64( src0, 6 );
+   s.val[1] = casti_v128u64( src1, 6 );
+   vst2q_u64( dst + 192, s );
+
+   s.val[0] = casti_v128u64( src0, 7 );
+   s.val[1] = casti_v128u64( src1, 7 );
+   vst2q_u64( dst + 224, s );
+
+//   if ( bit_len <= 1024 ) return;
+}
+
+static inline void dintrlv_2x64( void *dst0, void *dst1,
+                                 const void *src, const int bit_len )
+{
+   uint64x2x2_t s = vld2q_u64( src );
+
+   casti_v128u64( dst0, 0 ) = s.val[0];
+   casti_v128u64( dst1, 0 ) = s.val[1];
+
+   s = vld2q_u64( src + 32 );
+   casti_v128u64( dst0, 1 ) = s.val[0];
+   casti_v128u64( dst1, 1 ) = s.val[1];
+
+   if ( bit_len <= 256 ) return;
+
+   s = vld2q_u64( src + 64 );
+   casti_v128u64( dst0, 2 ) = s.val[0];
+   casti_v128u64( dst1, 2 ) = s.val[1];
+
+   s = vld2q_u64( src + 96 );
+   casti_v128u64( dst0, 3 ) = s.val[0];
+   casti_v128u64( dst1, 3 ) = s.val[1];
+
+   if ( bit_len <= 512 ) return;
+
+   s = vld2q_u64( src + 128 );
+   casti_v128u64( dst0, 4 ) = s.val[0];
+   casti_v128u64( dst1, 4 ) = s.val[1];
+
+   if ( bit_len <= 640 ) return;
+
+   s = vld2q_u64( src + 160 );
+   casti_v128u64( dst0, 5 ) = s.val[0];
+   casti_v128u64( dst1, 5 ) = s.val[1];  
+
+   s = vld2q_u64( src + 192 );
+   casti_v128u64( dst0, 6 ) = s.val[0];
+   casti_v128u64( dst1, 6 ) = s.val[1];   
+
+   s = vld2q_u64( src + 224 );
+   casti_v128u64( dst0, 7 ) = s.val[0];
+   casti_v128u64( dst1, 7 ) = s.val[1];   
+
+//   if ( bit_len <= 1024 ) return;
+}
+   
+#else
+
 static inline void intrlv_2x64( void *dst, const void *src0,
                                const void *src1, const int bit_len )
 {
@@ -1621,8 +1855,7 @@ static inline void intrlv_2x64( void *dst, const void *src0,
   d[24] = s0[12];    d[25] = s1[12];   d[26] = s0[13];    d[27] = s1[13];
   d[28] = s0[14];    d[29] = s1[14];   d[30] = s0[15];    d[31] = s1[15];
 }
-*/
-/*
+
 static inline void dintrlv_2x64( void *dst0, void *dst1,
                                 const void *src, const int bit_len )
 {
@@ -1642,15 +1875,16 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
   d0[12] = s[24];   d1[12] = s[25];   d0[13] = s[26];   d1[13] = s[27];
   d0[14] = s[28];   d1[14] = s[29];   d0[15] = s[30];   d1[15] = s[31];
 }
-*/
+
+#endif

 static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
 {
-  v128_t s0 = casti_v128( src,0 );
-  v128_t s1 = casti_v128( src,1 );
-  v128_t s2 = casti_v128( src,2 );
-  v128_t s3 = casti_v128( src,3 );
-  v128_t s4 = casti_v128( src,4 );
+  v128u64_t s0 = casti_v128u64( src,0 );
+  v128u64_t s1 = casti_v128u64( src,1 );
+  v128u64_t s2 = casti_v128u64( src,2 );
+  v128u64_t s3 = casti_v128u64( src,3 );
+  v128u64_t s4 = casti_v128u64( src,4 );

 #if defined(__SSSE3__)

@@ -1673,41 +1907,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )

 #endif

-#if defined(__SSE2__)
+  casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
+  casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );

-  casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
-  casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
+  casti_v128u64( d,2 ) = v128_duplane64( s1, 0 );
+  casti_v128u64( d,3 ) = v128_duplane64( s1, 1 );

-  casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
-  casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
+  casti_v128u64( d,4 ) = v128_duplane64( s2, 0 );
+  casti_v128u64( d,5 ) = v128_duplane64( s2, 1 );

-  casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
-  casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
+  casti_v128u64( d,6 ) = v128_duplane64( s3, 0 );
+  casti_v128u64( d,7 ) = v128_duplane64( s3, 1 );

-  casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
-  casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
-
-  casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
-  casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
-
-#elif defined(__ARM_NEON)
-
-  casti_v128u64( d,0 ) = vdupq_laneq_u64( (uint64x2_t)s0, 0 );
-  casti_v128u64( d,1 ) = vdupq_laneq_u64( (uint64x2_t)s0, 1 );
-
-  casti_v128u64( d,2 ) = vdupq_laneq_u64( (uint64x2_t)s1, 0 );
-  casti_v128u64( d,3 ) = vdupq_laneq_u64( (uint64x2_t)s1, 1 );
-
-  casti_v128u64( d,4 ) = vdupq_laneq_u64( (uint64x2_t)s2, 0 );
-  casti_v128u64( d,5 ) = vdupq_laneq_u64( (uint64x2_t)s2, 1 );
-
-  casti_v128u64( d,6 ) = vdupq_laneq_u64( (uint64x2_t)s3, 0 );
-  casti_v128u64( d,7 ) = vdupq_laneq_u64( (uint64x2_t)s3, 1 );
-
-  casti_v128u64( d,8 ) = vdupq_laneq_u64( (uint64x2_t)s4, 0 );
-  casti_v128u64( d,9 ) = vdupq_laneq_u64( (uint64x2_t)s4, 1 );
-
-#endif
+  casti_v128u64( d,8 ) = v128_duplane64( s4, 0 );
+  casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
 }

 static inline void extr_lane_2x64( void *dst, const void *src,
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -32,6 +32,14 @@
 // Intrinsics automatically promote from REX to VEX when AVX is available
 // but ASM needs to be done manually.
 //
+// APX supports EGPR which adds 16 more GPRs and 3 operand instructions.
+// This may affect ASM that include instructions that are superseded by APX
+// versions and are therefore incompatible with APX.
+// As a result GCC-14 disables EGPR by default and can be enabled with
+// "-mapx-inline-asm-use-gpr32"
+//TODO
+// Some ASM functions may need to be updated to support EGPR with APX.
+//
 ///////////////////////////////////////////////////////////////////////////////

 // New architecturally agnostic syntax: 
@@ -164,7 +172,7 @@ typedef union
 // necessary the cvt, set, or set1 intrinsics can be used allowing the
 // compiler to exploit new features to produce optimum code.
 // Currently only used internally and by Luffa.
-
+// It also has implications for APX EGPR feature.

 #define v128_mov64       _mm_cvtsi64_si128
 #define v128_mov32       _mm_cvtsi32_si128
@@ -431,11 +439,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

 #define v128_ornot( v1, v0 )      _mm_or_si128( v128_not( v1 ), v0 )

-#define v128_xor3( a, b, c )      _mm_xor_si128( a, _mm_xor_si128( b, c ) )
+#define v128_xor3( a, b, c )      _mm_xor_si128( _mm_xor_si128( a, b ), c )

-#define v128_and3( a, b, c )      _mm_and_si128( a, _mm_and_si128( b, c ) )
+#define v128_and3( a, b, c )      _mm_and_si128( _mm_and_si128( a, b ), c )

-#define v128_or3( a, b, c )       _mm_or_si128( a, _mm_or_si128( b, c ) )
+#define v128_or3( a, b, c )       _mm_or_si128( _mm_or_si128( a, b ), c )

 #define v128_xorand( a, b, c )    _mm_xor_si128( a, _mm_and_si128( b, c ) )

--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -174,17 +174,22 @@ static inline __m256i mm256_not( const __m256i v )

 #define mm256_ornot( v1, v0 )      _mm256_or_si256( mm256_not( v1 ), v0 )

+// usage hints to improve performance when ternary logic is not avalable:
+// If overwriting an input arg put that arg first so the intermediate
+// result can be stored in the dest.
+// Put an arg with the nearest dependency last so independant args can be
+// processed first.
 #define mm256_xor3( a, b, c ) \
-  _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
+  _mm256_xor_si256( _mm256_xor_si256( a, b ), c )

 #define mm256_xor4( a, b, c, d ) \
  _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )

 #define mm256_and3( a, b, c ) \
-  _mm256_and_si256( a, _mm256_and_si256( b, c ) )
+  _mm256_and_si256( _mm256_and_si256( a, b ), c )

 #define mm256_or3( a, b, c ) \
-   _mm256_or_si256( a, _mm256_or_si256( b, c ) )
+   _mm256_or_si256( _mm256_or_si256( a, b ), c )

 #define mm256_xorand( a, b, c ) \
  _mm256_xor_si256( a, _mm256_and_si256( b, c ) )
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -125,7 +125,7 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
 // Pseudo constants.
 #define m512_zero       _mm512_setzero_si512()

-// use asm to avoid compiler warning for unitialized local
+// use asm to avoid compiler warning for uninitialized local
 static inline __m512i mm512_neg1_fn()
 {
   __m512i v;
--- a/simd-utils/simd-64.h
+++ b/simd-utils/simd-64.h
@@ -10,7 +10,18 @@
 // This code is not used anywhere annd likely never will. It's intent was
 // to support 2 way parallel hashing using  MMX, or NEON for 32 bit hash
 // functions, but hasn't been implementedwas never implemented.
-// 
+//
+// MMX is being deprecated by compilers, all intrinsics will be converted to use SSE
+// registers and instructions. MMX will still be available using ASM.
+// For backward compatibility it's likely the compiler won't allow mixing explicit SSE
+// with promoted MMX. It is therefore preferable to implement all 64 bit vector code
+// using explicit SSE with the upper 64 bits being ignored.
+// Using SSE for 64 bit vectors will complicate loading arrays from memory which will
+// always load 128 bits. Odd indexes will need to be extracted from the upper 64 bits
+// of the even index SSE register. 
+// In most cases the exiting 4x32 SSE code can be used with 2 lanes being ignored
+// making ths file obsolete.
+

 #define v64_t                        __m64
 #define v64u32_t                     v64_t
--- a/simd-utils/simd-int.h
+++ b/simd-utils/simd-int.h
@@ -2,7 +2,7 @@
 #define SIMD_INT_H__ 1

 //TODO compile time test for byte order
-// be64 etc using HW bowap.
+// be64 etc using HW bswap.
 //
 // Endian byte swap
 #if defined(__x86_64__)
@@ -94,7 +94,7 @@ static inline uint16_t be16( const uint16_t u16 )
  return ( (uint16_t)(p[3]) ) + ( (uint16_t)(p[2]) <<  8 );
 }

-static inline uint32_t le162( const uint16_t u16 )
+static inline uint32_t le16( const uint16_t u16 )
 {
   const uint8_t *p = (uint8_t const *)&u16;
   return ( (uint16_t)(p[0]) ) + ( (uint16_t)(p[1]) <<  8 );
@@ -112,7 +112,7 @@ static inline uint32_t le162( const uint16_t u16 )
 #elif defined(__aarch64__)

 // Documentation is vague, ror exists but is ambiguous. Docs say it can
-// do 32 or 64 registers. Assuming that is architecture specific andcan
+// do 32 or 64 bit registers. Assuming that is architecture specific and can
 // only do 32 bit on 32 bit arch. Rarely used so not a big issue.
 static inline uint64_t ror64( uint64_t a, const int c )
 {
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -93,6 +93,8 @@
 #define v128_cmplt16( v1, v0 )      vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
 #define v128_cmplt8( v1, v0 )       vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) )

+#define v128_cmpeq_zero                vceqzq_u64
+
 // Logical bit shift
 #define v128_sl64                     vshlq_n_u64
 #define v128_sl32                     vshlq_n_u32
@@ -135,14 +137,14 @@
 #if defined(__ARM_FEATURE_SHA3)
  #define v128_xor3                   veor3q_u32
 #else
-  #define v128_xor3( v2, v1, v0 )     veorq_u32( v2, veorq_u32( v1, v0 ) )
+  #define v128_xor3( v2, v1, v0 )     veorq_u32( veorq_u32( v2, v1 ), v0 )
 #endif

 // v2 & v1 & v0
-#define v128_and3( v2, v1, v0 )       v128_and( v2, v128_and( v1, v0 ) )
+#define v128_and3( v2, v1, v0 )       v128_and( v128_and( v2, v1 ), v0 )

 // v2 | v1 | v0
-#define v128_or3( v2, v1, v0 )        v128_or( v2, v128_or( v1, v0 ) )
+#define v128_or3( v2, v1, v0 )        v128_or( v128_or( v2, v1 ), v0 )

 // v2 ^ ( ~v1 & v0 )
 #if defined(__ARM_FEATURE_SHA3)
@@ -178,6 +180,7 @@
 #define v128_unpacklo8(  v1, v0 )     vzip1q_u8(  v1, v0 )
 #define v128_unpackhi8(  v1, v0 )     vzip2q_u8(  v1, v0 )

+// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.

 // AES
 // consistent with Intel AES intrinsics, break up for optimizing
@@ -237,18 +240,15 @@ typedef union
 #define cast_v128u32( p )              (*((uint32x4_t*)(p)))
 #define castp_v128u32( p )             ((uint32x4_t*)(p))

-#define v128_zero                      v128_64( 0ull )
-
-#define v128_cmpeq_zero                vceqzq_u64
-
-#define v128_neg1                      v128_64( 0xffffffffffffffffull )
-
 // set1
 #define v128_64                        vmovq_n_u64
 #define v128_32                        vmovq_n_u32
 #define v128_16                        vmovq_n_u16
 #define v128_8                         vmovq_n_u8

+#define v128_zero                      v128_64( 0ull )
+#define v128_neg1                      v128_64( 0xffffffffffffffffull )
+
 #define v64_set32( u32_1, u32_0 ) \
  vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )

@@ -357,28 +357,23 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
                             ((uint16x8_t)(v)), c )

 #define v128_rol16( v, c ) \
-  ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
+  ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
               : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
                             ((uint16x8_t)(v)), c )

 #define v128_ror8( v, c ) \
-      vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)),  8-(c) ), \
+      vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
                  ((uint8x16_t)(v)), c )

 #define v128_rol8( v, c ) \
-      vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)),  8-(c) ), \
+      vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
                 ((uint8x16_t)(v)), c )

-
-// ( v1 ^ v0 ) >>> n 
+// ( v1 ^ v0 ) >>> c 
 #if defined(__ARM_FEATURE_SHA3)
-
-#define v128_ror64xor( v1, v0, n )  vxarq_u64( v1, v0, n ) 
-
+  #define v128_ror64xor( v1, v0, c )  vxarq_u64( v1, v0, c ) 
 #else
-
-#define v128_ror64xor( v1, v0, n )  v128_ror64( v128_xor( v1, v0 ), n ) 
-
+  #define v128_ror64xor( v1, v0, c )  v128_ror64( v128_xor( v1, v0 ), c ) 
 #endif

 #define v128_2ror64( v1, v0, c ) \
@@ -411,7 +406,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 v1 = vorrq_u32( v1, t1 ); \
 }

-#define v128_2rorx32( v1, v0, c ) \
+#define v128_2ror32( v1, v0, c ) \
 { \
 uint32x4_t t0 = vshlq_n_u32( v0, c ); \
 uint32x4_t t1 = vshlq_n_u32( v1, c ); \
@@ -444,9 +439,9 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 #define v128_lrev16            vrev32q_u16

 // aka bswap
-#define v128_qrev8             vrev64q_u8
-#define v128_lrev8             vrev32q_u8
-#define v128_wrev8             vrev16q_u8
+// #define v128_qrev8             vrev64q_u8
+// #define v128_lrev8             vrev32q_u8
+// #define v128_wrev8             vrev16q_u8

 // full vector rotation

@@ -471,9 +466,9 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
 #define v128_bswap16(v)        (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
 #define v128_bswap32(v)        (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
 #define v128_bswap64(v)        (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
-#define v128_bswap128(v)       (uint32x4_t)v128_swap64( v128_bswap64(v) )
+#define v128_bswap128(v)       (uint32x4_t)v128_rev64( v128_bswap64(v) )

-// Usefull for x86_64 but does nothing for ARM
+// Useful for x86_64 but does nothing for ARM
 #define v128_block_bswap32( dst, src ) \
 { \
   casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -542,7 +537,7 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )

 // Bitwise blend using vector mask, use only bytewise for compatibility
 // with x86_64.
-#define v128_blendv( v1, v0, mask )    vbslq_u32( mask, v1, v0 )
+#define v128_blendv( v1, v0, mask )    vbslq_u32( mask, v0, v1 )

 #endif   // __ARM_NEON
 #endif   // SIMD_NEON_H__
--- a/simd-utils/simd-sve.h
+++ b/simd-utils/simd-sve.h
@@ -0,0 +1,152 @@
+// Placeholder for now.
+//
+// This file will hold AArch64 SVE code, a replecement for NEON that uses
+// vector length agnostic instructions. This means the same code can be used
+// on CPUs with different SVE vector register lengths. This is not good for
+// vectorized hashing.
+// Optimum hash is sensitive to the vector register length with different code
+// used for different register sizes. On X86_64 the vector length is tied to
+// the CPU feature making it simple and efficient to handle different lengths
+// although it results in multiple executables. Theoretically SVE could use a
+// single executable for any vector length.
+//
+// With the SVE vector length only known at run time it results in run time
+// overhead to test the vector length. Theoretically it could be tested at
+// program loading and appropriate libraries loaded. However I don't know if
+// this can be done and if specified how to do it.
+//
+// SVE is not expected to be used for 128 bit vectors as it does not provide any
+// advantages over NEON. However, it may be implemented for testing purposes
+// because CPU with registers larger than 128 bits are currently very rare and
+// very expensive server class CPUs.
+//
+// However, 128 bit vectors also need to be supported with 256 bit registers.
+// This could be a challenge for un-predicated functions.
+//
+// N-way parallel hashing could be the best use of SVE, usimg the same code
+// for all vector lengths with the only variable being the number of lanes.
+// This will still require run time checking but should be lighter than
+// substituting functions.
+
+// Current approach is to hard code the length in these intrinsics and called
+// by existing length specific code.
+// define with sv_ prefix for generic use predicate provided by caller,
+// use sv<size>_ with hard coded predicate.
+// v<size>_ only if and when it's compatible with SSE & NEON
+
+// Many instructions have no predicate operand, how is VVL handled?
+// How does the CPU know how long the vector is and whether it spans
+// multiple registers without the predicate?
+
+// Also how does the predicate define the vector size? How to tell if inactive
+// high lanes are part of the vector or beyond its range.
+//
+// Some intructions may have an implied predicate by other arguments. 
+// TBL for example will only have shuffle indexes for active lanes.
+// However this is dependant on software being aware of register size.
+
+
+ 
+#if 0
+// #if defined USE_SV128
+// NEON needs to be disabled
+
+#define PRED128 0xffff
+#define PRED256 0xffffffff
+
+// Types should be transparent
+
+
+#define sv128u32_t  svuint32_t
+#define sv256u32_t  svuint32_t
+
+
+// load1
+
+
+// arithmetic
+
+// _z zero inactive elements, _x undefined inactive elements, _m inactive
+// elements from first arg. arg order only matters when _m used. Use _x.
+
+#define sv_add32( p, v1, v0 )         svadd_u32_x( p, v1, v0 )
+
+#define sv128_add32( v1, v0 )         svadd_u32_x( PRED128, v1, v0 )
+#define sv256_add32( v1, v0 )         svadd_u32_x( PRED256, v1, v0 )
+
+// Add integer to each element
+#define sv_addi32( p, v, i )           svadd_n_u32_x( p, v, i )
+
+
+
+// compare
+
+#define sv_cmpeq32( p, v1, v0 )       svcmpeq_u32( p, v1, v0 )
+
+#define sv128_cmpeq32( v1, v0 )       svcmpeq_u32( PRED128, v1, v0 )
+#define sv256_cmpeq32( v1, v0 )       svcmpeq_u32( PRED256, v1, v0 )
+
+
+// bit shift
+
+#define sv_sl32( v, c )              svlsl_n_u32_x( p, v, c )
+
+#define sv128_sl32( v, c )           svlsl_n_u32_x( PRED128, v, c )
+#define sv256_sl32( v, c )           svlsl_n_u32_x( PRED256, v, c )
+
+
+// logic
+
+#define sv_or( p, v1, v0 )           svorr_u32_x( p, v1, v0 )
+
+#define sv128_or( v1, v0 )           svorr_u32_x( PRED128, v1, v0 )
+#define sv256_or( v1, v0 )           svorr_u32_x( PRED256, v1, v0 )
+
+// ext used for alignr, and zip used for unpack have no predicate arg.
+// How is vector length determined? How are register sizes handled?
+// How are part registers handled?
+
+// alignr (ext)
+
+// unpack
+
+
+// AES
+
+// AES uses fixed 128 bit vectors, how does this work with larger registers?
+ 
+// set1
+
+#define sv128_32( n )      svdup_n_u32_x( PRED128, n )
+#define sv256_32( n )      svdup_n_u32_x( PRED256, n )
+
+// broadcast
+
+// svdup_lane has no predicate
+
+// constants
+
+
+// pointer cast
+
+
+// Bit rotation
+
+// No predication for shift instructions
+
+// Cross lane shuffles
+
+// Very limited shuffling, mostly svtbl which has no predicate and  uses
+// vector for the index.
+
+
+// endian byte swap
+
+
+#define sv128_bswap32(v)        svrevb_u32_x( p, v )
+
+
+// blend
+
+#enfif
+
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -16,7 +16,12 @@
 #include "miner.h"
 #include "simd-utils.h"

-#if defined(__aarch64__)
+// missing on mingw on arm
+#if defined(__aarch64__) && !defined(WIN32)
+#define ARM_AUXV 
+#endif
+
+#if defined(ARM_AUXV)
 // for arm's "cpuid"
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
@@ -169,17 +174,17 @@ static inline int cpu_fanpercent()
 }


-// CPUID
+// x86_64 CPUID

 // This list is incomplete, it only contains features of interest to cpuminer.
 // refer to http://en.wikipedia.org/wiki/CPUID for details.

 // AVX10 compatibility notes
 //
-// Notation used: AVX10i.[version]_[vectorwidth]
-// AVX10.1_512 is a rebranding of AVX512 and is effectively the AVX* superset
+// Display format: AVX10.[version]-[vectorwidth]
+// AVX10.1-512 is a rebranding of AVX512 and is effectively the AVX* superset
 // with full 512 bit vector support.
-// AVX10.2_256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
+// AVX10.2-256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
 // features applied only to 256 bit and 128 bit vectors.
 // Future AVX10 versions will add new instructions and features.

@@ -309,24 +314,25 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
 #endif
 }

-#elif defined(__aarch64__)
+#elif defined(ARM_AUXV)

 // Always test if HWCAP variable is defined in the kernel before attempting
 // to compile it. If not defined the feature can't be tested and won't be
 // included in the compile.
 // This can occur if compiling with an old kernel and a new CPU and could
 // result in a suboptimal build.
+// leaf and subleaf arguments are ignored.

 static inline void cpuid( unsigned int leaf, unsigned int subleaf,
                          unsigned int output[4] )
 {
 #if defined(AT_HWCAP)
-    output[0] = getauxval(AT_HWCAP);
+    output[0] = getauxval( AT_HWCAP );
 #else
    output[0] = 0;
 #endif
 #if defined(AT_HWCAP2)
-    output[1] = getauxval(AT_HWCAP2);
+    output[1] = getauxval( AT_HWCAP2 );
 #else
    output[1] = 0;
 #endif    
@@ -365,7 +371,8 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
 }   

 #else
-#define cpuid(leaf, subleaf, out) out[0] = 0;
+#define cpuid( leaf, subleaf, output ) \
+   output[0] = output[1] = output[2] = output[3] = 0;
 #endif

 static inline void cpu_getname(char *outbuf, size_t maxsz)
@@ -508,29 +515,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
 #endif
 */

-// GCC-14.1: the AVX512 macros are defined even when compiled with only
-// -mavx10.1-256, causing compile errors in AVX512 code. Only with
-// -mavx10.1-512 does it compile successfully.
-// __EVEX512__ is set only when compiled with -mavx10.1-512.
-// Adding -fno-evex512 doesn't help.
-// Building with -mapxf fails on a CPU without APX because configure can't
-// run its test program.
-/*
-#ifdef __AVX10_1__
-#warning "__AVX10_1__"
-#endif
-#ifdef __AVX10_1_256__
-#warning "__AVX10_1_256__"
-#endif
-#ifdef __AVX10_1_512__
-#warning "__AVX10_1_512__"
-#endif
-#ifdef __EVEX512__
-#warning "__EVEX512__"
-#endif
-*/
-
-
 // Typical display format: AVX10.[version]_[vectorlength], if vector length is
 // omitted 256 is the default.
 //    Ex: AVX10.1_512
@@ -646,7 +630,7 @@ static inline bool has_avx2()
 #endif
 }

-// Also ensure kernel supports feature
+// SVE vector width is determined at run time.
 static inline bool has_sve()
 {
 #if defined(__aarch64__) && defined(HWCAP_SVE)
@@ -780,6 +764,7 @@ static inline bool has_aes()
   }
   return false;
 #elif defined(__aarch64__) && defined(HWCAP_AES)
+   // NEON AES
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_AES;
@@ -825,6 +810,7 @@ static inline bool has_sha256()
   }
   return false;
 #elif defined(__aarch64__) && defined(HWCAP_SHA2)
+   // NEON SHA256
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_SHA2;
@@ -844,6 +830,7 @@ static inline bool has_sha512()
   }
   return false;
 #elif defined(__aarch64__) && defined(HWCAP_SHA512)
+   // NEON SHA512
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_SHA512;
@@ -856,6 +843,7 @@ static inline bool has_sha512()
 static inline bool has_sha3()
 {
 #if defined(__aarch64__) && defined(HWCAP_SHA3)
+   // NEON SHA3
   unsigned int cpu_info[4] = { 0 };
   cpuid( 0, 0, cpu_info );
   return cpu_info[0] & HWCAP_SHA3;
@@ -948,7 +936,7 @@ static inline bool has_avx10_512()
    return false;
 }

-// Includes 128 but may not include 512
+// Includes 128 but might not include 512
 static inline bool has_avx10_256()
 {
 #if defined(__x86_64__)
@@ -980,7 +968,7 @@ static inline unsigned int avx10_vector_length()
 // ARM SVE vector register length
 static inline int sve_vector_length()
 {
-#if defined(__aarch64__)
+#if defined(ARM_AUXV)
   if ( has_sve() )
      return prctl( (PR_SVE_GET_VL & PR_SVE_VL_LEN_MASK) * 8 );
 #endif
--- a/util.c
+++ b/util.c
@@ -1414,6 +1414,12 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
 		int n;
 		fd_set wd;

+// Something nasty going on With Windows on aarch64. This hack prevents
+// corrupting the sctx pointer. This only works if placed inside the while loop.
+#if defined(__aarch64__) && defined(WIN32) && defined(ARM_WIN_HACK)
+      printf("");
+#endif
+
 		FD_ZERO( &wd );
 		FD_SET( sctx->sock, &wd );
 		if ( select( (int) ( sctx->sock + 1 ), NULL, &wd, NULL, &timeout ) < 1 )
Author	SHA1	Message	Date
Jay D Dee	2b1037a7c7	v24.7	2024-12-16 19:17:19 -05:00
Jay D Dee	06624a0ff2	v24.6	2024-12-08 11:14:08 -05:00
Jay D Dee	8e91bfbe19	v24.5	2024-09-13 14:14:57 -04:00