v24.3

2026-02-23 08:53:08 +00:00 · 2024-05-28 18:20:19 -04:00
parent 042d13d1e1
commit c47c4a8885
36 changed files with 481 additions and 471 deletions
--- a/algo/blake/blake2b-hash.c
+++ b/algo/blake/blake2b-hash.c
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
 }

-static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
+static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
 {  
   __m512i v[16], m[16];

@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
   ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
 }

-int blake2b_8way_init( blake2b_8way_ctx *ctx )
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
 {
   size_t i;

@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
 }


-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                          size_t inlen )
 {
   __m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
         ctx->t[0] += ctx->c;
         if ( ctx->t[0] < ctx->c )
            ctx->t[1]++;
-         blake2b_8way_compress( ctx, 0 );
+         blake2b_8x64_compress( ctx, 0 );
         ctx->c = 0;
      }
      ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
   }
 }

-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
 {
   size_t c;
   c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
      ctx->c += 8;
   }

-   blake2b_8way_compress( ctx, 1 );           // final block flag = 1
+   blake2b_8x64_compress( ctx, 1 );           // final block flag = 1

   casti_m512i( out, 0 ) = ctx->h[0];
   casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
 };
 */

-static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
+static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
 {
 	__m256i v[16], m[16];

@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
   ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
 }

-int blake2b_4way_init( blake2b_4way_ctx *ctx ) 
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx ) 
 {
 	size_t i;

@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
 	return 0;
 }

-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                          size_t inlen ) 
 {
   __m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
 			ctx->t[0] += ctx->c;
 			if ( ctx->t[0] < ctx->c )
 				ctx->t[1]++;
-			blake2b_4way_compress( ctx, 0 );
+			blake2b_4x64_compress( ctx, 0 );
 			ctx->c = 0;
 		}
      ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
   }
 }

-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
 {
 	size_t c;
   c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
      ctx->c += 8;
   }

-   blake2b_4way_compress( ctx, 1 );           // final block flag = 1
+   blake2b_4x64_compress( ctx, 1 );           // final block flag = 1

   casti_m256i( out, 0 ) = ctx->h[0];
   casti_m256i( out, 1 ) = ctx->h[1];
--- a/algo/blake/blake2b-hash.h
+++ b/algo/blake/blake2b-hash.h
@@ -1,6 +1,6 @@
 #pragma once
-#ifndef __BLAKE2B_HASH_4WAY_H__
-#define __BLAKE2B_HASH_4WAY_H__
+#ifndef BLAKE2B_HASH_4WAY_H__
+#define BLAKE2B_HASH_4WAY_H__

 #include "simd-utils.h"
 #include <stddef.h>
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
   uint64_t t[2];  // total number of bytes
   size_t c;       // pointer for b[]
   size_t outlen;  // digest size
-} blake2b_8way_ctx;
+} blake2b_8x64_ctx;

-int blake2b_8way_init( blake2b_8way_ctx *ctx );
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                          size_t inlen );
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
+
+#define blake2b_8way_ctx         blake2b_8x64_ctx
+#define blake2b_8way_init        blake2b_8x64_init
+#define blake2b_8way_update      blake2b_8x64_update
+#define blake2b_8way_final       blake2b_8x64_final

 #endif

@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
 	uint64_t t[2];  // total number of bytes
 	size_t c;       // pointer for b[]
 	size_t outlen;  // digest size
-} blake2b_4way_ctx;
+} blake2b_4x64_ctx;

-int blake2b_4way_init( blake2b_4way_ctx *ctx );
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                          size_t inlen );
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
+
+#define blake2b_4way_ctx         blake2b_4x64_ctx
+#define blake2b_4way_init        blake2b_4x64_init
+#define blake2b_4way_update      blake2b_4x64_update
+#define blake2b_4way_final       blake2b_4x64_final

 #endif

--- a/algo/blake/blake2s-hash.h
+++ b/algo/blake/blake2s-hash.h
@@ -11,8 +11,8 @@
 * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 //#pragma once
-#ifndef __BLAKE2S_HASH_4WAY_H__
-#define __BLAKE2S_HASH_4WAY_H__ 1
+#ifndef BLAKE2S_HASH_4WAY_H__
+#define BLAKE2S_HASH_4WAY_H__ 1

 #if defined(__SSE2__) || defined(__ARM_NEON)

--- a/algo/groestl/myrgr-gate.c
+++ b/algo/groestl/myrgr-gate.c
@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
  init_myrgr_ctx();
  gate->scanhash  = (void*)&scanhash_myriad;
  gate->hash      = (void*)&myriad_hash;
-  gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
+  gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
 #endif
  return true;
 };
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =

 #endif // AVX2 else SSE2

-static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
-    return ( w >> c ) | ( w << ( 64 - c ) );
-}
-
 #define G( r, i, a, b, c, d ) \
 { \
    a = a + b; \
--- a/algo/m7m/m7m.c
+++ b/algo/m7m/m7m.c
@@ -306,7 +306,7 @@ bool register_m7m_algo( algo_gate_t *gate )
  applog( LOG_ERR, "M7M algo is not supported on MacOS");
  return false;
 #else  
-  gate->optimizations = SHA_OPT;
+  gate->optimizations = SHA256_OPT;
  init_m7m_ctx();
  gate->scanhash              = (void*)&scanhash_m7m_hash;
  gate->build_stratum_request = (void*)&std_be_build_stratum_request;
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
   int thr_id = mythr->id;  // thr_id arg is deprecated

   // we need big-endian data...
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
-   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
-   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
+   casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
+   casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
+   casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
+   casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
   intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
        edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );

@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   int thr_id = mythr->id;  // thr_id arg is deprecated

   // we need big-endian data...
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
-   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
-   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
+   casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
+   casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
+   casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
+   casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
   intrlv_8x32( vdata, edata, edata, edata, edata,
                       edata, edata, edata, edata, 1024 );

--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }

 bool register_lbry_algo( algo_gate_t* gate )
 {
-//  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #if defined (LBRY_16WAY)
  gate->scanhash              = (void*)&scanhash_lbry_16way;
  gate->hash                  = (void*)&lbry_16way_hash;
@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
 #else 
  gate->scanhash              = (void*)&scanhash_lbry;
  gate->hash                  = (void*)&lbry_hash;
-  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
 #endif
  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
  gate->build_extraheader     = (void*)&lbry_build_extraheader;
--- a/algo/scrypt/scrypt-core-4way.c
+++ b/algo/scrypt/scrypt-core-4way.c
@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
         v128_ovly v;    
         for ( int l = 0; l < 4; l++ )
            v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
-         X[i] = v128_xor( X[i], v.m128 );
+         X[i] = v128_xor( X[i], v.v128 );
      }

      xor_salsa8_4way( &X[ 0], &X[16] );
@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
   // X2 is shuffled left 2 (swap_64)    { xd, x8, x7, x2 }
   // X3 is shuffled left 3 (ror_1x32)   { xc, xb, x6, x1 }

-   y[0].m128 = X0;
-   y[1].m128 = X1;
-   y[2].m128 = X2;
-   y[3].m128 = X3;
+   y[0].v128 = X0;
+   y[1].v128 = X1;
+   y[2].v128 = X2;
+   y[3].v128 = X3;

   z[0].u32[0] = y[0].u32[0];
   z[0].u32[3] = y[1].u32[0];
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
   z[3].u32[1] = y[2].u32[3];
   z[3].u32[0] = y[3].u32[3];

-   B[0] = v128_add32( B[0], z[0].m128 );
-   B[1] = v128_add32( B[1], z[1].m128 );
-   B[2] = v128_add32( B[2], z[2].m128 );
-   B[3] = v128_add32( B[3], z[3].m128 );
+   B[0] = v128_add32( B[0], z[0].v128 );
+   B[1] = v128_add32( B[1], z[1].v128 );
+   B[2] = v128_add32( B[2], z[2].v128 );
+   B[3] = v128_add32( B[3], z[3].v128 );

 #endif

@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
 /*
   v128_ovly ya[4], za[4], yb[4], zb[4];

-   ya[0].m128 = XA[0];
-   yb[0].m128 = XB[0];
-   ya[1].m128 = XA[1];
-   yb[1].m128 = XB[1];
-   ya[2].m128 = XA[2];
-   yb[2].m128 = XB[2];
-   ya[3].m128 = XA[3];
-   yb[3].m128 = XB[3];
+   ya[0].v128 = XA[0];
+   yb[0].v128 = XB[0];
+   ya[1].v128 = XA[1];
+   yb[1].v128 = XB[1];
+   ya[2].v128 = XA[2];
+   yb[2].v128 = XB[2];
+   ya[3].v128 = XA[3];
+   yb[3].v128 = XB[3];

   za[0].u32[0] = ya[0].u32[0];
   zb[0].u32[0] = yb[0].u32[0];
@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
   za[3].u32[3] = ya[0].u32[3];
   zb[3].u32[3] = yb[0].u32[3];

-   XA[0] = za[0].m128;
-   XB[0] = zb[0].m128;
-   XA[1] = za[1].m128;
-   XB[1] = zb[1].m128;
-   XA[2] = za[2].m128;
-   XB[2] = zb[2].m128;
-   XA[3] = za[3].m128;
-   XB[3] = zb[3].m128;
+   XA[0] = za[0].v128;
+   XB[0] = zb[0].v128;
+   XA[1] = za[1].v128;
+   XB[1] = zb[1].v128;
+   XA[2] = za[2].v128;
+   XB[2] = zb[2].v128;
+   XA[3] = za[3].v128;
+   XB[3] = zb[3].v128;
 */
 }

@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
 /*  
   v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];

-   ya[0].m128 = XA[0];
-   yb[0].m128 = XB[0];
-   yc[0].m128 = XC[0];
-   ya[1].m128 = XA[1];
-   yb[1].m128 = XB[1];
-   yc[1].m128 = XC[1];
-   ya[2].m128 = XA[2];
-   yb[2].m128 = XB[2];
-   yc[2].m128 = XC[2];
-   ya[3].m128 = XA[3];
-   yb[3].m128 = XB[3];
-   yc[3].m128 = XC[3];
+   ya[0].v128 = XA[0];
+   yb[0].v128 = XB[0];
+   yc[0].v128 = XC[0];
+   ya[1].v128 = XA[1];
+   yb[1].v128 = XB[1];
+   yc[1].v128 = XC[1];
+   ya[2].v128 = XA[2];
+   yb[2].v128 = XB[2];
+   yc[2].v128 = XC[2];
+   ya[3].v128 = XA[3];
+   yb[3].v128 = XB[3];
+   yc[3].v128 = XC[3];

   za[0].u32[0] = ya[0].u32[0];
   zb[0].u32[0] = yb[0].u32[0];
@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
   zb[3].u32[3] = yb[0].u32[3];
   zc[3].u32[3] = yc[0].u32[3];

-   XA[0] = za[0].m128;
-   XB[0] = zb[0].m128;
-   XC[0] = zc[0].m128;
-   XA[1] = za[1].m128;
-   XB[1] = zb[1].m128;
-   XC[1] = zc[1].m128;
-   XA[2] = za[2].m128;
-   XB[2] = zb[2].m128;
-   XC[2] = zc[2].m128;
-   XA[3] = za[3].m128;
-   XB[3] = zb[3].m128;
-   XC[3] = zc[3].m128;
+   XA[0] = za[0].v128;
+   XB[0] = zb[0].v128;
+   XC[0] = zc[0].v128;
+   XA[1] = za[1].v128;
+   XB[1] = zb[1].v128;
+   XC[1] = zc[1].v128;
+   XA[2] = za[2].v128;
+   XB[2] = zb[2].v128;
+   XC[2] = zc[2].v128;
+   XA[3] = za[3].v128;
+   XB[3] = zb[3].v128;
+   XC[3] = zc[3].v128;
 */
 }   

@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
            xf = (B[15] ^= C[15]);

   
-   #define ROL32( a, c )    ror32( a, c )
+   #define ROL32( a, c )    rol32( a, c )
   #define ADD32( a, b )    ( (a)+(b) )
   #define XOR( a, b )      ( (a)^(b) )

--- a/algo/scrypt/scrypt.c
+++ b/algo/scrypt/scrypt.c
@@ -1481,7 +1481,7 @@ bool scrypt_miner_thread_init( int thr_id )
 bool register_scrypt_algo( algo_gate_t* gate )
 {
 #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
-   gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
+   gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
 #else
   gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
 #endif
--- a/algo/sha/sha256d.c
+++ b/algo/sha/sha256d.c
@@ -8,14 +8,14 @@ void sha256d( void *hash, const void *data, int len )
 }
 bool register_sha256d_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
 #if defined(SHA256D_16WAY)
   gate->scanhash = (void*)&scanhash_sha256d_16way;
 #elif defined(SHA256D_SHA)
-   gate->optimizations = SHA_OPT;
+   gate->optimizations = SSE2_OPT | SHA256_OPT;
   gate->scanhash = (void*)&scanhash_sha256d_sha;
 #elif defined(SHA256D_NEON_SHA2)
-   gate->optimizations = SHA_OPT;
+   gate->optimizations = NEON_OPT | SHA256_OPT;
   gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
 #elif defined(SHA256D_8WAY)
   gate->scanhash = (void*)&scanhash_sha256d_8way;
--- a/algo/sha/sha256dt.c
+++ b/algo/sha/sha256dt.c
@@ -500,10 +500,10 @@ bool register_sha256dt_algo( algo_gate_t* gate )
 #if defined(SHA256DT_16X32)
    gate->scanhash = (void*)&scanhash_sha256dt_16x32;
 #elif defined(SHA256DT_X86_SHA256)
-    gate->optimizations = SHA_OPT;
+    gate->optimizations = SSE2_OPT | SHA256_OPT;
    gate->scanhash = (void*)&scanhash_sha256dt_x86_x2sha;    
 #elif defined(SHA256DT_NEON_SHA256)
-    gate->optimizations = SHA_OPT;
+    gate->optimizations = NEON_OPT | SHA256_OPT;
    gate->scanhash = (void*)&scanhash_sha256dt_neon_x2sha;
 #elif defined(SHA256DT_8X32)
    gate->scanhash = (void*)&scanhash_sha256dt_8x32;
--- a/algo/sha/sha256t-gate.c
+++ b/algo/sha/sha256t-gate.c
@@ -6,9 +6,10 @@ bool register_sha256t_algo( algo_gate_t* gate )
 #if defined(SHA256T_16WAY)
    gate->scanhash   = (void*)&scanhash_sha256t_16way;
 #elif defined(SHA256T_SHA)
-    gate->optimizations = SHA_OPT;
+    gate->optimizations = SSE2_OPT | SHA256_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t_sha;
 #elif defined(SHA256T_NEON_SHA2)
+    gate->optimizations = NEON_OPT | SHA256_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t_neon_sha2;
 #elif defined(SHA256T_8WAY)
    gate->scanhash   = (void*)&scanhash_sha256t_8way;
@@ -28,7 +29,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
    gate->scanhash   = (void*)&scanhash_sha256q_16way;
    gate->hash       = (void*)&sha256q_16way_hash;
 //#elif defined(SHA256T_SHA)
-//    gate->optimizations = SHA_OPT;
+//    gate->optimizations = SHA256_OPT;
 //    gate->scanhash   = (void*)&scanhash_sha256q;
 //    gate->hash       = (void*)&sha256q_hash;
 #elif defined(SHA256T_8WAY)
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -71,12 +71,13 @@ static const uint64_t K512[80] =

 // SHA-512 implemented using SHA512 CPU extension.

-// Experimental. Not tested. Not reviewed. Compile tested only.
+// Experimental. Not supported. Not tested. Not reviewed. Compile tested only.
+// Modelled after noloader sha256 implementation, replacing 4x32 bit
+// instructions with equivalent 4x64 bit instructions and increasing rounds
+// to 80.

 // Needs GCC-14 for compilation.
 // Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution.
-// Modelled after noloader sha256 implementation.
-

 void sha512_opt_transform_be( uint64_t *state_out, const void *input,
                              const uint64_t *state_in )
@@ -571,6 +572,20 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,

 #endif

+/*
+#if defined(__ARM_FEATURE_NEON) && defined(__ARM_FEATURE_SHA512)
+
+uint64x2_t sha512_compile_test( uint64x2_t test )
+{
+   test = vsha512hq_u64( test, test, test );
+   test = vsha512h2q_u64( test, test, test );
+   test = vsha512su0q_u64( test, test );
+   test = vsha512su1q_u64( test, test, test );
+   return test;
+}
+
+#endif
+*/

 #if defined(SIMD512)

--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -8,15 +8,15 @@ bool register_skein_algo( algo_gate_t* gate )
    gate->scanhash  = (void*)&scanhash_skein_8way;
    gate->hash      = (void*)&skeinhash_8way;
 #elif defined(SKEIN_4WAY)
-    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
+    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
    gate->scanhash  = (void*)&scanhash_skein_4way;
    gate->hash      = (void*)&skeinhash_4way;
 #elif defined(SKEIN_2WAY)
-    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
+    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
    gate->scanhash  = (void*)&scanhash_skein_2x64;
    gate->hash      = (void*)&skeinhash_2x64;
 #else
-    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
+    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT;
    gate->scanhash  = (void*)&scanhash_skein;
    gate->hash      = (void*)&skeinhash;
 #endif
--- a/algo/sm3/sm3-hash-4way.c
+++ b/algo/sm3/sm3-hash-4way.c
@@ -240,10 +240,10 @@ void sm3_8way_close( void *cc, void *dst )

 #if defined(__SSE2__)

-#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x,  9 ), \
-                                               mm128_rol_32( x, 17 ) ) ) 
-#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
-                                               mm128_rol_32( x, 23 ) ) ) 
+#define P0(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x,  9 ), \
+                                               v128_rol32( x, 17 ) ) )
+#define P1(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 15 ), \
+                                               v128_rol32( x, 23 ) ) )

 #define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
 #define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
@@ -273,13 +273,13 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
   int j;

   for ( j = 0; j < 16; j++ )
-      W[j] = mm128_bswap_32( block[j] );
+      W[j] = v128_bswap32( block[j] );

   for ( j = 16; j < 68; j++ )
      W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
                                                              W[ j-9 ] ),
-                                               mm128_rol_32( W[ j-3 ], 15 ) ) ),
-                            _mm_xor_si128( mm128_rol_32( W[ j-13 ], 7 ),
+                                               v128_rol32( W[ j-3 ], 15 ) ) ),
+                            _mm_xor_si128( v128_rol32( W[ j-13 ], 7 ),
                                           W[ j-6 ] ) );

   for( j = 0; j < 64; j++ )
@@ -288,19 +288,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
   T = _mm_set1_epi32( 0x79CC4519UL );
   for( j =0; j < 16; j++ )
   {
-      SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
+      SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
                                      mm128_rol_var_32( T, j ) ), 7 );
-      SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
+      SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
      TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
                                          SS2 ), W1[j] );
      TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
                                          SS1 ), W[j] );
      D = C;
-      C = mm128_rol_32( B, 9 );
+      C = v128_rol32( B, 9 );
      B = A;
      A = TT1;
      H = G;
-      G = mm128_rol_32( F, 19 );
+      G = v128_rol32( F, 19 );
      F = E;
      E = P0( TT2 );
   }
@@ -308,19 +308,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
   T = _mm_set1_epi32( 0x7A879D8AUL );
   for( j =16; j < 64; j++ )
   {
-      SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
+      SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ),
                                      mm128_rol_var_32( T, j&31 ) ), 7 );
-      SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
+      SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) );
      TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ), 
                                          SS2 ), W1[j] );
      TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
                                          SS1 ), W[j] );
      D = C;
-      C = mm128_rol_32( B, 9 );
+      C = v128_rol32( B, 9 );
      B = A;
      A = TT1;
      H = G;
-      G = mm128_rol_32( F, 19 );
+      G = v128_rol32( F, 19 );
      F = E;
      E = P0( TT2 );
   }
@@ -408,14 +408,14 @@ void sm3_4way_close( void *cc, void *dst )
      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
   }

-   count[0] = mm128_bswap_32(
+   count[0] = v128_bswap32(
                  _mm_set1_epi32( ctx->nblocks >> 23 ) );
-   count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
-                                              ( ctx->num     << 3 ) ) );
+   count[1] = v128_bswap32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
+                                            ( ctx->num     << 3 ) ) );
   sm3_4way_compress( ctx->digest, block );

   for ( i = 0; i < 8 ; i++ )
-     hash[i] = mm128_bswap_32( ctx->digest[i] );
+     hash[i] = v128_bswap32( ctx->digest[i] );
 }

 #endif
--- a/algo/verthash/Verthash.c
+++ b/algo/verthash/Verthash.c
@@ -137,53 +137,8 @@ void verthash_info_free(verthash_info_t* info)
 #define VH_N_INDEXES 4096
 #define VH_BYTE_ALIGNMENT 16

-static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
-{
-    return (a ^ b) * 0x1000193;
-}
+#define fnv1a( a, b )           ( ( (a) ^ (b) ) * 0x1000193 )

-#if 0
-static void rotate_indexes( uint32_t *p )
-{
-#if defined(__AVX2__)
-
-   for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
-   {
-      __m256i *px = (__m256i*)p + x;
-
-      px[0] = mm256_rol_32( px[0], 1 );
-      px[1] = mm256_rol_32( px[1], 1 );
-      px[2] = mm256_rol_32( px[2], 1 );
-      px[3] = mm256_rol_32( px[3], 1 );
-      px[4] = mm256_rol_32( px[4], 1 );
-      px[5] = mm256_rol_32( px[5], 1 );
-      px[6] = mm256_rol_32( px[6], 1 );
-      px[7] = mm256_rol_32( px[7], 1 );
-   }
-
-#else
-
-   for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
-   {
-      __m128i *px = (__m128i*)p0_index + x;
-
-      px[0] = mm128_rol_32( px[0], 1 );
-      px[1] = mm128_rol_32( px[1], 1 );
-      px[2] = mm128_rol_32( px[2], 1 );
-      px[3] = mm128_rol_32( px[3], 1 );
-      px[4] = mm128_rol_32( px[4], 1 );
-      px[5] = mm128_rol_32( px[5], 1 );
-      px[6] = mm128_rol_32( px[6], 1 );
-      px[7] = mm128_rol_32( px[7], 1 );
-   }
-
-#endif
-/*   
-   for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
-      p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
-*/
-}
-#endif
// Vectorized and targeted version of fnv1a
 #if defined (__AVX2__)        

@@ -191,7 +146,7 @@ static void rotate_indexes( uint32_t *p )
   *(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
                                 *(__m256i*)hash, *(__m256i*)blob_off ), k );

-#elif defined(__SSE4_1__)  || defined(__ARM_NEON)
+#elif defined(__SSE4_1__) || defined(__ARM_NEON)

 #define MULXOR \
   casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -229,7 +184,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
   MULXOR; \
 }

-// subsequent passes rotate by r on demand, no need for mass rotate
+// subsequent passes rotate by r
 #define ROUND_r( r ) \
 for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
 { \
@@ -243,8 +198,8 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
 void verthash_hash( const void *blob_bytes, const size_t blob_size,
                    const void *input, void *output )
 {
-    uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
    uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
+    uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (32)));
    const uint32_t *blob = (const uint32_t*)blob_bytes;
    uint32_t accumulator = 0x811c9dc5;
    const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
--- a/algo/verthash/verthash-gate.c
+++ b/algo/verthash/verthash-gate.c
@@ -91,8 +91,8 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
 int scanhash_verthash( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t edata[20] __attribute__((aligned(64)));
   uint32_t hash[8] __attribute__((aligned(64)));
+   uint32_t edata[20] __attribute__((aligned(32)));
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -101,9 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;

-   for (int i = 0; i < 20; i++)
-         edata[i] = bswap_32( pdata[i] );
-//   v128_bswap32_80( edata, pdata );
+   v128_bswap32_80( edata, pdata );
   verthash_sha3_512_prehash_72( edata );

   do
--- a/algo/x16/minotaur.c
+++ b/algo/x16/minotaur.c
@@ -318,7 +318,7 @@ bool register_minotaur_algo( algo_gate_t* gate )
  gate->hash              = (void*)&minotaur_hash;
  gate->miner_thread_init = (void*)&initialize_torture_garden;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
-  if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
+  if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA256_OPT;
  return true;
 };

--- a/algo/x22/x22i-gate.c
+++ b/algo/x22/x22i-gate.c
@@ -31,7 +31,7 @@ bool register_x22i_algo( algo_gate_t* gate )

 #endif

-  gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT
+  gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT
                      | AVX512_OPT | VAES_OPT | NEON_OPT;
  return true;
 };
@@ -48,7 +48,7 @@ bool register_x25x_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x25x;
  gate->hash      = (void*)&x25x_hash;
 #endif
-  gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
+  gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT |
                        AVX512_OPT | VAES_OPT | NEON_OPT;
  InitializeSWIFFTX();
  return true;
--- a/algo/yespower/yescrypt-r8g.c
+++ b/algo/yespower/yescrypt-r8g.c
@@ -71,7 +71,7 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce,

 bool register_yescryptr8g_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
+  gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
  gate->scanhash      = (void*)&scanhash_yespower_r8g;
 #if (__SSE2__) || defined(__aarch64__)
  gate->hash          = (void*)&yespower_hash;
--- a/algo/yespower/yespower-gate.c
+++ b/algo/yespower/yespower-gate.c
@@ -162,7 +162,7 @@ bool register_yespower_algo( algo_gate_t* gate )
  if ( yespower_params.pers )
     applog( LOG_NOTICE,"Key= \"%s\"\n", yespower_params.pers );

-  gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
+  gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
  gate->scanhash      = (void*)&scanhash_yespower;
 #if (__SSE2__) || defined(__aarch64__)
  gate->hash          = (void*)&yespower_hash;
@@ -180,7 +180,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
  yespower_params.r       = 16;
  yespower_params.pers    = NULL;
  yespower_params.perslen = 0;
-  gate->optimizations     = SSE2_OPT | SHA_OPT | NEON_OPT;
+  gate->optimizations     = SSE2_OPT | SHA256_OPT | NEON_OPT;
  gate->scanhash          = (void*)&scanhash_yespower;
 #if (__SSE2__) || defined(__aarch64__)
  gate->hash              = (void*)&yespower_hash;
@@ -195,7 +195,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )

 bool register_yescrypt_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
+   gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
 #if (__SSE2__) || defined(__aarch64__)
   gate->hash       = (void*)&yespower_hash;
@@ -233,7 +233,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )

 bool register_yescryptr8_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
+   gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
   gate->scanhash      = (void*)&scanhash_yespower;
 #if (__SSE2__) || defined(__aarch64__)
   gate->hash          = (void*)&yespower_hash;
@@ -251,7 +251,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate )

 bool register_yescryptr16_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
+   gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
 #if (__SSE2__) || defined(__aarch64__)
   gate->hash          = (void*)&yespower_hash;
@@ -269,7 +269,7 @@ bool register_yescryptr16_algo( algo_gate_t* gate )

 bool register_yescryptr32_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
+   gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
 #if (__SSE2__) || defined(__aarch64__)
   gate->hash          = (void*)&yespower_hash;