Compare commits


2 Commits
v25.4 ... v25.6

Author SHA1 Message Date
Jay D Dee
12480a3ea5 v25.6 2025-07-20 19:43:10 -04:00
Jay D Dee
aa47e880d5 v25.5 2025-07-09 01:32:38 -04:00
24 changed files with 706 additions and 801 deletions

View File

@@ -54,9 +54,9 @@ Supported Algorithms
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250
argon2d500
argon2d1000
argon2d4096
blake Blake-256
blake2b Blake2-512

View File

@@ -75,6 +75,17 @@ If not what makes it happen or not happen?
Change Log
----------
v25.6
Added argon2d1000, argon2d16000 algos.
Target-specific AES optimizations improve shavite for ARM64 & x86_64.
v25.5
x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
x86_64: Faster sha256d, sha256dt for AVX2 & AVX512.
Other small bug fixes.
v25.4
x86_64: improved handling of vector constants used for byte permutations.

View File

@@ -297,6 +297,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D1000: rc = register_argon2d1000_algo ( gate ); break;
case ALGO_ARGON2D16000: rc = register_argon2d16000_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;

View File

@@ -172,8 +172,11 @@ void ( *set_work_data_endian ) ( struct work* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
// Deprecated
set_t optimizations;
int ( *get_work_data_size ) ();
int ntime_index;
int nbits_index;
int nonce_index; // use with caution, see warning below
@@ -274,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
// OpenSSL sha256 deprecated
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
bool std_le_work_decode( struct work *work );
bool std_be_work_decode( struct work *work );

View File

@@ -6,6 +6,38 @@ static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Inpu
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
// generic, works with most variations of argon2d
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const int thr_id = mythr->id;
const uint32_t first_nonce = (const uint32_t)pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;
v128_bswap32_80( edata, pdata );
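// The header is byte-swapped once up front, so the loop below hashes the
// swapped copy and only rewrites the nonce; a winning nonce is swapped back
// to the header's original byte order before being stored in pdata.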
do
{
edata[19] = nonce;
algo_gate.hash( hash, edata, thr_id );
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash, mythr );
}
nonce++;
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
void argon2d250_hash( void *output, const void *input )
{
argon2_context context;
@@ -32,41 +64,10 @@ void argon2d250_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
swab32_array( edata, pdata, 20 );
do {
be32enc(&edata[19], nonce);
argon2d250_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
bool register_argon2d250_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d250;
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d250_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
@@ -97,43 +98,78 @@ void argon2d500_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const int thr_id = mythr->id;
const uint32_t first_nonce = (const uint32_t)pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;
v128_bswap32_80( edata, pdata );
do
{
edata[19] = nonce;
argon2d500_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash, mythr );
}
nonce++;
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
bool register_argon2d500_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d500;
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d500_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
void argon2d1000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 1000; // Memory in KiB (1MB)
context.lanes = 8; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 2; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
bool register_argon2d1000_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d1000_hash;
opt_target_factor = 65536.0;
return true;
}
void argon2d16000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 16000; // Memory in KiB (~16 MB)
context.lanes = 1; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 1; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
bool register_argon2d16000_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d16000_hash;
opt_target_factor = 65536.0;
return true;
}
@@ -148,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t n = first_nonce;
const int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
uint32_t t_cost = 1; // 1 iteration
uint32_t m_cost = 4096; // use 4MB
uint32_t parallelism = 1; // 1 thread, 2 lanes
@@ -176,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
bool register_argon2d4096_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d4096;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT |NEON_OPT;
opt_target_factor = 65536.0;
return true;
}

View File

@@ -4,22 +4,27 @@
#include "algo-gate-api.h"
#include <stdint.h>
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d250_algo( algo_gate_t* gate );
void argon2d250_hash( void *state, const void *input );
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d500_algo( algo_gate_t* gate );
void argon2d500_hash( void *state, const void *input );
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Zero Dynamics Cash: version = 0x10, m_cost = 1000.
bool register_argon2d1000_algo( algo_gate_t* gate );
void argon2d1000_hash( void *state, const void *input );
bool register_argon2d16000_algo( algo_gate_t* gate );
void argon2d16000_hash( void *state, const void *input );
// Unitus: version = 0x13, m_cost = 4096.
bool register_argon2d4096_algo( algo_gate_t* gate );

View File

@@ -441,57 +441,6 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
#if defined(VL256)
// AVX512 or AVX10-256
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )
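// 0xca is the truth table of CH(x,y,z) = (x & y) | (~x & z) and 0xe8 that of
// MAJ(x,y,z) = (x & y) | (x & z) | (y & z); the immediate is indexed by the
// input bits with the first operand as the most significant bit.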
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
W[ i ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, j ); \
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, j ); \
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, j ); \
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, j ); \
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, j ); \
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, j ); \
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, j ); \
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, j ); \
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, j ); \
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, j ); \
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, j ); \
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, j ); \
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j );
// Not used with AVX512, needed to satisfy the compiler
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
}
#else // AVX2
#define CHx(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -503,61 +452,58 @@ do { \
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
H = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
__m256i T = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
D = _mm256_add_epi32( D, H ); \
H = _mm256_add_epi32( H, T ); \
}
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
__m256i T1 = BSG2_1x( E ); \
{ \
__m256i T1 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
H = _mm256_add_epi32( H, BSG2_1x( E ) ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
H = _mm256_add_epi32( H, T1 ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
D = _mm256_add_epi32( D, H ); \
H = _mm256_add_epi32( H, T2 ); \
}
// read Y_xor_Z, update X_xor_Y
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
// start with toc initialized to y^z: toc = B ^ C
// start with toc initialized to y^z, toc = B ^ C for the first round.
// First round reads toc as Y_xor_Z and saves X_xor_Y as tic.
// Second round reads tic as Y_xor_Z and saves X_xor_Y as toc.
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
do { \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
{ \
__m256i T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
W[ i0 ] ); \
__m256i T1 = BSG2_1x( E ); \
H = _mm256_add_epi32( H, BSG2_1x( E ) ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \
T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
H = _mm256_add_epi32( H, T1 ); \
D = _mm256_add_epi32( D, H ); \
H = _mm256_add_epi32( H, T2 ); \
\
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
W[ (i1) ] ); \
T1 = BSG2_1x( D ); \
G = _mm256_add_epi32( G, BSG2_1x( D ) ); \
T2 = BSG2_0x( H ); \
T0 = _mm256_add_epi32( T0, CHx( D, E, F ) ); \
T1 = _mm256_add_epi32( T1, G ); \
T1 = _mm256_add_epi32( T1, CHx( D, E, F ) ); \
T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
C = _mm256_add_epi32( C, T1 ); \
G = _mm256_add_epi32( T1, T2 ); \
} while (0)
G = _mm256_add_epi32( G, T1 ); \
C = _mm256_add_epi32( C, G ); \
G = _mm256_add_epi32( G, T2 ); \
}
#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
{ \
@@ -572,8 +518,6 @@ do { \
SHA256_8WAY_2ROUNDS( C, D, E, F, G, H, A, B, 14, 15, j ); \
}
#endif // AVX512VL else AVX2
static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
const __m256i *in ) \
{
@@ -650,9 +594,7 @@ void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X,
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
@@ -692,9 +634,7 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
// round 3 part 2, add nonces
A = _mm256_add_epi32( A, W[3] );
@@ -779,10 +719,10 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
__m256i A, B, C, D, E, F, G, H, G57, H56;
__m256i vmask, targ, hash;
__m256i W[16]; memcpy_256( W, data, 16 );
uint8_t flip, t6_mask;
uint8_t flip, t6_mask, t7_mask;
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in+1 );
@@ -793,12 +733,10 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_in+6 );
H = _mm256_load_si256( state_in+7 );
const __m256i IV7 = H;
const __m256i IV6 = G;
const __m256i istate6 = G;
const __m256i istate7 = H;
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
// rounds 0 to 16, ignore zero padding W[9..14]
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
@@ -841,11 +779,9 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
#if !defined(VL256)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
// rounds 48 to 57
// Rounds 48 to 55
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
@@ -854,77 +790,83 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// round 58 to 60 part 1
T0 = _mm256_add_epi32( v256_32( K256[58] ),
// Round 56
H = _mm256_add_epi32( v256_32( K256[56] ),
mm256_add4_32( BSG2_1x( E ), CHx( E, F, G ), W[ 8], H ) );
D = _mm256_add_epi32( D, H );
H56 = _mm256_add_epi32( H, _mm256_add_epi32( BSG2_0x( A ),
MAJx( A, B, C ) ) );
Y_xor_Z = X_xor_Y;
// Rounds 57 to 60 part 1
G = _mm256_add_epi32( v256_32( K256[57] ),
mm256_add4_32( BSG2_1x( D ), CHx( D, E, F ), W[ 9], G ) );
C = _mm256_add_epi32( C, G );
G57 = _mm256_add_epi32( G, MAJx( H56, A, B ) );
F = _mm256_add_epi32( v256_32( K256[58] ),
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
B = _mm256_add_epi32( B, T0 );
B = _mm256_add_epi32( B, F );
T1 = _mm256_add_epi32( v256_32( K256[59] ),
E = _mm256_add_epi32( v256_32( K256[59] ),
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
A = _mm256_add_epi32( A, T1 );
A = _mm256_add_epi32( A, E );
T2 = _mm256_add_epi32( v256_32( K256[60] ),
D = _mm256_add_epi32( v256_32( K256[60] ),
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
H = _mm256_add_epi32( H, T2 );
H = _mm256_add_epi32( H56, D );
// Got H, test it.
hash = mm256_bswap_32( _mm256_add_epi32( H, istate7 ) );
targ = v256_32( target[7] );
hash = mm256_bswap_32( _mm256_add_epi32( H, IV7 ) );
if ( target[7] )
{
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
return 0;
}
// A simple unsigned LE test is complicated by the lack of a cmple
// instruction, and lack of unsigned compares in AVX2.
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( t7_mask = ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) )))
return 0;
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
// round 58 part 2
F = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( G ),
MAJx( G, H, A ) ) );
// round 61 part 1
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm256_add_epi32( v256_32( K256[61] ),
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
G = _mm256_add_epi32( G, T0 );
// Round 57 part 2
G57 = _mm256_add_epi32( G57, BSG2_0x( H56 ) );
Y_xor_Z = X_xor_Y;
if ( t6_mask )
// Round 61 part 1
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
C = _mm256_add_epi32( v256_32( K256[61] ),
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
G = _mm256_add_epi32( G57, C );
if ( t6_mask == (0xff & ~t7_mask ) )
{
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
hash = mm256_bswap_32( _mm256_add_epi32( G, IV6 ) );
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
{
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0 != ( t6_mask & ( flip ^
hash = mm256_bswap_32( _mm256_add_epi32( G, istate6 ) );
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0 != ( t6_mask & ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
return 0;
if ( likely( ( target[6] == 0x80000000 )
&& ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
hash, _mm256_xor_si256( hash, hash ) ) ) ) ) ))
return 0;
}
// else inconclusive, testing targ5 isn't practical, finish hashing
return 0;
}
// At this point either the hash will be good or the test was inconclusive.
// If the latter it's probably a high target difficulty with a nearly equal
// high difficulty hash that has a good chance of being good.
// Rounds 58 to 61 part 2
F = _mm256_add_epi32( F, _mm256_add_epi32( BSG2_0x( G57 ),
MAJx( G57, H, A ) ) );
Y_xor_Z = X_xor_Y;
// rounds 59 to 61 part 2
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ),
MAJx( F, G, H ) ) );
D = _mm256_add_epi32( T2, _mm256_add_epi32( BSG2_0x( E ),
MAJx( E, F, G ) ) );
C = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( D ),
MAJx( D, E, F ) ) );
E = _mm256_add_epi32( E, _mm256_add_epi32( BSG2_0x( F ),
MAJx( F, G57, H ) ) );
Y_xor_Z = X_xor_Y;
// rounds 62 & 63
D = _mm256_add_epi32( D, _mm256_add_epi32( BSG2_0x( E ),
MAJx( E, F, G57 ) ) );
Y_xor_Z = X_xor_Y;
C = _mm256_add_epi32( C, _mm256_add_epi32( BSG2_0x( D ),
MAJx( D, E, F ) ) );
Y_xor_Z = X_xor_Y;
// Rounds 62 & 63
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
@@ -1077,40 +1019,26 @@ void sha256_8x32_full( void *dst, const void *data, size_t len )
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
__m512i T1 = BSG2_1x16( E ); \
{ \
__m512i T1 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
H = _mm512_add_epi32( H, BSG2_1x16( E ) ); \
__m512i T2 = BSG2_0x16( A ); \
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
T1 = _mm512_add_epi32( T1, H ); \
T1 = _mm512_add_epi32( T1, CHx16( E, F, G ) ); \
T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \
T1 = _mm512_add_epi32( T1, T0 ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
H = _mm512_add_epi32( H, T1 ); \
D = _mm512_add_epi32( D, H ); \
H = _mm512_add_epi32( H, T2 ); \
}
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
H = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
v512_32( K256[(i)+(j)] ) ); \
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
__m512i T = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, H ); \
H = _mm512_add_epi32( H, T ); \
}
/*
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
do { \
__m512i T1, T2; \
__m512i K = v512_32( K256[( (j)+(i) )] ); \
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
*/
#define SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
@@ -1332,10 +1260,9 @@ void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target )
{
__m512i A, B, C, D, E, F, G, H, hash, targ;
__m512i T0, T1, T2;
__m512i A, B, C, D, E, F, G, H, hash, targ, G57, H56;
__m512i W[16]; memcpy_512( W, data, 16 );
__mmask16 t6_mask;
__mmask16 mask;
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in+1 );
@@ -1346,8 +1273,8 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
G = _mm512_load_si512( state_in+6 );
H = _mm512_load_si512( state_in+7 );
const __m512i IV6 = G;
const __m512i IV7 = H;
const __m512i istate6 = G;
const __m512i istate7 = H;
// rounds 0 to 8
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
@@ -1419,7 +1346,7 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
W[11] = SHA256_16WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_16WAY_MEXP( W[10], W[ 5], W[13], W[12] );
// Rounds 48 to 57
// Rounds 48 to 55
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
@@ -1428,62 +1355,67 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
SHA256_16WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_16WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// rounds 58 to 60 part 1
T0 = _mm512_add_epi32( v512_32( K256[58] ),
// Round 56
H = _mm512_add_epi32( v512_32( K256[56] ),
mm512_add4_32( BSG2_1x16( E ), CHx16( E, F, G ), W[ 8], H ) );
D = _mm512_add_epi32( D, H );
H56 = _mm512_add_epi32( H, _mm512_add_epi32( BSG2_0x16( A ),
MAJx16( A, B, C ) ) );
// Rounds 57 to 60 part 1
G = _mm512_add_epi32( v512_32( K256[57] ),
mm512_add4_32( BSG2_1x16( D ), CHx16( D, E, F ), W[ 9], G ) );
C = _mm512_add_epi32( C, G );
G57 = _mm512_add_epi32( G, MAJx16( H56, A, B ) );
F = _mm512_add_epi32( v512_32( K256[58] ),
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
B = _mm512_add_epi32( B, T0 );
B = _mm512_add_epi32( B, F );
T1 = _mm512_add_epi32( v512_32( K256[59] ),
E = _mm512_add_epi32( v512_32( K256[59] ),
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
A = _mm512_add_epi32( A, T1 );
A = _mm512_add_epi32( A, E );
T2 = _mm512_add_epi32( v512_32( K256[60] ),
D = _mm512_add_epi32( v512_32( K256[60] ),
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
H = _mm512_add_epi32( H, T2 );
H = _mm512_add_epi32( H56, D );
// got H, test it against target[7]
hash = mm512_bswap_32( _mm512_add_epi32( H , IV7 ) );
// got final H, test it against target[7]
hash = mm512_bswap_32( _mm512_add_epi32( H , istate7 ) );
targ = v512_32( target[7] );
if ( target[7] )
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
if ( likely( 0 == ( mask = _mm512_cmple_epu32_mask( hash, targ ) ) ))
return 0;
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );
// round 58 part 2
F = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( G ),
MAJx16( G, H, A ) ) );
// Round 57 part 2
G57 = _mm512_add_epi32( G57, BSG2_0x16( H56 ) );
// round 61 part 1
// Round 61 part 1
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm512_add_epi32( v512_32( K256[61] ),
C = _mm512_add_epi32( v512_32( K256[61] ),
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
G = _mm512_add_epi32( G, T0 );
G = _mm512_add_epi32( G57, C );
// got G, test it against target[6] if indicated
if ( (uint16_t)t6_mask )
// got final G, test it against target[6] if indicated.
if ( mask == _mm512_cmpeq_epi32_mask( hash, targ ) )
{
hash = mm512_bswap_32( _mm512_add_epi32( G, IV6 ) );
hash = mm512_bswap_32( _mm512_add_epi32( G, istate6 ) );
targ = v512_32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( mask, hash, targ ) ))
return 0;
}
// round 59 part 2
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16( F ),
MAJx16( F, G, H ) ) );
// Round 58 to 61 part 2
F = _mm512_add_epi32( F, _mm512_add_epi32( BSG2_0x16( G57 ),
MAJx16( G57, H, A ) ) );
E = _mm512_add_epi32( E, _mm512_add_epi32( BSG2_0x16( F ),
MAJx16( F, G57, H ) ) );
D = _mm512_add_epi32( D, _mm512_add_epi32( BSG2_0x16( E ),
MAJx16( E, F, G57 ) ) );
C = _mm512_add_epi32( C, _mm512_add_epi32( BSG2_0x16( D ),
MAJx16( D, E, F ) ) );
// round 60 part 2
D = _mm512_add_epi32( T2, _mm512_add_epi32( BSG2_0x16( E ),
MAJx16( E, F, G ) ) );
// round 61 part 2
C = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( D ),
MAJx16( D, E, F ) ) );
// rounds 62, 63
// Rounds 62, 63
W[14] = SHA256_16WAY_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
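The 8-way and 16-way short transforms above share the same early-rejection idea: the word that becomes hash[7] is final after round 60 part 1, so it can be tested against target[7] before the remaining work is done, and hash[6] after round 61 part 1 settles the rare tie. A simplified scalar sketch of the logic, not the vector code itself:
// hash word 7 is final after round 60 part 1
uint32_t h7 = bswap_32( istate7 + H );
if ( h7 > target[7] ) return 0;            // hash can't be <= target, reject
if ( h7 == target[7] )                     // inconclusive, check the next word
{
   uint32_t h6 = bswap_32( istate6 + G );  // final after round 61 part 1
   if ( h6 > target[6] ) return 0;
}
// otherwise finish rounds 58..61 part 2 and rounds 62..63 as usual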

View File

@@ -783,29 +783,6 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
mm256_ror_64( x, 61 ), \
_mm256_srli_epi64( x, 6 ) )
#if defined(VL256)
// 4 way is not used with AVX512 but will be with AVX10_256 when it
// becomes available.
#define CH( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xca )
#define MAJ( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xe8 )
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
T1 = _mm256_add_epi64( T1, H ); \
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
T1 = _mm256_add_epi64( T1, T0 ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
#else // AVX2 only
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
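// ( (Y ^ Z) & X ) ^ Z selects Y where X is 1 and Z where X is 0, i.e. the
// SHA-2 choose function.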
@@ -827,19 +804,12 @@ do { \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
#endif // AVX512VL AVX10_256
static void
sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
{
int i;
register __m256i A, B, C, D, E, F, G, H;
#if !defined(VL256)
// Disable for AVX10_256
__m256i X_xor_Y, Y_xor_Z;
#endif
__m256i W[80];
mm256_block_bswap_64( W , in );
@@ -872,10 +842,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
H = v256_64( 0x5BE0CD19137E2179 );
}
#if !defined(VL256)
// Disable for AVX10_256
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
for ( i = 0; i < 80; i += 8 )
{

View File

@@ -109,7 +109,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
mm256_aesenc_2x128( k00, zero ) ) );

View File

@@ -21,7 +21,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
__m512i *H = (__m512i*)ctx->h;
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
ctx->count1, ctx->count0 );
int r;
const __m512i zero = _mm512_setzero_si512();
P0 = H[0];
P1 = H[1];
@@ -37,182 +37,160 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
K6 = M[6];
K7 = M[7];
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
// round 0
P0 = _mm512_xor_si512( P0, X );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
P2 = _mm512_xor_si512( P2, X );
// round
for ( r = 0; r < 3; r ++ )
for ( int r = 0; r < 3; r ++ )
{
// round 1, 5, 9
// round 1, 5, 9
K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ) );
_mm512_aesenc_epi128( K0, zero ) ) );
if ( r == 0 )
K0 = _mm512_xor_si512( K0,
_mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
_mm512_mask_ternarylogic_epi32( count, 0x8888, count, count, 1 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
K1 = _mm512_xor_si512( K0,
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, zero ) ) );
if ( r == 1 )
K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
_mm512_mask_ternarylogic_epi32( count, 0x1111, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K1,
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K2,
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, zero ) ) );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
K4 = _mm512_xor_si512( K3,
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
K5 = _mm512_xor_si512( K4,
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K5,
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K6,
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, zero ) ) );
if ( r == 2 )
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
_mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
_mm512_mask_ternarylogic_epi32( count, 0x2222, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
// round 2, 6, 10
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), zero );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P2 );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), zero );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P0 );
// round 3, 7, 11
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
_mm512_aesenc_epi128( K0, zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), zero );
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
_mm512_aesenc_epi128( K1, zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
_mm512_aesenc_epi128( K2, zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
_mm512_aesenc_epi128( K3, zero ) ), K2 );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P1 );
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
_mm512_aesenc_epi128( K4, zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), zero );
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
_mm512_aesenc_epi128( K5, zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
_mm512_aesenc_epi128( K6, zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
_mm512_aesenc_epi128( K7, zero ) ), K6 );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P3 );
// round 4, 8, 12
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
}
// round 13
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
_mm512_aesenc_epi128( K0, zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
_mm512_aesenc_epi128( K1, zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
_mm512_aesenc_epi128( K2, zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
_mm512_aesenc_epi128( K3, zero ) ), K2 );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
_mm512_aesenc_epi128( K4, zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, mm512_swap64_32(
_mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
_mm512_aesenc_epi128( K5, zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) );
K6 = mm512_xor3( K6, K5, mm512_swap64_32(
_mm512_mask_ternarylogic_epi32( count, 0x4444, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7= _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
_mm512_aesenc_epi128( K7, zero ) ), K6 );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
H[0] = _mm512_xor_si512( H[0], P2 );
H[1] = _mm512_xor_si512( H[1], P3 );
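These shavite changes are the target-specific AES optimization noted in the change log: AESENC XORs its second operand (the round key) into the result after SubBytes, ShiftRows and MixColumns, so passing the chaining value as that operand fuses the trailing XOR into the AES round. Schematically, for the last step of each sub-round:
// before: X  = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
//         P0 = _mm512_xor_si512( P0, X );
// after:  P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );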

View File

@@ -1,159 +0,0 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "sph_shavite.h"
extern void inkhash(void *state, const void *input)
{
sph_shavite512_context ctx_shavite;
uint32_t hash[16];
sph_shavite512_init(&ctx_shavite);
sph_shavite512 (&ctx_shavite, (const void*) input, 80);
sph_shavite512_close(&ctx_shavite, (void*) hash);
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, (const void*) hash, 64);
sph_shavite512_close(&ctx_shavite, (void*) hash);
memcpy(state, hash, 32);
/*
int ii;
printf("result: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)state)[ii]);
};
printf ("\n");
*/
}
int scanhash_ink( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
//const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(32) hash64[8];
uint32_t endiandata[32];
//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
//we need bigendian data...
//lessons learned: do NOT endianchange directly in pdata, or all proofs-of-work will be considered stale by minerd....
int kk=0;
for (; kk < 32; kk++)
{
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
};
// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }
/* I'm too lazy to put the loop in an inline function... so dirty copy'n'paste.... */
/* i know that i could set a variable, but i don't know how the compiler would optimize it, and i don't want the cpu to have to load the value into a register *every time* */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFFFF)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFFF0)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xFF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xFFF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFF000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xFFFF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFF0000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
bool register_shavite_algo( algo_gate_t* gate )
{
algo_not_implemented();
return false;
// gate->scanhash = (void*)&scanhash_ink;
// gate->hash = (void*)&inkhash;
// return true;
};
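For context, the ladder of masked checks in the removed scanner (target[7] == 0, <= 0xF, <= 0xFF, ...) is a hand-unrolled coarse filter on the top hash word before the full comparison; the scanners that remain get the same effect from a single guarded test, roughly (simplified, not taken from the removed file):
if ( hash64[7] <= ptarget[7] && fulltest( hash64, ptarget ) )
   submit_solution( work, hash64, mythr );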

View File

@@ -50,7 +50,8 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
static const sph_u32 IV512[] = {
static const sph_u32 IV512[] =
{
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
@@ -71,38 +72,26 @@ c512( sph_shavite_big_context *sc, const void *msg )
p2 = h[2];
p3 = h[3];
// round
k00 = m[0];
x = v128_xor( p1, k00 );
x = v128_aesenc_nokey( x );
k01 = m[1];
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = m[2];
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = m[3];
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
k10 = m[4];
x = v128_xor( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = m[5];
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = m[6];
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = m[7];
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x );
// round 0
x = v128_xoraesenc( p1, k00 );
x = v128_xoraesenc( x, k01 );
x = v128_xoraesenc( x, k02 );
p0 = v128_xoraesencxor( x, k03, p0 );
x = v128_xoraesenc( p3, k10 );
x = v128_xoraesenc( x, k11 );
x = v128_xoraesenc( x, k12 );
p2 = v128_xoraesencxor( x, k13, p2 );
for ( r = 0; r < 3; r ++ )
{
@@ -113,198 +102,165 @@ c512( sph_shavite_big_context *sc, const void *msg )
if ( r == 0 )
k00 = v128_xor( k00, v128_set32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = v128_xoraesenc( p0, k00 );
x = v128_xor( p0, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
if ( r == 1 )
k01 = v128_xor( k01, v128_set32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = v128_xoraesenc( x, k01 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k03, p3 );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xoraesenc( p2, k10 );
x = v128_xor( p2, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
if ( r == 2 )
k13 = v128_xor( k13, v128_set32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k13, p1 );
// round 2, 6, 10
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p3, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p3, k00 );
p2 = v128_xor( p2, x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p2 = v128_xoraesencxor( x, k03, p2 );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p1, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p1, k10 );
p0 = v128_xor( p0, x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p0 = v128_xoraesencxor( x, k13, p0 );
// round 3, 7, 11
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p2, k00 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p2, k00 );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k01 );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k03, p1 );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p0, k10 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p0, k10 );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k13, p3 );
// round 4, 8, 12
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p1, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p1, k00 );
p0 = v128_xor( p0, x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p0 = v128_xoraesencxor( x, k03, p0 );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p3, k10 );
p2 = v128_xor( p2, x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p2 = v128_xoraesencxor( x, k13, p2 );
}
// round 13
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p0, k00 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p0, k00 );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k01 );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k03, p3 );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p2, k10 );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, v128_xor( k11, v128_set32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k13, p1 );
h[0] = v128_xor( h[0], p2 );
h[1] = v128_xor( h[1], p3 );
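The scalar/NEON path above gets the same treatment through two helpers whose definitions are not part of this diff; judging from the substitutions they are equivalent to the removed sequences (assumed, shown schematically):
// v128_xoraesenc( a, k )       ==  v128_aesenc_nokey( v128_xor( a, k ) )
// v128_xoraesencxor( a, k, p ) ==  v128_xor( v128_aesenc_nokey( v128_xor( a, k ) ), p )
// On x86_64 the trailing XOR can ride in the AESENC round-key operand; on ARM64
// the leading XOR is absorbed by AESE, which XORs its key operand before SubBytes.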

View File

@@ -4,7 +4,7 @@
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
./clean-all.sh
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean
@@ -18,31 +18,31 @@ strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
# Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-alderlake
# Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14
# Arrowlake-s includes SHA512, Arrowlake does not?
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-arrowlake-s
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Graniterapids does not build with AVX10, SHA512 or APX.
# wait for Diamondrapids & gcc-15.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-graniterapids
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-graniterapids
# SHA512 AVX10.1
#make clean || echo clean
@@ -69,20 +69,20 @@ mv cpuminer cpuminer-avx512-sha-vaes
#mv cpuminer cpuminer-diamondrapids
# Zen5: AVX512 SHA VAES, requires gcc-14.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-zen5
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen5
# Zen4: AVX512 SHA VAES
make clean || echo clean
rm -f config.status
# Zen4: AVX512, SHA, VAES, needs gcc-12.3.
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
# Incomplete list of Zen4 AVX512 extensions, but includes all extensions used by cpuminer.
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen4
@@ -115,8 +115,8 @@ mv cpuminer cpuminer-avx2-sha-vaes
# AVX2 SHA AES: AMD Zen1
make clean || echo done
rm -f config.status
#CFLAGS="-O3 -march=znver1 -maes -Wall" ./configure --with-curl
CFLAGS="-O3 -maes -mavx2 -msha -Wall" ./configure --with-curl
CFLAGS="-O3 -march=znver1 -maes -Wall" ./configure --with-curl
#CFLAGS="-O3 -maes -mavx2 -msha -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx2-sha

View File

@@ -2,7 +2,7 @@
#
# make clean and rm all the targeted executables.
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen* cpuminer-x64 cpuminer-armv* > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null

View File

@@ -108,7 +108,24 @@ extern "C"{
} while (0)
#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
{ \
(Y0) = AES0[(X0) & 0xFF] \
^ AES1[((X1) >> 8) & 0xFF] \
^ AES2[((X2) >> 16) & 0xFF] \
^ AES3[((X3) >> 24) & 0xFF]; \
(Y1) = AES0[(X1) & 0xFF] \
^ AES1[((X2) >> 8) & 0xFF] \
^ AES2[((X3) >> 16) & 0xFF] \
^ AES3[((X0) >> 24) & 0xFF]; \
(Y2) = AES0[(X2) & 0xFF] \
^ AES1[((X3) >> 8) & 0xFF] \
^ AES2[((X0) >> 16) & 0xFF] \
^ AES3[((X1) >> 24) & 0xFF]; \
(Y3) = AES0[(X3) & 0xFF] \
^ AES1[((X0) >> 8) & 0xFF] \
^ AES2[((X1) >> 16) & 0xFF] \
^ AES3[((X2) >> 24) & 0xFF]; \
}
#endif

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.4.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.6.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.4'
PACKAGE_STRING='cpuminer-opt 25.4'
PACKAGE_VERSION='25.6'
PACKAGE_STRING='cpuminer-opt 25.6'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1431,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.4:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
esac
cat <<\_ACEOF
@@ -1536,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.4
cpuminer-opt configure 25.6
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.4, which was
It was created by cpuminer-opt $as_me 25.6, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3591,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.4'
VERSION='25.6'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -5808,11 +5808,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
printf %s "checking for $CXX option to enable C++11 features... " >&6; }
if test ${ac_cv_prog_cxx_11+y}
if test ${ac_cv_prog_cxx_cxx11+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_11=no
ac_cv_prog_cxx_cxx11=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
@@ -5854,11 +5854,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
printf %s "checking for $CXX option to enable C++98 features... " >&6; }
if test ${ac_cv_prog_cxx_98+y}
if test ${ac_cv_prog_cxx_cxx98+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_98=no
ac_cv_prog_cxx_cxx98=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
@@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.4, which was
This file was extended by cpuminer-opt $as_me 25.6, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.4
cpuminer-opt config.status 25.6
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [25.4])
AC_INIT([cpuminer-opt], [25.6])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.4.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.6.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.4'
PACKAGE_STRING='cpuminer-opt 25.4'
PACKAGE_VERSION='25.6'
PACKAGE_STRING='cpuminer-opt 25.6'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
'configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems.
'configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1424,7 +1424,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.4:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
esac
cat <<\_ACEOF
@@ -1528,7 +1528,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.4
cpuminer-opt configure 25.6
generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.4, which was
It was created by cpuminer-opt $as_me 25.6, which was
generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3065,7 +3065,7 @@ ac_config_headers="$ac_config_headers cpuminer-config.h"
am__api_version='1.17'
am__api_version='1.18'
# Find a good install program. We prefer a C program (faster),
@@ -3334,10 +3334,14 @@ am_lf='
'
case `pwd` in
*[\\\"\#\$\&\'\`$am_lf]*)
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;;
esac
case $srcdir in
*[\\\"\#\$\&\'\`$am_lf\ \ ]*)
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;;
esac
@@ -3764,7 +3768,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.4'
VERSION='25.6'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -3802,9 +3806,133 @@ AMTAR='$${TAR-tar}'
# We'll loop over all known methods to create a tar archive until one works.
_am_tools='gnutar pax cpio none'
_am_tools='gnutar plaintar pax cpio none'
am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
# The POSIX 1988 'ustar' format is defined with fixed-size fields.
# There is notably a 21 bits limit for the UID and the GID. In fact,
# the 'pax' utility can hang on bigger UID/GID (see automake bug#8343
# and bug#13588).
am_max_uid=2097151 # 2^21 - 1
am_max_gid=$am_max_uid
# The $UID and $GID variables are not portable, so we need to resort
# to the POSIX-mandated id(1) utility. Errors in the 'id' calls
# below are definitely unexpected, so allow the users to see them
# (that is, avoid stderr redirection).
am_uid=`id -u || echo unknown`
am_gid=`id -g || echo unknown`
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether UID '$am_uid' is supported by ustar format" >&5
printf %s "checking whether UID '$am_uid' is supported by ustar format... " >&6; }
if test x$am_uid = xunknown; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&5
printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&2;}
elif test $am_uid -le $am_max_uid; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
_am_tools=none
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GID '$am_gid' is supported by ustar format" >&5
printf %s "checking whether GID '$am_gid' is supported by ustar format... " >&6; }
if test x$gm_gid = xunknown; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&5
printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&2;}
elif test $am_gid -le $am_max_gid; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
_am_tools=none
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking how to create a ustar tar archive" >&5
printf %s "checking how to create a ustar tar archive... " >&6; }
# Go ahead even if we have the value already cached. We do so because we
# need to set the values for the 'am__tar' and 'am__untar' variables.
_am_tools=${am_cv_prog_tar_ustar-$_am_tools}
for _am_tool in $_am_tools; do
case $_am_tool in
gnutar)
for _am_tar in tar gnutar gtar; do
{ echo "$as_me:$LINENO: $_am_tar --version" >&5
($_am_tar --version) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); } && break
done
am__tar="$_am_tar --format=ustar -chf - "'"$$tardir"'
am__tar_="$_am_tar --format=ustar -chf - "'"$tardir"'
am__untar="$_am_tar -xf -"
;;
plaintar)
# Must skip GNU tar: if it does not support --format= it doesn't create
# ustar tarball either.
(tar --version) >/dev/null 2>&1 && continue
am__tar='tar chf - "$$tardir"'
am__tar_='tar chf - "$tardir"'
am__untar='tar xf -'
;;
pax)
am__tar='pax -L -x ustar -w "$$tardir"'
am__tar_='pax -L -x ustar -w "$tardir"'
am__untar='pax -r'
;;
cpio)
am__tar='find "$$tardir" -print | cpio -o -H ustar -L'
am__tar_='find "$tardir" -print | cpio -o -H ustar -L'
am__untar='cpio -i -H ustar -d'
;;
none)
am__tar=false
am__tar_=false
am__untar=false
;;
esac
# If the value was cached, stop now. We just wanted to have am__tar
# and am__untar set.
test -n "${am_cv_prog_tar_ustar}" && break
# tar/untar a dummy directory, and stop if the command works.
rm -rf conftest.dir
mkdir conftest.dir
echo GrepMe > conftest.dir/file
{ echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5
(tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
rm -rf conftest.dir
if test -s conftest.tar; then
{ echo "$as_me:$LINENO: $am__untar <conftest.tar" >&5
($am__untar <conftest.tar) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
{ echo "$as_me:$LINENO: cat conftest.dir/file" >&5
(cat conftest.dir/file) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
fi
done
rm -rf conftest.dir
if test ${am_cv_prog_tar_ustar+y}
then :
printf %s "(cached) " >&6
else case e in #(
e) am_cv_prog_tar_ustar=$_am_tool ;;
esac
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_tar_ustar" >&5
printf "%s\n" "$am_cv_prog_tar_ustar" >&6; }
@@ -4986,7 +5114,10 @@ _ACEOF
break
fi
done
rm -f core conftest*
# aligned with autoconf, so not including core; see bug#72225.
rm -f -r a.out a.exe b.out conftest.$ac_ext conftest.$ac_objext \
conftest.dSYM conftest1.$ac_ext conftest1.$ac_objext conftest1.dSYM \
conftest2.$ac_ext conftest2.$ac_objext conftest2.dSYM
unset am_i ;;
esac
fi
@@ -7450,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.4, which was
This file was extended by cpuminer-opt $as_me 25.6, which was
generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7518,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.4
cpuminer-opt config.status 25.6
configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\"

View File

@@ -3760,10 +3760,10 @@ int main(int argc, char *argv[])
#if defined(_WIN32_WINNT)
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT = 0x%04x", _WIN32_WINNT );
applog( LOG_INFO, "_WIN32_WINNT = 0x%04x", _WIN32_WINNT );
#else
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT undefined." );
applog( LOG_INFO, "_WIN32_WINNT undefined." );
#endif
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )

View File

@@ -582,6 +582,8 @@ enum algos {
ALGO_ANIME,
ALGO_ARGON2D250,
ALGO_ARGON2D500,
ALGO_ARGON2D1000,
ALGO_ARGON2D16000,
ALGO_ARGON2D4096,
ALGO_AXIOM,
ALGO_BLAKE,
@@ -677,6 +679,8 @@ static const char* const algo_names[] = {
"anime",
"argon2d250",
"argon2d500",
"argon2d1000",
"argon2d16000",
"argon2d4096",
"axiom",
"blake",
@@ -837,6 +841,8 @@ Options:\n\
anime Animecoin (ANI)\n\
argon2d250\n\
argon2d500\n\
argon2d1000\n\
argon2d16000\n\
argon2d4096\n\
axiom Shabal-256 MemoHash\n\
blake blake256r14 (SFR)\n\
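The enum, name table, and usage text above are only the string-level plumbing; each new algo also needs its gate-registration function wired into the dispatcher. A hedged sketch of what such a function typically looks like for the new variants (hypothetical field values; the real register_argon2d1000_algo in the argon2d gate source may differ):

// hypothetical sketch, not copied from the argon2d gate source
bool register_argon2d1000_algo( algo_gate_t* gate )
{
   gate->scanhash      = (void*)&scanhash_argon2d;   // generic argon2d scanner
   gate->hash          = (void*)&argon2d1000_hash;   // variant-specific hash
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   opt_target_factor   = 65536.0;                    // assumed; tuned per coin
   return true;
}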

View File

@@ -137,10 +137,24 @@
#define v128_unpackhi8 _mm_unpackhi_epi8
// AES
// Nokey means nothing on x86_64 but it saves an instruction and a register
// on ARM.
#define v128_aesenc _mm_aesenc_si128
// xor key with result after encryption, x86_64 format.
#define v128_aesencxor _mm_aesenc_si128
// default is x86_64 format.
#define v128_aesenc v128_aesencxor
// xor key with v before encryption, arm64 format.
#define v128_xoraesenc( v, k ) \
_mm_aesenc_si128( v128_xor( v, k ), v128_zero )
// xor v with k_in before encryption then xor the result with k_out afterward.
// Uses the applicable optimization based on the target.
#define v128_xoraesencxor( v, k_in, k_out ) \
_mm_aesenc_si128( v128_xor( v, k_in ), k_out )
// arm64 optimized
#define v128_aesenc_nokey(v) _mm_aesenc_si128( v, v128_zero )
#define v128_aesenclast _mm_aesenclast_si128
#define v128_aesenclast_nokey(v) _mm_aesenclast_si128( v, v128_zero )
#define v128_aesdec _mm_aesdec_si128
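A short usage sketch of the three flavours defined above, assuming a v128_t typedef for __m128i as used elsewhere in these headers (illustrative only, not code from this change):

// the names describe where the key xor lands relative to the AES round
static inline void aes_flavour_demo( v128_t x, v128_t k_in, v128_t k_out,
                                     v128_t *a, v128_t *b, v128_t *c )
{
   *a = v128_aesencxor( x, k_in );             // one AES round of x, then xor k_in
   *b = v128_xoraesenc( x, k_in );             // xor k_in first, then one keyless round
   *c = v128_xoraesencxor( x, k_in, k_out );   // xor k_in before, xor k_out after
}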

View File

@@ -217,7 +217,9 @@ static inline __m256i mm256_not( const __m256i v )
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
// The functions return an int, so the mask is subject to integer promotion when used
// in an expression. Users should mask off the unused high bits where needed to maintain
// data integrity.
#define mm256_movmask_64( v ) \
_mm256_movemask_pd( _mm256_castsi256_pd( v ) )
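A hedged example of the point above, using the raw intrinsic the macro wraps: the movemask itself only sets the low 4 bits, but operating on the returned int (here a bitwise NOT) turns the slack bits on, so they are masked off before the result is used as a lane mask.

#include <immintrin.h>

// which 64-bit lanes of v are NOT negative (MSBit clear)
static inline int nonnegative_lanes_64( __m256i v )
{
   int m = _mm256_movemask_pd( _mm256_castsi256_pd( v ) );   // 4 meaningful bits
   return ~m & 0xF;                                          // mask the slack bits
}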

View File

@@ -14,12 +14,6 @@
// vectors. It is therefore not technically required for any 512 bit vector
// utilities defined below.
// if avx10 // avx512 is always set
// if evex512: yes
// else if avx512 : yes // avx512 is set but not avx10
// else : no // avx512 not set or avx10.1 is set without evex512
#if defined(SIMD512)
// AVX512 intrinsics have a few changes from previous conventions.
@@ -57,7 +51,7 @@
// - if an argument is to be referenced multiple times, a C inline function
// should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times (wasteful) or producing side
// effects (very bad).
// effects (very bad).
//
// There are 2 areas where overhead is a major concern: constants and
// permutations.
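A hedged illustration of the guideline above (not code from this header): the macro expands its argument in two places, so an argument with a side effect runs twice, while the equivalent inline function evaluates it exactly once.

#define DOUBLE_BAD( x )  ( (x) + (x) )                      // expands the argument twice

static inline int double_good( int x ) { return x + x; }    // evaluates it once

// DOUBLE_BAD( *p++ ) advances p twice; double_good( *p++ ) advances it once.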

View File

@@ -4,9 +4,10 @@
#if defined(__aarch64__) && defined(__ARM_NEON)
// Targeted functions supporting NEON SIMD 128 & 64 bit vectors.
// Element size matters!
//
// Intel naming is generally used.
// Intel-style naming is generally used; however, this is not an attempt to emulate Intel
// instructions. It focuses on the functions used in this program and the best way
// to implement them with NEON.
//
// Some advanced logical operations that require SHA3. Prior to GCC-13
// they also require armv8.2
@@ -186,9 +187,21 @@
// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
// AES
// consistent with Intel AES intrinsics, break up for optimizing
#define v128_aesenc( v, k ) \
v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) )
// xor key with result after encryption, x86_64 format.
#define v128_aesencxor( v, k ) \
v128_xor( vaesmcq_u8( vaeseq_u8( v, v128_zero ) ), k )
// default is x86_64 format.
#define v128_aesenc v128_aesencxor
// xor key with v before encryption, arm64 format.
#define v128_xoraesenc( v, k ) \
vaesmcq_u8( vaeseq_u8( v, k ) )
// xor v with k_in before encryption then xor the result with k_out afterward.
// Uses the applicable optimization based on the target.
#define v128_xoraesencxor( v, k_in, k_out ) \
v128_xor( v128_xoraesenc( v, k_in ), k_out )
#define v128_aesenc_nokey( v ) \
vaesmcq_u8( vaeseq_u8( v, v128_zero ) )
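For context, a hedged ARM64-only sketch of why the xor-before form is the cheap one here: AESE already xors its key operand into the state, so passing the real key costs nothing, while the x86-style xor-after form needs a separate EOR (illustrative, using raw NEON intrinsics rather than the wrappers above).

#include <arm_neon.h>

static inline uint8x16_t round_arm_order( uint8x16_t v, uint8x16_t k )
{
   return vaesmcq_u8( vaeseq_u8( v, k ) );              // AESE + AESMC only
}

static inline uint8x16_t round_x86_order( uint8x16_t v, uint8x16_t k )
{
   return veorq_u8( vaesmcq_u8( vaeseq_u8( v, vdupq_n_u8( 0 ) ) ), k );   // extra EOR
}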