Compare commits

..

2 Commits
v25.5 ... v25.7

Author SHA1 Message Date
Jay D Dee
8f2f9ec3e9 v25.7 2025-11-15 10:44:32 -05:00
Jay D Dee
12480a3ea5 v25.6 2025-07-20 19:43:10 -04:00
36 changed files with 960 additions and 935 deletions

View File

@@ -54,9 +54,9 @@ Supported Algorithms
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250
argon2d500
argon2d1000
argon2d4096
blake Blake-256
blake2b Blake2-512

View File

@@ -75,9 +75,22 @@ If not what makes it happen or not happen?
Change Log
----------
v25.7
Fixed a bug calculating TTF longer than 1 year.
Faster argon2d.
Faster hamsi AVX512.
Faster swifftx AVX2.
Other small fixes and improvements.
v25.6
Added argon2d1000, argon2d16000 algos.
Target specific AES optimizations improve shavite for ARM64 & x86_64.
v25.5
x86_64: Fixed and insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
x86_64: Faster sha256d, sha256dt for AVX2 & AVX512.
Other small bug fixes.

View File

@@ -297,6 +297,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D1000: rc = register_argon2d1000_algo ( gate ); break;
case ALGO_ARGON2D16000: rc = register_argon2d16000_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;

View File

@@ -172,8 +172,11 @@ void ( *set_work_data_endian ) ( struct work* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
// Deprecated
set_t optimizations;
int ( *get_work_data_size ) ();
int ntime_index;
int nbits_index;
int nonce_index; // use with caution, see warning below
@@ -274,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
// OpenSSL sha256 deprecated
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
bool std_le_work_decode( struct work *work );
bool std_be_work_decode( struct work *work );

View File

@@ -6,6 +6,38 @@ static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Inpu
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
// generic, works with most variations of argon2d
// Generic scanhash loop shared by the argon2d variants.  Byte-swaps the
// 80-byte block header once, then hashes sequential nonces through the
// algo_gate hash hook until the nonce range is exhausted or the work is
// restarted.  Valid (non-benchmark) solutions are submitted with the
// nonce restored to wire byte order.
// Returns 0; *hashes_done receives the number of nonces tried.
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(64) endiandata[20];   // byte-swapped header copy
   uint32_t _ALIGN(64) hash32[8];        // 256-bit candidate hash
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const int thr_id = mythr->id;
   const uint32_t start_n = (const uint32_t) pdata[19];
   const uint32_t end_n   = (const uint32_t) max_nonce;
   const bool bench = opt_benchmark;
   uint32_t n = start_n;

   // One-time endian conversion of the 80-byte header.
   v128_bswap32_80( endiandata, pdata );

   do
   {
      endiandata[19] = n;
      algo_gate.hash( hash32, endiandata, thr_id );
      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
         // Restore wire byte order for the submitted nonce.
         pdata[19] = bswap_32( n );
         submit_solution( work, hash32, mythr );
      }
      n++;
   } while ( likely( n < end_n && !work_restart[thr_id].restart ) );

   pdata[19] = n;
   *hashes_done = n - start_n;
   return 0;
}
void argon2d250_hash( void *output, const void *input )
{
argon2_context context;
@@ -32,41 +64,10 @@ void argon2d250_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
// Legacy per-variant scanhash for argon2d250 (superseded by the shared
// scanhash_argon2d loop).  Iterates nonces, hashing each header with
// argon2d250_hash, and submits results that pass the cheap word-7 check
// plus the full target test.
// Returns 0; *hashes_done receives the count of nonces processed.
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t first_nonce = pdata[19];
// Cheap pre-filter: compare only the high target word before fulltest.
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
// Byte-swap the 20-word header once up front.
swab32_array( edata, pdata, 20 );
do {
be32enc(&edata[19], nonce);
argon2d250_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
// NOTE(review): the +1 here differs from the newer shared loop, which
// reports nonce - first_nonce without the extra increment.
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
// Register the argon2d250 (Credits) algorithm with the gate.
// Uses the generic scanhash_argon2d loop driving argon2d250_hash.
// Always returns true.
bool register_argon2d250_algo( algo_gate_t* gate )
{
  // Removed a dead store: gate->scanhash was assigned twice in a row,
  // with the first value immediately overwritten.
  gate->scanhash = (void*)&scanhash_argon2d;
  gate->hash = (void*)&argon2d250_hash;
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}
@@ -97,43 +98,78 @@ void argon2d500_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
// Per-variant scanhash for argon2d500: identical structure to the shared
// scanhash_argon2d loop but calls argon2d500_hash directly.
// Returns 0; *hashes_done receives the number of nonces tried.
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(64) edata[20];
   uint32_t _ALIGN(64) hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const int thr_id = mythr->id;
   const uint32_t first_nonce = (const uint32_t)pdata[19];
   const uint32_t last_nonce = (const uint32_t)max_nonce;
   uint32_t nonce = first_nonce;
   const bool bench = opt_benchmark;

   // One-time endian conversion of the 80-byte header.
   v128_bswap32_80( edata, pdata );

   do
   {
      edata[19] = nonce;
      argon2d500_hash( hash, edata );
      if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
           && !bench ) )
      {
         // Fixed: stray double semicolon removed from this statement.
         pdata[19] = bswap_32( nonce );
         submit_solution( work, hash, mythr );
      }
      nonce++;
   } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );

   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce;
   return 0;
}
// Register the argon2d500 (Dynamic) algorithm with the gate.
// Uses the generic scanhash_argon2d loop driving argon2d500_hash.
// Always returns true.
bool register_argon2d500_algo( algo_gate_t* gate )
{
  // Removed a dead store: gate->scanhash was assigned twice in a row,
  // with the first value immediately overwritten.
  gate->scanhash = (void*)&scanhash_argon2d;
  gate->hash = (void*)&argon2d500_hash;
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}
// Argon2d hash for the argon2d1000 variant (Zero Dynamics Cash):
// m_cost = 1000 KiB, 8 lanes, 2 iterations, Argon2 version 0x10.
// The 80-byte block header serves as both password and salt; the
// result is a 32-byte hash written to output.
void argon2d1000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 1000; // Memory in KiB (~1 MB)
context.lanes = 8; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 2; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
// Register the argon2d1000 (Zero Dynamics Cash) algorithm with the gate.
// Uses the generic scanhash_argon2d loop driving argon2d1000_hash.
// Always returns true.
bool register_argon2d1000_algo( algo_gate_t* gate )
{
  gate->scanhash = (void*)&scanhash_argon2d;
  gate->hash = (void*)&argon2d1000_hash;
  // Consistency with the sibling argon2d250/argon2d500 registrations,
  // which advertise the same SIMD optimizations for the shared argon2d
  // core.  NOTE(review): confirm this flag set is intended here.
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}
// Argon2d hash for the argon2d16000 variant:
// m_cost = 16000 KiB, 1 lane, 1 iteration, Argon2 version 0x10.
// The 80-byte block header serves as both password and salt; the
// result is a 32-byte hash written to output.
void argon2d16000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 16000; // Memory in KiB (~15.6 MiB)
context.lanes = 1; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 1; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
// Register the argon2d16000 algorithm with the gate.
// Uses the generic scanhash_argon2d loop driving argon2d16000_hash.
// Always returns true.
bool register_argon2d16000_algo( algo_gate_t* gate )
{
  gate->scanhash = (void*)&scanhash_argon2d;
  gate->hash = (void*)&argon2d16000_hash;
  // Consistency with the sibling argon2d250/argon2d500 registrations,
  // which advertise the same SIMD optimizations for the shared argon2d
  // core.  NOTE(review): confirm this flag set is intended here.
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}
@@ -148,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t n = first_nonce;
const int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
uint32_t t_cost = 1; // 1 iteration
uint32_t m_cost = 4096; // use 4MB
uint32_t parallelism = 1; // 1 thread, 2 lanes
@@ -176,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
// Register the argon2d4096 (Unitus) algorithm with the gate.  This
// variant keeps its own dedicated scanhash implementation instead of
// the shared argon2d loop.  Always returns true.
bool register_argon2d4096_algo( algo_gate_t* gate )
{
  gate->scanhash = (void*)&scanhash_argon2d4096;
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}

View File

@@ -4,22 +4,27 @@
#include "algo-gate-api.h"
#include <stdint.h>
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d250_algo( algo_gate_t* gate );
void argon2d250_hash( void *state, const void *input );
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d500_algo( algo_gate_t* gate );
void argon2d500_hash( void *state, const void *input );
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Zero Dynamics Cash: version = 0x10, m_cost = 1000.
bool register_argon2d1000_algo( algo_gate_t* gate );
void argon2d1000_hash( void *state, const void *input );
bool register_argon2d16000_algo( algo_gate_t* gate );
void argon2d16000_hash( void *state, const void *input );
// Unitus: version = 0x13, m_cost = 4096.
bool register_argon2d4096_algo( algo_gate_t* gate );

View File

@@ -66,82 +66,60 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#if defined(__SSSE3__) || defined(__ARM_NEON)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = v128_alignr8(B1, B0, 8); \
v128_t t1 = v128_alignr8(B0, B1, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = v128_alignr8(D1, D0, 8); \
t1 = v128_alignr8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = v128_alignr8( B1, B0, 8 ); \
B1 = v128_alignr8( B0, B1, 8 ); \
B0 = t; \
t = v128_alignr8( D1, D0, 8 ); \
D0 = v128_alignr8( D0, D1, 8 ); \
D1 = t; \
}
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = v128_alignr8(B0, B1, 8); \
v128_t t1 = v128_alignr8(B1, B0, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = v128_alignr8(D0, D1, 8); \
t1 = v128_alignr8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = v128_alignr8( B0, B1, 8 ); \
B1 = v128_alignr8( B1, B0, 8 ); \
B0 = t; \
t = v128_alignr8( D0, D1, 8 ); \
D0 = v128_alignr8( D1, D0, 8 ); \
D1 = t; \
}
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = D0; \
v128_t t1 = B0; \
D0 = C0; \
C0 = C1; \
C1 = D0; \
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = D0; \
D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \
D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \
t = B0; \
B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
B1 = v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \
}
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = B0; \
B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \
t = D0; \
D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \
}
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0, t1; \
t0 = C0; \
C0 = C1; \
C1 = t0; \
t0 = B0; \
t1 = D0; \
B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#endif
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
{ \
G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \
G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \
UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
}
#else /* __AVX2__ */
#include <immintrin.h>
@@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
@@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp1 = C0; \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
C1 = tmp1; \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
} while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
@@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp1 = C0; \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
} while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0);
@@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0);
@@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#include <immintrin.h>
/*
static inline __m512i muladd(__m512i x, __m512i y)
{
__m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
}
*/
/* First half of the BlaMka G function for two 512-bit lane pairs.
 * Each a = a + b + 2 * lo32(a) * lo32(b) step is expanded inline
 * (mul_epu32 + two adds) instead of a muladd helper so the two lane
 * pairs' operations interleave.  Rotations: D by 32, B by 24. */
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 32); \
D1 = _mm512_ror_epi64(D1, 32); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
}
/* Second half of the BlaMka G function for two 512-bit lane pairs.
 * Same structure as G1 but with the second-half rotation amounts:
 * D by 16, B by 63. */
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 16); \
D1 = _mm512_ror_epi64(D1, 16); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
}
/*
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
@@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0)
*/
/*
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
@@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0)
*/
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while ((void)0, 0)
@@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while ((void)0, 0)
@@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
static const __m512i swap_q0 = { 0,1, 8,9, 2,3, 10,11 };
static const __m512i swap_q1 = { 4,5, 12,13, 6,7, 14,15 };
static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 };
static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 };
#define SWAP_HALVES(A0, A1) \
do { \
__m512i t; \
@@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y)
A0 = t; \
} while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \
A0 = t; \
}
#define UNSWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \
A0 = t; \
}
/*
#define SWAP_QUARTERS(A0, A1) \
do { \
SWAP_HALVES(A0, A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
} while((void)0, 0)
*/
/*
#define UNSWAP_QUARTERS(A0, A1) \
do { \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
SWAP_HALVES(A0, A1); \
} while((void)0, 0)
*/
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
do { \

View File

@@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 );
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
__m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL );
static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm256_add_epi64( K, Kincr );
@@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] )
{
__m512i qt[32], xl, xh;
__m512i mh[16];
__m512i mh[16], mj[16];
int i;
for ( i = 0; i < 16; i++ )
@@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
__m512i mj[16];
mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 );
mj[ 2] = mm512_rol_64( M[ 2], 3 );
@@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 );
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
__m512i K = _mm512_set1_epi64( 0x5555555555555550ULL );
static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm512_add_epi64( K, Kincr );

View File

@@ -503,32 +503,28 @@ do { \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \
L8( s0, t0, s9, t1 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \
L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \
t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \
L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \
L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
@@ -537,21 +533,20 @@ do { \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \
sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
@@ -1268,7 +1263,7 @@ do { \
} while (0)
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
// v3, 15 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
@@ -1286,7 +1281,7 @@ do { \
#endif
/*
/ v2, 16 instructions, 10 TL equivalent instructions
/ v2, 16 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i t = mm256_xorand( d, a, c ); \

View File

@@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
__m512i t = a0; \
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512( a2, a3 ); \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0); \
a1 = _mm512_or_si512( a1, a3 ); \
a3 = _mm512_xor_si512( a3, a2 ); \
t = _mm512_xor_si512( t, a1 ); \
a2 = _mm512_and_si512( a2, a1 ); \
a1 = mm512_xnor( a1, a0 ); \
a1 = mm512_nxor( a1, a0 ); \
a0 = t; \
}
@@ -527,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m256i t = a0; \
a0 = mm256_xoror( a3, a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm256_xorand( a2, a3, t ); \
a2 = mm256_xorand( a1, a2, a0); \
a1 = _mm256_or_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
t = _mm256_xor_si256( t, a1 ); \
a2 = _mm256_and_si256( a2, a1 ); \
a1 = mm256_xnor( a1, a0 ); \
a1 = mm256_nxor( a1, a0 ); \
a0 = t; \
}

View File

@@ -69,18 +69,18 @@
v128_t t = a0; \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = v128_xnor( a1, a0 ); \
a1 = v128_nxor( a1, a0 ); \
a0 = t; \
}
#else
#elif defined(__ARM_NEON) || defined(__SSE2__)
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

View File

@@ -67,7 +67,7 @@ static const uint64_t K512[80] =
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
#if defined(__AVX2__) && defined(__SHA512__)
#if defined(__AVX__) && defined(__SHA512__)
// SHA-512 implemented using SHA512 CPU extension.

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__SHA512__) && defined(__AVX2__)
#if defined(__SHA512__) && defined(__AVX__)
// Experimental, untested
// Need to substitute for sph_sha512

View File

@@ -305,7 +305,7 @@ do { \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
xb0 = mm512_nxor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_16 do { \
@@ -898,7 +898,7 @@ do { \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
xb0 = mm256_nxor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_8 do { \

View File

@@ -109,7 +109,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
mm256_aesenc_2x128( k00, zero ) ) );

View File

@@ -21,7 +21,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
__m512i *H = (__m512i*)ctx->h;
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
ctx->count1, ctx->count0 );
int r;
const __m512i zero = _mm512_setzero_si512();
P0 = H[0];
P1 = H[1];
@@ -37,182 +37,160 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
K6 = M[6];
K7 = M[7];
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
// round 0
P0 = _mm512_xor_si512( P0, X );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
P2 = _mm512_xor_si512( P2, X );
// round
for ( r = 0; r < 3; r ++ )
for ( int r = 0; r < 3; r ++ )
{
// round 1, 5, 9
// round 1, 5, 9
K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ) );
_mm512_aesenc_epi128( K0, zero ) ) );
if ( r == 0 )
K0 = _mm512_xor_si512( K0,
_mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
_mm512_mask_ternarylogic_epi32( count, 0x8888, count, count, 1 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
K1 = _mm512_xor_si512( K0,
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, zero ) ) );
if ( r == 1 )
K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
_mm512_mask_ternarylogic_epi32( count, 0x1111, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K1,
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K2,
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, zero ) ) );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
K4 = _mm512_xor_si512( K3,
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
K5 = _mm512_xor_si512( K4,
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K5,
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K6,
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, zero ) ) );
if ( r == 2 )
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
_mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
_mm512_mask_ternarylogic_epi32( count, 0x2222, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
// round 2, 6, 10
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), zero );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P2 );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), zero );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P0 );
// round 3, 7, 11
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
_mm512_aesenc_epi128( K0, zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), zero );
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
_mm512_aesenc_epi128( K1, zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
_mm512_aesenc_epi128( K2, zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
_mm512_aesenc_epi128( K3, zero ) ), K2 );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P1 );
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
_mm512_aesenc_epi128( K4, zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), zero );
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
_mm512_aesenc_epi128( K5, zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
_mm512_aesenc_epi128( K6, zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
_mm512_aesenc_epi128( K7, zero ) ), K6 );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P3 );
// round 4, 8, 12
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
}
// round 13
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
_mm512_aesenc_epi128( K0, zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
_mm512_aesenc_epi128( K1, zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
_mm512_aesenc_epi128( K2, zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
_mm512_aesenc_epi128( K3, zero ) ), K2 );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
_mm512_aesenc_epi128( K4, zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, mm512_swap64_32(
_mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
_mm512_aesenc_epi128( K5, zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) );
K6 = mm512_xor3( K6, K5, mm512_swap64_32(
_mm512_mask_ternarylogic_epi32( count, 0x4444, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7= _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
_mm512_aesenc_epi128( K7, zero ) ), K6 );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
H[0] = _mm512_xor_si512( H[0], P2 );
H[1] = _mm512_xor_si512( H[1], P3 );

View File

@@ -1,159 +0,0 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "sph_shavite.h"
extern void inkhash(void *state, const void *input)
{
    // Shavite coin (INK) proof-of-work hash: two chained SHAvite-512
    // passes over the 80-byte block header, truncated to 256 bits.
    //
    // state : output buffer, receives the first 32 bytes of the digest.
    // input : 80-byte big-endian block header.
    sph_shavite512_context ctx_shavite;
    uint32_t hash[16];

    // First pass: SHAvite-512 over the 80-byte header.
    sph_shavite512_init(&ctx_shavite);
    sph_shavite512(&ctx_shavite, (const void*) input, 80);
    sph_shavite512_close(&ctx_shavite, (void*) hash);

    // Second pass: SHAvite-512 over the 64-byte digest of the first pass.
    sph_shavite512_init(&ctx_shavite);
    sph_shavite512(&ctx_shavite, (const void*) hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*) hash);

    // Only the low 256 bits of the final digest are the PoW hash.
    memcpy(state, hash, 32);
}
int scanhash_ink( struct work *work,
   uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
   // Scan nonces [first_nonce, max_nonce] for a hash meeting the target.
   // Returns 1 (solution found, nonce stored in pdata[19]) or 0.
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   int thr_id = mythr->id;
   uint32_t n = pdata[19] - 1;
   const uint32_t first_nonce = pdata[19];
   uint32_t _ALIGN(32) hash64[8];
   uint32_t endiandata[32];
   uint32_t mask;

   // The hash needs big-endian data. Do NOT byte-swap pdata in place:
   // that causes every submitted share to be rejected as stale.
   for ( int kk = 0; kk < 32; kk++ )
      be32enc( &endiandata[kk], pdata[kk] );

   // Precompute an early-rejection mask from the high target word so the
   // common case is a single AND instead of a full target comparison.
   // This replaces six copy-pasted scan loops that differed only in this
   // constant; mask == 0 makes the pre-test always pass, matching the
   // original unconditional fulltest() branch.
   if      ( ptarget[7] == 0 )      mask = 0xFFFFFFFF;
   else if ( ptarget[7] <= 0xF )    mask = 0xFFFFFFF0;
   else if ( ptarget[7] <= 0xFF )   mask = 0xFFFFFF00;
   else if ( ptarget[7] <= 0xFFF )  mask = 0xFFFFF000;
   else if ( ptarget[7] <= 0xFFFF ) mask = 0xFFFF0000;
   else                             mask = 0;

   do
   {
      pdata[19] = ++n;
      be32enc( &endiandata[19], n );
      inkhash( hash64, endiandata );
      if ( ( ( hash64[7] & mask ) == 0 ) && fulltest( hash64, ptarget ) )
      {
         *hashes_done = n - first_nonce + 1;
         return 1;   // was "return true" from an int function
      }
   } while ( n < max_nonce && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
   return 0;
}
// Gate registration stub for the shavite algo. The algo is currently
// disabled: it reports "not implemented" and returns false so the miner
// refuses to start with it. The commented-out lines show the gate hooks
// that would enable it (scanhash_ink / inkhash above).
bool register_shavite_algo( algo_gate_t* gate )
{
// Always reports failure; gate hooks are intentionally left unset.
algo_not_implemented();
return false;
// gate->scanhash = (void*)&scanhash_ink;
// gate->hash = (void*)&inkhash;
// return true;
};

View File

@@ -50,7 +50,8 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
static const sph_u32 IV512[] = {
static const sph_u32 IV512[] =
{
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
@@ -71,38 +72,26 @@ c512( sph_shavite_big_context *sc, const void *msg )
p2 = h[2];
p3 = h[3];
// round
k00 = m[0];
x = v128_xor( p1, k00 );
x = v128_aesenc_nokey( x );
k01 = m[1];
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = m[2];
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = m[3];
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
k10 = m[4];
x = v128_xor( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = m[5];
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = m[6];
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = m[7];
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x );
// round 0
x = v128_xoraesenc( p1, k00 );
x = v128_xoraesenc( x, k01 );
x = v128_xoraesenc( x, k02 );
p0 = v128_xoraesencxor( x, k03, p0 );
x = v128_xoraesenc( p3, k10 );
x = v128_xoraesenc( x, k11 );
x = v128_xoraesenc( x, k12 );
p2 = v128_xoraesencxor( x, k13, p2 );
for ( r = 0; r < 3; r ++ )
{
@@ -113,198 +102,165 @@ c512( sph_shavite_big_context *sc, const void *msg )
if ( r == 0 )
k00 = v128_xor( k00, v128_set32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = v128_xoraesenc( p0, k00 );
x = v128_xor( p0, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
if ( r == 1 )
k01 = v128_xor( k01, v128_set32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = v128_xoraesenc( x, k01 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k03, p3 );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xoraesenc( p2, k10 );
x = v128_xor( p2, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
if ( r == 2 )
k13 = v128_xor( k13, v128_set32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k13, p1 );
// round 2, 6, 10
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p3, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p3, k00 );
p2 = v128_xor( p2, x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p2 = v128_xoraesencxor( x, k03, p2 );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p1, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p1, k10 );
p0 = v128_xor( p0, x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p0 = v128_xoraesencxor( x, k13, p0 );
// round 3, 7, 11
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p2, k00 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p2, k00 );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k01 );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k03, p1 );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p0, k10 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p0, k10 );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k13, p3 );
// round 4, 8, 12
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p1, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p1, k00 );
p0 = v128_xor( p0, x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p0 = v128_xoraesencxor( x, k03, p0 );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p3, k10 );
p2 = v128_xor( p2, x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p2 = v128_xoraesencxor( x, k13, p2 );
}
// round 13
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p0, k00 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p0, k00 );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k01 );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k03, p3 );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p2, k10 );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, v128_xor( k11, v128_set32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k13, p1 );
h[0] = v128_xor( h[0], p2 );
h[1] = v128_xor( h[1], p3 );

View File

@@ -171,6 +171,53 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
{{ -30, 55, -58, -65, -95, -40, -98, 94 }},
};
#if defined(__AVX2__)
static const __m256i V256_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff,
0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#define V128_00FF _mm256_castsi256_si128( V256_00FF )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#endif
#if defined(SIMD512)
static const __m512i V512_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V256_0101 _mm512_castsi512_si256( V512_0101 )
#define V128_0101 _mm512_castsi512_si128( V512_0101 )
static const __m512i V512_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V256_0080 _mm512_castsi512_si256( V512_0080 )
#define V128_0080 _mm512_castsi512_si128( V512_0080 )
#elif defined(__AVX2__)
static const __m256i V256_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V128_0101 _mm256_castsi256_si128( V256_0101 )
static const __m256i V256_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V128_0080 _mm256_castsi256_si128( V256_0080 )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_0101 = { 0x0101010101010101, 0x0101010101010101 };
static const v128u64_t V128_0080 = { 0x0080008000800080, 0x0080008000800080 };
#endif
#if defined(__x86_64__)
#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
@@ -190,13 +237,10 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
#define shufxor(x,s) XCAT(SHUFXOR_,s)(x)
#define REDUCE(x) \
v128_sub16( v128_and( x, v128_64( \
0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
v128_sub16( v128_and( x, V128_00FF ), v128_sra16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\
v128_sub16( x, v128_and( \
v128_64( 0x0101010101010101 ), \
v128_cmpgt16( x, v128_64( 0x0080008000800080 ) ) ) )
v128_sub16( x, v128_and( V128_0101, v128_cmpgt16( x, V128_0080 ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -293,10 +337,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
v128u16_t t1= X(i); \
v128u16_t t2= X(j); \
X(i) = v128_unpacklo16( t1, t2 ); \
X(j) = v128_unpackhi16( t1, t2 ); \
v128u16_t t = X(i); \
X(i) = v128_unpacklo16( t, X(j) ); \
X(j) = v128_unpackhi16( t, X(j) ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -803,23 +846,12 @@ static const m256_v16 FFT256_Twiddle[] =
#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
#if defined(VL256)
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
_mm256_srai_epi16( x, 8 ) )
#else
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
#endif
_mm256_sub_epi16( _mm256_and_si256( x, V256_00FF ), _mm256_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \
_mm256_set1_epi64x( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
_mm256_sub_epi16( x, _mm256_and_si256( V256_0101, \
_mm256_cmpgt_epi16( x, V256_0080 ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -917,10 +949,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
__m256i t1= X(i); \
__m256i t2= X(j); \
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
__m256i t = X(i); \
X(i) = _mm256_unpacklo_epi16( t, X(j) ); \
X(j) = _mm256_unpackhi_epi16( t, X(j) ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -1658,10 +1689,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
_mm512_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
_mm512_set1_epi64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
_mm512_sub_epi16( x, _mm512_and_si512( V512_0101, \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( x, V512_0080 ) ) ) )
// generic, except it calls targetted macros
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )

View File

@@ -640,24 +640,25 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#if defined(__AVX2__)
__m256i F0, F1, F2, F3, F4, F5, F6, F7;
__m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] );
__m256i *table = (__m256i*)fftTable;
__m256i tbl = table[ input[0] ];
__m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output;
F0 = _mm256_mullo_epi32( mul[0], tbl );
tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] );
tbl = table[ input[1] ];
F1 = _mm256_mullo_epi32( mul[1], tbl );
tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] );
tbl = table[ input[2] ];
F2 = _mm256_mullo_epi32( mul[2], tbl );
tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] );
tbl = table[ input[3] ];
F3 = _mm256_mullo_epi32( mul[3], tbl );
tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] );
tbl = table[ input[4] ];
F4 = _mm256_mullo_epi32( mul[4], tbl );
tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] );
tbl = table[ input[5] ];
F5 = _mm256_mullo_epi32( mul[5], tbl );
tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] );
tbl = table[ input[6] ];
F6 = _mm256_mullo_epi32( mul[6], tbl );
tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] );
tbl = table[ input[7] ];
F7 = _mm256_mullo_epi32( mul[7], tbl );
#define ADD_SUB( a, b ) \
@@ -677,9 +678,9 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F1, F3 );
ADD_SUB( F4, F6 );
ADD_SUB( F5, F7 );
F5 = _mm256_slli_epi32( F5, 2 );
F6 = _mm256_slli_epi32( F6, 4 );
F7 = _mm256_slli_epi32( F7, 6 );
F5 = _mm256_slli_epi32( F5, 2 );
ADD_SUB( F0, F4 );
ADD_SUB( F1, F5 );
ADD_SUB( F2, F6 );

View File

@@ -4,11 +4,11 @@
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
rm cpuminer cpuminer-m2 cpuminer-m4 cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto > /dev/null
# armv9 needs gcc-13
# -march-armv9-a includes SVE2 but no crypto
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512
# -march=armv9-a+crypto adds AES & SHA256 but not SHA512
make distclean || echo clean
rm -f config.status
@@ -27,18 +27,37 @@ CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-c
make -j $(nproc)
mv cpuminer cpuminer-armv9
# Apple M4: armv9.2, AES, SHA3, SVE2
make clean || echo clean
CFLAGS="-O3 -march=armv9.2-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m4
# Apple M2: armv8.6, AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.6-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m2
# SVE2 available in armv8.5
make clean || echo clean
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4
# Apple M1: armv8.4 AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.4-crypto-sha3
# Cortex-A76 (Orange Pi 5)
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8.2-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8-crypto
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl

View File

@@ -34,9 +34,7 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Granitrapids does not build with AVX10, SHA512 or APX.
# wait for Diamondrapids & gcc-15.
# Intel Core Graniterapids: AVX512, SHA256, VAES, AMX, needs gcc-14
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
@@ -44,13 +42,13 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-graniterapids
# SHA512 AVX10.1
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-avx10_1
# Graniterapids + SHA512, AVX10.1
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx10.1
# SHA512 AVX10.2
#make clean || echo clean
@@ -72,6 +70,9 @@ mv cpuminer cpuminer-graniterapids
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
# zen4 is close enough for older compiler
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen5
@@ -138,13 +139,21 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
# SSE4.2 AES SHA: Intel Atom Goldmont, newer Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=goldmont -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse42-aes-sha
# SSE4.2 AES: Intel Westmere, older Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-aes-sse42
mv cpuminer cpuminer-sse42-aes
# SSE4.2: Intel Nehalem
make clean || echo clean

View File

@@ -2,8 +2,8 @@
#
# make clean and rm all the targeted executables.
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen* cpuminer-x64 cpuminer-armv* > /dev/null
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512* cpuminer-alderlake cpuminer-avx10* cpuminer-avx2* cpuminer-avx cpuminer-sse* cpuminer-ssse3 cpuminer-zen* cpuminer-x64 cpuminer-armv* cpuminer-m2 cpuminer-m4 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null
rm cpuminer-avx512* cpuminer-avx2* cpuminer-avx.exe cpuminer-sse* cpuminer-zen* cpuminer-x64.exe > /dev/null
make distclean > /dev/null

View File

@@ -108,7 +108,24 @@ extern "C"{
} while (0)
#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
{ \
(Y0) = AES0[(X0) & 0xFF] \
^ AES1[((X1) >> 8) & 0xFF] \
^ AES2[((X2) >> 16) & 0xFF] \
^ AES3[((X3) >> 24) & 0xFF]; \
(Y1) = AES0[(X1) & 0xFF] \
^ AES1[((X2) >> 8) & 0xFF] \
^ AES2[((X3) >> 16) & 0xFF] \
^ AES3[((X0) >> 24) & 0xFF]; \
(Y2) = AES0[(X2) & 0xFF] \
^ AES1[((X3) >> 8) & 0xFF] \
^ AES2[((X0) >> 16) & 0xFF] \
^ AES3[((X1) >> 24) & 0xFF]; \
(Y3) = AES0[(X3) & 0xFF] \
^ AES1[((X0) >> 8) & 0xFF] \
^ AES2[((X1) >> 16) & 0xFF] \
^ AES3[((X2) >> 24) & 0xFF]; \
}
#endif

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.5.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.7.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.5'
PACKAGE_STRING='cpuminer-opt 25.5'
PACKAGE_VERSION='25.7'
PACKAGE_STRING='cpuminer-opt 25.7'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1431,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
esac
cat <<\_ACEOF
@@ -1536,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.5
cpuminer-opt configure 25.7
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.5, which was
It was created by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3591,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.5'
VERSION='25.7'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.5, which was
This file was extended by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.5
cpuminer-opt config.status 25.7
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [25.5])
AC_INIT([cpuminer-opt], [25.7])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.5.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.7.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.5'
PACKAGE_STRING='cpuminer-opt 25.5'
PACKAGE_VERSION='25.7'
PACKAGE_STRING='cpuminer-opt 25.7'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
'configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems.
'configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1424,7 +1424,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
esac
cat <<\_ACEOF
@@ -1528,7 +1528,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.5
cpuminer-opt configure 25.7
generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.5, which was
It was created by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3065,7 +3065,7 @@ ac_config_headers="$ac_config_headers cpuminer-config.h"
am__api_version='1.17'
am__api_version='1.18'
# Find a good install program. We prefer a C program (faster),
@@ -3334,10 +3334,14 @@ am_lf='
'
case `pwd` in
*[\\\"\#\$\&\'\`$am_lf]*)
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;;
esac
case $srcdir in
*[\\\"\#\$\&\'\`$am_lf\ \ ]*)
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;;
esac
@@ -3764,7 +3768,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.5'
VERSION='25.7'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -3802,9 +3806,133 @@ AMTAR='$${TAR-tar}'
# We'll loop over all known methods to create a tar archive until one works.
_am_tools='gnutar pax cpio none'
_am_tools='gnutar plaintar pax cpio none'
am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
# The POSIX 1988 'ustar' format is defined with fixed-size fields.
# There is notably a 21 bits limit for the UID and the GID. In fact,
# the 'pax' utility can hang on bigger UID/GID (see automake bug#8343
# and bug#13588).
am_max_uid=2097151 # 2^21 - 1
am_max_gid=$am_max_uid
# The $UID and $GID variables are not portable, so we need to resort
# to the POSIX-mandated id(1) utility. Errors in the 'id' calls
# below are definitely unexpected, so allow the users to see them
# (that is, avoid stderr redirection).
am_uid=`id -u || echo unknown`
am_gid=`id -g || echo unknown`
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether UID '$am_uid' is supported by ustar format" >&5
printf %s "checking whether UID '$am_uid' is supported by ustar format... " >&6; }
if test x$am_uid = xunknown; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&5
printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&2;}
elif test $am_uid -le $am_max_uid; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
_am_tools=none
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GID '$am_gid' is supported by ustar format" >&5
printf %s "checking whether GID '$am_gid' is supported by ustar format... " >&6; }
if test x$gm_gid = xunknown; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&5
printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&2;}
elif test $am_gid -le $am_max_gid; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
_am_tools=none
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking how to create a ustar tar archive" >&5
printf %s "checking how to create a ustar tar archive... " >&6; }
# Go ahead even if we have the value already cached. We do so because we
# need to set the values for the 'am__tar' and 'am__untar' variables.
_am_tools=${am_cv_prog_tar_ustar-$_am_tools}
for _am_tool in $_am_tools; do
case $_am_tool in
gnutar)
for _am_tar in tar gnutar gtar; do
{ echo "$as_me:$LINENO: $_am_tar --version" >&5
($_am_tar --version) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); } && break
done
am__tar="$_am_tar --format=ustar -chf - "'"$$tardir"'
am__tar_="$_am_tar --format=ustar -chf - "'"$tardir"'
am__untar="$_am_tar -xf -"
;;
plaintar)
# Must skip GNU tar: if it does not support --format= it doesn't create
# ustar tarball either.
(tar --version) >/dev/null 2>&1 && continue
am__tar='tar chf - "$$tardir"'
am__tar_='tar chf - "$tardir"'
am__untar='tar xf -'
;;
pax)
am__tar='pax -L -x ustar -w "$$tardir"'
am__tar_='pax -L -x ustar -w "$tardir"'
am__untar='pax -r'
;;
cpio)
am__tar='find "$$tardir" -print | cpio -o -H ustar -L'
am__tar_='find "$tardir" -print | cpio -o -H ustar -L'
am__untar='cpio -i -H ustar -d'
;;
none)
am__tar=false
am__tar_=false
am__untar=false
;;
esac
# If the value was cached, stop now. We just wanted to have am__tar
# and am__untar set.
test -n "${am_cv_prog_tar_ustar}" && break
# tar/untar a dummy directory, and stop if the command works.
rm -rf conftest.dir
mkdir conftest.dir
echo GrepMe > conftest.dir/file
{ echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5
(tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
rm -rf conftest.dir
if test -s conftest.tar; then
{ echo "$as_me:$LINENO: $am__untar <conftest.tar" >&5
($am__untar <conftest.tar) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
{ echo "$as_me:$LINENO: cat conftest.dir/file" >&5
(cat conftest.dir/file) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
fi
done
rm -rf conftest.dir
if test ${am_cv_prog_tar_ustar+y}
then :
printf %s "(cached) " >&6
else case e in #(
e) am_cv_prog_tar_ustar=$_am_tool ;;
esac
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_tar_ustar" >&5
printf "%s\n" "$am_cv_prog_tar_ustar" >&6; }
@@ -4986,7 +5114,10 @@ _ACEOF
break
fi
done
rm -f core conftest*
# aligned with autoconf, so not including core; see bug#72225.
rm -f -r a.out a.exe b.out conftest.$ac_ext conftest.$ac_objext \
conftest.dSYM conftest1.$ac_ext conftest1.$ac_objext conftest1.dSYM \
conftest2.$ac_ext conftest2.$ac_objext conftest2.dSYM
unset am_i ;;
esac
fi
@@ -7450,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.5, which was
This file was extended by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7518,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.5
cpuminer-opt config.status 25.7
configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\"

View File

@@ -921,39 +921,32 @@ out:
return rc;
}
// Scale a hashrate to a human-readable magnitude and report the matching
// metric prefix character: 0 (none), then 'k', 'M', 'G', 'T', 'P', 'E',
// 'Z', 'Y'. Factors of 1000 are used, but scaling only begins at 1e4 so
// small rates keep full precision. Both arguments are updated in place.
void scale_hash_for_display ( double* hashrate, char* prefix )
{
   static const char units[8] = { 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y' };
   double limit = 1e4;    // lowest rate that gets scaled at all
   double divisor = 1e3;
   int i;

   if ( *hashrate < limit )
   {
      *prefix = 0;     // small enough to display unscaled
      return;
   }
   // Each step up covers three decimal orders of magnitude. Stop at the
   // first range that holds the rate; falling out of the loop selects 'Y'.
   for ( i = 0; i < 7; i++, limit *= 1e3, divisor *= 1e3 )
      if ( *hashrate < limit * 1e3 ) break;
   *prefix = units[i];
   *hashrate /= divisor;
}
// Does not account for leap years.
static inline void sprintf_et( char *str, long unsigned int seconds )
{
long unsigned int min = seconds / 60;
long unsigned int sec = seconds % 60;
long unsigned int hrs = min / 60;
if ( unlikely( hrs ) )
long unsigned int minutes = seconds / 60;
if ( minutes )
{
long unsigned int days = hrs / 24;
long unsigned int years = days / 365;
if ( years ) // 0y000d
sprintf( str, "%luy%lud", years, years % 365 );
else if ( days ) // 0d00h
sprintf( str, "%lud%02luh", days, hrs % 24 );
else // 0h00m
sprintf( str, "%luh%02lum", hrs, min % 60 );
long unsigned int hours = minutes / 60;
if ( hours )
{
long unsigned int days = hours / 24;
if ( days )
{
long unsigned int years = days / 365;
if ( years )
sprintf( str, "%luy%03lud", years, days % 365 ); // 0y000d
else
sprintf( str, "%lud%02luh", days, hours % 24 ); // 0d00h
}
else
sprintf( str, "%luh%02lum", hours, minutes % 60 ); // 0h00m
}
else
sprintf( str, "%lum%02lus", minutes, seconds % 60 ); // 0m00s
}
else // 0m00s
sprintf( str, "%lum%02lus", min, sec );
else
sprintf( str, "%lus", seconds ); // 0s
}
const long double exp32 = EXP32; // 2**32
@@ -2833,67 +2826,29 @@ static void show_credits()
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_neon = has_neon(); // AArch64
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
bool cpu_has_sve2 = has_sve2(); // AArch64 only
bool cpu_has_sme = has_sme();
bool cpu_has_sme2 = has_sme2();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
bool cpu_has_vaes = has_vaes(); // X86_64 only
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64 version
bool sw_has_neon = false; // AArch64
bool sw_has_sve = false; // AArch64
bool sw_has_sve2 = false; // AArch64
bool sw_has_sve = false;
bool sw_has_sve2 = false;
bool sw_has_sme = false;
bool sw_has_sme2 = false;
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_ssse3 = false;
bool sw_has_sse41 = false;
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_avx10 = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_amx = false;
bool sw_has_apx = false;
bool sw_has_aes = false; // x86_64 or AArch64
bool sw_has_vaes = false; // x86_64
bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_aes;
bool use_vaes;
bool use_sha256;
bool use_sha512;
bool use_neon;
bool use_none;
*/
bool sw_has_sha512 = false;
#if defined(__x86_64__)
sw_has_x86_64 = true;
#elif defined(__aarch64__)
@@ -2928,14 +2883,15 @@ static bool cpu_capability( bool display_only )
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true;
#endif
// AVX10 version is not significant as of AVX10.2. If that changes use a better
// way to test the version than sequentially.
// #if defined(__AVX10_2__)
//
// #elif defined(__AVX10_1__)
#if defined(__AVX10_1__)
#if defined(__AVX10_1__) // version is not significant
sw_has_avx10 = true;
#endif
#ifdef __AMX_TILE__
sw_has_amx = true;
#endif
#ifdef __APX_F__
sw_has_apx = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
@@ -2955,6 +2911,7 @@ static bool cpu_capability( bool display_only )
#if defined(__ARM_NEON)
sw_has_neon = true;
#endif
// FYI, SVE & SME not used by cpuminer
#if defined(__ARM_FEATURE_SVE)
sw_has_sve = true;
#endif
@@ -2975,8 +2932,7 @@ static bool cpu_capability( bool display_only )
// Build
printf( "SW built on " __DATE__
#if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
__clang_patchlevel__ );
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__ );
#elif defined(__GNUC__)
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif
@@ -3002,27 +2958,30 @@ static bool cpu_capability( bool display_only )
printf("CPU features: ");
if ( cpu_arch_x86_64() )
{
if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( has_avx10() ) printf( " AVX10.%d", avx10_version() );
else if ( has_avx512() ) printf( " AVX512" );
else if ( has_avx2() ) printf( " AVX2 " );
else if ( has_avx() ) printf( " AVX " );
else if ( has_sse42() ) printf( " SSE4.2" );
else if ( has_sse41() ) printf( " SSE4.1" );
else if ( has_ssse3() ) printf( " SSSE3 " );
else if ( has_sse2() ) printf( " SSE2 " );
if ( has_amx() ) printf( " AMX" );
if ( has_apx_f() ) printf( " APX" );
}
else if ( cpu_arch_aarch64() )
{
if ( cpu_has_neon ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
if ( has_neon() ) printf( " NEON" );
if ( has_sve2() ) printf( " SVE2-%d", sve_vector_length() );
else if ( has_sve() ) printf( " SVE" );
if ( has_sme2() ) printf( " SME2" );
else if ( has_sme() ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( has_vaes() ) printf( " VAES" );
else if ( has_aes() ) printf( " AES" );
if ( has_sha512() ) printf( " SHA512" );
else if ( has_sha256() ) printf( " SHA256" );
printf("\nSW features: ");
if ( sw_has_x86_64 )
@@ -3035,6 +2994,8 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_amx ) printf( " AMX" );
if ( sw_has_apx ) printf( " APX" );
}
else if ( sw_has_aarch64 )
{

View File

@@ -542,7 +542,9 @@ void applog_hash(void *hash);
void format_hashrate(double hashrate, char *output);
void print_hash_tests(void);
// Factors of 1000 used for hashes, ie kH/s, Mh/s.
void scale_hash_for_display ( double* hashrate, char* units );
// Factors of 1024 used for bytes, ie kiB, MiB.
void format_number_si( double* hashrate, char* si_units );
void report_summary_log( bool force );
@@ -582,6 +584,8 @@ enum algos {
ALGO_ANIME,
ALGO_ARGON2D250,
ALGO_ARGON2D500,
ALGO_ARGON2D1000,
ALGO_ARGON2D16000,
ALGO_ARGON2D4096,
ALGO_AXIOM,
ALGO_BLAKE,
@@ -677,6 +681,8 @@ static const char* const algo_names[] = {
"anime",
"argon2d250",
"argon2d500",
"argon2d1000",
"argon2d16000",
"argon2d4096",
"axiom",
"blake",
@@ -837,6 +843,8 @@ Options:\n\
anime Animecoin (ANI)\n\
argon2d250\n\
argon2d500\n\
argon2d1000\n\
argon2d16000\n\
argon2d4096\n\
axiom Shabal-256 MemoHash\n\
blake blake256r14 (SFR)\n\

View File

@@ -137,10 +137,24 @@
#define v128_unpackhi8 _mm_unpackhi_epi8
// AES
// Nokey means nothing on x86_64 but it saves an instruction and a register
// on ARM.
#define v128_aesenc _mm_aesenc_si128
// xor key with result after encryption, x86_64 format.
#define v128_aesencxor _mm_aesenc_si128
// default is x86_64 format.
#define v128_aesenc v128_aesencxor
// xor key with v before encryption, arm64 format.
#define v128_xoraesenc( v, k ) \
_mm_aesenc_si128( v128_xor( v, k ), v128_zero )
// xor v with k_in before encryption then xor the result with k_out afterward.
// Uses the applicable optimization based on the target.
#define v128_xoraesencxor( v, k_in, k_out ) \
_mm_aesenc_si128( v128_xor( v, k_in ), k_out )
// arm64 optimized
#define v128_aesenc_nokey(v) _mm_aesenc_si128( v, v128_zero )
#define v128_aesenclast _mm_aesenclast_si128
#define v128_aesenclast_nokey(v) _mm_aesenclast_si128( v, v128_zero )
#define v128_aesdec _mm_aesdec_si128
@@ -433,7 +447,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#define v128_nxor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -455,7 +469,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#define v128_nxor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#endif

View File

@@ -170,7 +170,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#define mm256_nxor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -208,7 +208,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \
#define mm256_nxor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )
#endif

View File

@@ -18,8 +18,13 @@
// AVX512 intrinsics have a few changes from previous conventions.
//
// "_mm512_cmp" instructions now returns a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction.
// "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction. It is also
// slower than AVX2 cmp. There is no version of AVX512 cmp that returns a
// vector result resulting in a double penalty if a vector result is needed:
// slower cmp instruction & extra instruction to convert bit mask into
// vector mask. 256 bit & 128 bit still have legacy cmp returning vector
// while AVX512VL adds masked versions returning bit mask.
//
// Many previously sizeless (si) instructions now have sized (epi) versions
// to accommodate masking packed elements.
@@ -30,7 +35,7 @@
// list.
//
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
// AVX512 permutes can cross all lanes.
// AVX512 instructions using the permute name can cross all lanes.
//
// New alignr instructions for epi64 and epi32 operate across the entire
// vector but slower than epi8 which continues to be restricted to 128 bit
@@ -50,16 +55,17 @@
// parentheses to ensure the expression argument is evaluated first.
// - if an argument is to referenced multiple times a C inline function
// should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times (wasteful) or produces side
// from being evaluated multiple times (wasteful) or produce side
// effects (very bad).
//
// There are 2 areas where overhead is a major concern: constants and
// permutations.
//
// Constants need to be composed at run time by assembling individual
// elements, very expensive. The cost is proportional to the number of
// different elements therefore use the largest element size possible,
// merge smaller integer elements to 64 bits, and group repeated elements.
// elements or loaded from memory, very expensive. The cost of runtime
// construction is proportional to the number of different elements
// therefore use the largest element size possible merging smaller integer
// elements to 64 bits, and group repeated elements.
//
// Constants with repeating patterns can be optimized with the smaller
// patterns repeated more frequently being more efficient.
@@ -67,14 +73,15 @@
// Some specific constants can be very efficient. Zero is very efficient,
// 1 and -1 slightly less so.
//
// If an expensive constant is to be reused in the same function it should
// be declared as a local variable defined once and reused.
// If an expensive constant is to be reused in the same function it may
// be declared as a local variable defined once and reused. If frequently
// used it can be declared as a static const in memory.
//
// Permutations can be very expensive if they use a vector control index,
// even if the permutation itself is quite efficient.
// The index is essentially a constant with all the baggage that brings.
// The same rules apply, if an index is to be reused it should be defined
// as a local. This applies specifically to bswap operations.
// even if the permute instruction itself is quite efficient.
// The index is essentially a vector constant with all the baggage that
// brings. The same rules apply, if an index is to be reused it should either
// be defined as a local or static const.
//
// Permutations that cross 128 bit lanes are typically slower and often need
// a vector control index. If the permutation doesn't need to cross 128 bit
@@ -221,7 +228,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
#define mm512_nxor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b )
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
@@ -241,6 +248,15 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_ror_16( v, c ) _mm512_shrdi_epi16( c, v, v )
#define mm512_rol_16( v, c ) _mm512_shldi_epi16( c, v, v )
#endif
*/
//
// Reverse byte order of packed elements, vectorized endian conversion.
@@ -249,9 +265,17 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_bswap_16( v ) mm512_ror_16( v, 8 )
#else
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
#endif
*/
#define mm512_bswap_16( v ) \
@@ -431,8 +455,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
_mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.
// Redundant with ror & rol but included for consistency with AVX2/SSE.
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32 mm512_qrev32 // grandfathered

View File

@@ -126,7 +126,7 @@
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
// ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
#define v128_nxor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, args reversed for consistency with x86_64
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
@@ -187,9 +187,21 @@
// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
// AES
// consistent with Intel AES intrinsics, break up for optimizing
#define v128_aesenc( v, k ) \
v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) )
// xor key with result after encryption, x86_64 format.
#define v128_aesencxor( v, k ) \
v128_xor( vaesmcq_u8( vaeseq_u8( v, v128_zero ) ), k )
// default is x86_64 format.
#define v128_aesenc v128_aesencxor
// xor key with v before encryption, arm64 format.
#define v128_xoraesenc( v, k ) \
vaesmcq_u8( vaeseq_u8( v, k ) )
// xor v with k_in before encryption then xor the result with k_out afterward.
// Uses the applicable optimization based on the target.
#define v128_xoraesencxor( v, k_in, k_out ) \
v128_xor( v128_xoraesenc( v, k_in ), k_out )
#define v128_aesenc_nokey( v ) \
vaesmcq_u8( vaeseq_u8( v, v128_zero ) )
@@ -337,52 +349,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Bit rotation
/*
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
*/
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \

View File

@@ -474,6 +474,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
}
/*
// ARM feature compiler flags
#ifdef __aarch64__
#warning "__aarch64__"
#endif
@@ -509,16 +510,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#endif
*/
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
// omitted 256 is the default.
// Ex: AVX10.1_512
// Flags:
// AVX10 128 256 512
// 0 0 0 0 = AVX10 not supported
// 1 1 1 0 = AVX10 256 bit max (version 2)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
static inline bool cpu_arch_x86_64()
{
#if defined(__x86_64__)
@@ -766,6 +757,17 @@ static inline bool has_vbmi2()
#endif
}
// Detect Intel AMX tile support.
// x86_64: queries CPUID extended-features leaf (subleaf 0) and tests the
// AMX_TILE_Flag bit in EDX. On all other architectures AMX does not exist,
// so the function unconditionally reports false.
static inline bool has_amx()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EDX_Reg ] & AMX_TILE_Flag;
#else
return false;
#endif
}
static inline bool has_aes()
{
#if defined(__x86_64__)
@@ -815,12 +817,9 @@ static inline bool has_sveaes()
static inline bool has_sha256()
{
#if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA2)
// NEON SHA256
@@ -835,7 +834,7 @@ static inline bool has_sha256()
static inline bool has_sha512()
{
#if defined(__x86_64__)
if ( has_avx2() )
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
@@ -852,7 +851,6 @@ static inline bool has_sha512()
#endif
}
// Arm only
static inline bool has_sha3()
{
#if defined(__aarch64__) && defined(HWCAP_SHA3)
@@ -944,16 +942,6 @@ static inline int sve_vector_length()
return 0;
}
// Assume min_vlen refers to the register size
// Report the RISC-V Vector (RVV) register length in bits, or 0 when the
// build target has no vector extension.
// NOTE(review): __riscv_v_min_vlen is the compile-time *minimum* VLEN the
// code was built for; the hardware VLEN at run time may be larger — the
// comment above ("Assume min_vlen refers to the register size") reflects
// that assumption.
static inline int rvv_vector_length()
{
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}
// generic
static inline int vector_length()
{
#if defined(__x86_64__)
@@ -965,8 +953,8 @@ static inline int vector_length()
return has_sve() ? sve_vector_length()
: has_neon() ? 128
: 0;
#elif defined(__riscv) && defined(__riscv_vector)
return rvv_vector_length();
#elif defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}

49
util.c
View File

@@ -304,39 +304,28 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0)
free(cmd);
}
void format_hashrate(double hashrate, char *output)
// Decimal SI, factors of 1000
void scale_hash_for_display ( double* hashrate, char* prefix )
{
char prefix = '\0';
if (hashrate < 10000) {
// nop
}
else if (hashrate < 1e7) {
prefix = 'k';
hashrate *= 1e-3;
}
else if (hashrate < 1e10) {
prefix = 'M';
hashrate *= 1e-6;
}
else if (hashrate < 1e13) {
prefix = 'G';
hashrate *= 1e-9;
}
else {
prefix = 'T';
hashrate *= 1e-12;
}
sprintf(
output,
prefix ? "%.2f %cH/s" : "%.2f H/s%c",
hashrate, prefix
);
if ( *hashrate < 1e4 ) *prefix = 0;
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
else { *prefix = 'Y'; *hashrate /= 1e24; }
}
// For use with MiB etc
// Format a raw hash rate (hashes/second) as a human-readable string such as
// "123.45 MH/s", scaling the value and selecting a decimal-SI prefix via
// scale_hash_for_display. When the rate is small enough that no prefix is
// chosen (prefix == '\0') the output is "N.NN H/s".
// NOTE(review): writes with unbounded sprintf — assumes 'output' is large
// enough for the longest formatted rate; confirm buffer size at call sites.
void format_hashrate( double hashrate, char *output )
{
char prefix = '\0';
scale_hash_for_display( &hashrate, &prefix );
sprintf( output, prefix ? "%.2f %cH/s" : "%.2f H/s%c", hashrate, prefix );
}
// Binary SI, factors of 1024
void format_number_si( double* n, char* si_units )
{
if ( *n < 1024*10 ) { *si_units = 0; return; }

View File

@@ -55,9 +55,9 @@ the current directory it will be created.
Data file creation can take up to 30 minutes on a spinning hard drive.
Once created the new data file will be verified and used immediately
if a valid url and user were included on the command line.
if a valid url and user were included on the command line.
A default data file can be created by omitting the url option. That will
A default data file can also be created by omitting the url option. That will
either verify an existing default data file or create one and verify it,
then exit.