From 12480a3ea5fe7c33df8184c1dce92507fdb33b2d Mon Sep 17 00:00:00 2001
From: Jay D Dee <jayddee246@gmail.com>
Date: Sun, 20 Jul 2025 19:43:10 -0400
Subject: [PATCH] v25.6

---
 README.md                        |   2 +-
 RELEASE_NOTES                    |   7 +-
 algo-gate-api.c                  |   2 +
 algo-gate-api.h                  |   5 +-
 algo/argon2d/argon2d-gate.c      | 171 ++++++++++++++----------
 algo/argon2d/argon2d-gate.h      |  15 ++-
 algo/shavite/shavite-hash-2way.c |   2 +-
 algo/shavite/shavite-hash-4way.c | 184 ++++++++++++--------------
 algo/shavite/shavite.c           | 159 ----------------------
 algo/shavite/sph-shavite-aesni.c | 220 +++++++++++++------------------
 compat/aes_helper.c              |  19 ++-
 configure                        |  20 +--
 configure.ac                     |   2 +-
 configure~                       | 159 ++++++++++++++++++++--
 miner.h                          |   6 +
 simd-utils/simd-128.h            |  20 ++-
 simd-utils/simd-neon.h           |  18 ++-
 17 files changed, 507 insertions(+), 504 deletions(-)
 delete mode 100644 algo/shavite/shavite.c

diff --git a/README.md b/README.md
index 3b4daaf..140c4e0 100644
--- a/README.md
+++ b/README.md
@@ -54,9 +54,9 @@ Supported Algorithms
 
                           allium        Garlicoin
                           anime         Animecoin
-                          argon2        Argon2 coin (AR2)
                           argon2d250    
                           argon2d500
+                          argon2d1000
                           argon2d4096
                           blake         Blake-256
                           blake2b       Blake2-512
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 1c189d3..ed04fe0 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -75,9 +75,14 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v25.6
+
+Added argon2d1000, argon2d16000 algos.
+Target-specific AES optimizations improve shavite for ARM64 & x86_64.
+
 v25.5
 
-x86_64: Fixed and insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
+x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
 x86_64: Faster sha256d, sha256dt for AVX2 & AVX512.
 Other small bug fixes.
 
diff --git a/algo-gate-api.c b/algo-gate-api.c
index 5b898f2..05f75de 100644
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -297,6 +297,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_ANIME:        rc = register_anime_algo         ( gate ); break;
     case ALGO_ARGON2D250:   rc = register_argon2d250_algo    ( gate ); break;
     case ALGO_ARGON2D500:   rc = register_argon2d500_algo    ( gate ); break;
+    case ALGO_ARGON2D1000:  rc = register_argon2d1000_algo   ( gate ); break;
+    case ALGO_ARGON2D16000: rc = register_argon2d16000_algo  ( gate ); break;
     case ALGO_ARGON2D4096:  rc = register_argon2d4096_algo   ( gate ); break;
     case ALGO_AXIOM:        rc = register_axiom_algo         ( gate ); break;
     case ALGO_BLAKE:        rc = register_blake_algo         ( gate ); break;
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 59abc51..594c828 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -172,8 +172,11 @@ void ( *set_work_data_endian )  ( struct work* );
 
 json_t* ( *longpoll_rpc_call )  ( CURL*, int*, char* );
 
+// Deprecated
 set_t optimizations;
+
 int  ( *get_work_data_size )     ();
+
 int  ntime_index;
 int  nbits_index;
 int  nonce_index;            // use with caution, see warning below
@@ -274,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
 
 void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
 void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
-// OpenSSL sha256 deprecated
-//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
 
 bool std_le_work_decode( struct work *work );
 bool std_be_work_decode( struct work *work );
diff --git a/algo/argon2d/argon2d-gate.c b/algo/argon2d/argon2d-gate.c
index 4278918..afdeef3 100644
--- a/algo/argon2d/argon2d-gate.c
+++ b/algo/argon2d/argon2d-gate.c
@@ -6,6 +6,38 @@ static const size_t INPUT_BYTES = 80;  // Lenth of a block header in bytes. Inpu
 static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
 static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
 
+// generic, works with most variations of argon2d; hashing is dispatched through algo_gate.hash
+int scanhash_argon2d( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(64) edata[20]; // working copy of the header, filled by v128_bswap32_80 below
+   uint32_t _ALIGN(64) hash[8]; // receives the hash of each candidate
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const int thr_id = mythr->id;
+   const uint32_t first_nonce = (const uint32_t)pdata[19]; // scan starts at the nonce already in word 19
+   const uint32_t last_nonce = (const uint32_t)max_nonce; // exclusive bound, see loop condition
+   uint32_t nonce = first_nonce;
+   const bool bench = opt_benchmark; // sampled once, before the loop
+
+   v128_bswap32_80( edata, pdata ); // presumably byte-swaps the 80 byte header into edata - confirm in simd-utils
+   do
+   {
+      edata[19] = nonce; // candidate nonce goes into word 19 of the working copy
+      algo_gate.hash( hash, edata, thr_id );
+      if ( unlikely( valid_hash( hash, ptarget ) && !bench ) ) // nothing is submitted in benchmark mode
+      {
+          pdata[19] = bswap_32( nonce ); // work data gets the byte-swapped nonce for submission
+          submit_solution( work, hash, mythr );
+      }
+      nonce++;
+  } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
+
+   pdata[19] = nonce; // record where this scan stopped
+   *hashes_done = pdata[19] - first_nonce; // number of nonces tried in this call
+   return 0;
+}
+
 void argon2d250_hash( void *output, const void *input )
 {
 	argon2_context context;
@@ -32,41 +64,10 @@ void argon2d250_hash( void *output, const void *input )
 	argon2_ctx( &context, Argon2_d );
 }
 
-int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t _ALIGN(64) edata[20];
-   uint32_t _ALIGN(64) hash[8];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
-   uint32_t nonce = first_nonce;
-
-   swab32_array( edata, pdata, 20 );
-
-   do {
-      be32enc(&edata[19], nonce);
-      argon2d250_hash( hash, edata );
-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
-      {
-          pdata[19] = nonce;
-          submit_solution( work, hash, mythr );
-      }
-      nonce++;
-   } while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
-   return 0;
-}
-
 bool register_argon2d250_algo( algo_gate_t* gate )
 {
-        gate->scanhash = (void*)&scanhash_argon2d250;
+        gate->scanhash = (void*)&scanhash_argon2d;
         gate->hash = (void*)&argon2d250_hash;
-        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
         opt_target_factor = 65536.0;
         return true;
 }
@@ -97,43 +98,78 @@ void argon2d500_hash( void *output, const void *input )
     argon2_ctx( &context, Argon2_d );
 }
 
-int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t _ALIGN(64) edata[20];
-   uint32_t _ALIGN(64) hash[8];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const int thr_id = mythr->id; 
-   const uint32_t first_nonce = (const uint32_t)pdata[19];
-   const uint32_t last_nonce = (const uint32_t)max_nonce;
-   uint32_t nonce = first_nonce;
-   const bool bench = opt_benchmark;
-
-   v128_bswap32_80( edata, pdata );
-   do
-   {
-      edata[19] = nonce;
-      argon2d500_hash( hash, edata );
-      if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
-           && !bench ) )
-      {
-          pdata[19] = bswap_32( nonce );;
-          submit_solution( work, hash, mythr );
-      }
-      nonce++;
-  } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
-
-   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce;
-   return 0;
-}
-
 bool register_argon2d500_algo( algo_gate_t* gate )
 {
-        gate->scanhash = (void*)&scanhash_argon2d500;
+        gate->scanhash = (void*)&scanhash_argon2d;
         gate->hash = (void*)&argon2d500_hash;
-        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
+        opt_target_factor = 65536.0;
+        return true;
+}
+
+void argon2d1000_hash( void *output, const void *input ) // Argon2d with m_cost = 1000
+{
+    argon2_context context;
+    context.out = (uint8_t *)output; // hash output goes to the caller's buffer
+    context.outlen = (uint32_t)OUTPUT_BYTES;
+    context.pwd = (uint8_t *)input; // password = input
+    context.pwdlen = (uint32_t)INPUT_BYTES;
+    context.salt = (uint8_t *)input; //salt = input
+    context.saltlen = (uint32_t)INPUT_BYTES;
+    context.secret = NULL; // no secret
+    context.secretlen = 0;
+    context.ad = NULL; // no associated data
+    context.adlen = 0;
+    context.allocate_cbk = NULL; // no custom allocate/free callbacks
+    context.free_cbk = NULL;
+    context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
+    // main configurable Argon2 hash parameters
+    context.m_cost = 1000;  // Memory in KiB (1000 KiB, just under 1 MiB)
+    context.lanes = 8;     // Degree of Parallelism
+    context.threads = 1;   // Threads
+    context.t_cost = 2;    // Iterations
+    context.version = ARGON2_VERSION_10;
+
+    argon2_ctx( &context, Argon2_d ); // any return value is ignored
+}
+
+bool register_argon2d1000_algo( algo_gate_t* gate ) // installs the argon2d1000 gate functions
+{
+        gate->scanhash = (void*)&scanhash_argon2d; // shared argon2d nonce scan loop
+        gate->hash = (void*)&argon2d1000_hash; // variant specific hash
+        opt_target_factor = 65536.0;
+        return true;
+}
+
+void argon2d16000_hash( void *output, const void *input ) // Argon2d with m_cost = 16000
+{
+   argon2_context context;
+   context.out = (uint8_t *)output; // hash output goes to the caller's buffer
+   context.outlen = (uint32_t)OUTPUT_BYTES;
+   context.pwd = (uint8_t *)input; // password = input
+   context.pwdlen = (uint32_t)INPUT_BYTES;
+   context.salt = (uint8_t *)input; //salt = input
+   context.saltlen = (uint32_t)INPUT_BYTES;
+   context.secret = NULL; // no secret
+   context.secretlen = 0;
+   context.ad = NULL; // no associated data
+   context.adlen = 0;
+   context.allocate_cbk = NULL; // no custom allocate/free callbacks
+   context.free_cbk = NULL;
+   context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
+   // main configurable Argon2 hash parameters
+   context.m_cost = 16000; // Memory in KiB (16000 KiB; note 16 MiB would be 16384 KiB)
+   context.lanes = 1;    // Degree of Parallelism
+   context.threads = 1;  // Threads
+   context.t_cost = 1;   // Iterations
+   context.version = ARGON2_VERSION_10;
+
+   argon2_ctx( &context, Argon2_d ); // any return value is ignored
+}
+
+bool register_argon2d16000_algo( algo_gate_t* gate ) // installs the argon2d16000 gate functions
+{
+        gate->scanhash = (void*)&scanhash_argon2d; // shared argon2d nonce scan loop
+        gate->hash = (void*)&argon2d16000_hash; // variant specific hash
         opt_target_factor = 65536.0;
         return true;
 }
@@ -148,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
    const uint32_t first_nonce = pdata[19];
    const uint32_t last_nonce = (const uint32_t)max_nonce;
    uint32_t n = first_nonce;
-   const int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;  
    uint32_t t_cost = 1; // 1 iteration
    uint32_t m_cost = 4096; // use 4MB
    uint32_t parallelism = 1; // 1 thread, 2 lanes
@@ -176,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
 bool register_argon2d4096_algo( algo_gate_t* gate )
 {
         gate->scanhash = (void*)&scanhash_argon2d4096;
-        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT |NEON_OPT;
         opt_target_factor = 65536.0;
         return true;
 }
diff --git a/algo/argon2d/argon2d-gate.h b/algo/argon2d/argon2d-gate.h
index b96b626..3445726 100644
--- a/algo/argon2d/argon2d-gate.h
+++ b/algo/argon2d/argon2d-gate.h
@@ -4,22 +4,27 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
+int scanhash_argon2d( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );
+
 // Credits: version = 0x10, m_cost = 250.
 bool register_argon2d250_algo( algo_gate_t* gate );
 
 void argon2d250_hash( void *state, const void *input );
 
-int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done, struct thr_info *mythr );
-
 // Dynamic: version = 0x10, m_cost = 500.
 bool register_argon2d500_algo( algo_gate_t* gate );
 
 void argon2d500_hash( void *state, const void *input );
 
-int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done, struct thr_info *mythr );
+// Zero Dynamics Cash: version = 0x10, m_cost = 1000.
+bool register_argon2d1000_algo( algo_gate_t* gate );
 
+void argon2d1000_hash( void *state, const void *input );
+
+bool register_argon2d16000_algo( algo_gate_t* gate ); // ARGON2_VERSION_10, m_cost = 16000.
+
+void argon2d16000_hash( void *state, const void *input );
 
 // Unitus: version = 0x13, m_cost = 4096.
 bool register_argon2d4096_algo( algo_gate_t* gate );
diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c
index 6f8a3db..6d288bf 100644
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -109,7 +109,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
 
    for ( r = 0; r < 3; r ++ )
    {
-      // round 1, 5, 9
+     // round 1, 5, 9
 
      k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
                                   mm256_aesenc_2x128( k00, zero ) ) );
diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c
index cb8e721..798f40c 100644
--- a/algo/shavite/shavite-hash-4way.c
+++ b/algo/shavite/shavite-hash-4way.c
@@ -21,7 +21,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
    __m512i *H = (__m512i*)ctx->h;
    const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
                                             ctx->count1, ctx->count0 );
-   int r;
+   const __m512i zero = _mm512_setzero_si512();
 
    P0 = H[0];
    P1 = H[1];
@@ -37,182 +37,160 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
    K6 = M[6];
    K7 = M[7];
 
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K1 ), m512_zero );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K2 ), m512_zero );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K3 ), m512_zero );
+   // round 0
 
-   P0 = _mm512_xor_si512( P0, X );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K1 ), zero );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K2 ), zero );
+   P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
 
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K5 ), m512_zero );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K6 ), m512_zero );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K7 ), m512_zero );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K5 ), zero );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512(  X, K6 ), zero );
+   P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
 
-   P2 = _mm512_xor_si512( P2, X );
-
-   // round
-   for ( r = 0; r < 3; r ++ )
+   for ( int r = 0; r < 3; r ++ )
    {
-      // round 1, 5, 9
+     // round 1, 5, 9
 
      K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
-                                  _mm512_aesenc_epi128( K0, m512_zero ) ) );
+                                  _mm512_aesenc_epi128( K0, zero ) ) );
 
      if ( r == 0 )
         K0 = _mm512_xor_si512( K0,
-                    _mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
+             _mm512_mask_ternarylogic_epi32( count, 0x8888, count, count, 1 ) );
 
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
      K1 = _mm512_xor_si512( K0,
-		           mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
+		           mm512_shuflr128_32( _mm512_aesenc_epi128( K1, zero ) ) );
 
      if ( r == 1 )
         K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
-                 _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
+           _mm512_mask_ternarylogic_epi32( count, 0x1111, count, count, 1 ) ) );
 
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
      K2 = _mm512_xor_si512( K1,
-		           mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
+		           mm512_shuflr128_32( _mm512_aesenc_epi128( K2, zero ) ) );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
      K3 = _mm512_xor_si512( K2,
-		           mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
-
-     P3 = _mm512_xor_si512( P3, X );
+		           mm512_shuflr128_32( _mm512_aesenc_epi128( K3, zero ) ) );
+     P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
 
      K4 = _mm512_xor_si512( K3,
-		           mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
+		           mm512_shuflr128_32( _mm512_aesenc_epi128( K4, zero ) ) );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
      K5 = _mm512_xor_si512( K4,
-		           mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
+		           mm512_shuflr128_32( _mm512_aesenc_epi128( K5, zero ) ) );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
      K6 = _mm512_xor_si512( K5,
-		           mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
+		           mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) ) );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
      K7 = _mm512_xor_si512( K6,
-		           mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
+		           mm512_shuflr128_32( _mm512_aesenc_epi128( K7, zero ) ) );
 
      if ( r == 2 )
         K7 = _mm512_xor_si512( K7, mm512_swap128_64(
-                 _mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
+           _mm512_mask_ternarylogic_epi32( count, 0x2222, count, count, 1 ) ) );
  
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
-     P1 = _mm512_xor_si512( P1, X );
+     P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
      
      // round 2, 6, 10
 
      K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), zero );
      K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
      K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
      K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
-
-     P2 = _mm512_xor_si512( P2, X );
+     P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P2 );
 
      K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), zero );
      K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
      K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
      K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
-
-     P0 = _mm512_xor_si512( P0, X );
+     P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P0 );
 
      // round 3, 7, 11
 
      K0 = _mm512_xor_si512( mm512_shuflr128_32(
-                               _mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
+                               _mm512_aesenc_epi128( K0, zero ) ), K7 );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), zero );
      K1 = _mm512_xor_si512( mm512_shuflr128_32(
-                               _mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
+                               _mm512_aesenc_epi128( K1, zero ) ), K0 );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
      K2 = _mm512_xor_si512( mm512_shuflr128_32(
-                               _mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
+                               _mm512_aesenc_epi128( K2, zero ) ), K1 );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
      K3 = _mm512_xor_si512( mm512_shuflr128_32(
-                               _mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
-
-     P1 = _mm512_xor_si512( P1, X );
+                               _mm512_aesenc_epi128( K3, zero ) ), K2 );
+     P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P1 );
 
      K4 = _mm512_xor_si512( mm512_shuflr128_32(
-                               _mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
+                               _mm512_aesenc_epi128( K4, zero ) ), K3 );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), zero );
      K5 = _mm512_xor_si512( mm512_shuflr128_32(
-                               _mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
+                               _mm512_aesenc_epi128( K5, zero ) ), K4 );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
      K6 = _mm512_xor_si512( mm512_shuflr128_32(
-                               _mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
+                               _mm512_aesenc_epi128( K6, zero ) ), K5 );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
      K7 = _mm512_xor_si512( mm512_shuflr128_32(
-                               _mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
-
-     P3 = _mm512_xor_si512( P3, X );
+                               _mm512_aesenc_epi128( K7, zero ) ), K6 );
+     P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P3 );
 
      // round 4, 8, 12
 
      K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
      K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
      K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
      K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
-
-     P0 = _mm512_xor_si512( P0, X );
+     P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
 
      K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
      K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
      K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
+     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
      K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
-     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
-
-     P2 = _mm512_xor_si512( P2, X );
+     P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
    }
 
    // round 13
 
    K0 = _mm512_xor_si512( mm512_shuflr128_32(
-			             _mm512_aesenc_epi128( K0, m512_zero ) ), K7  );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
+			             _mm512_aesenc_epi128( K0, zero ) ), K7  );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
    K1 = _mm512_xor_si512( mm512_shuflr128_32(
-			             _mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
+			             _mm512_aesenc_epi128( K1, zero ) ), K0 );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
    K2 = _mm512_xor_si512( mm512_shuflr128_32(
-			             _mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
+			             _mm512_aesenc_epi128( K2, zero ) ), K1 );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
    K3 = _mm512_xor_si512( mm512_shuflr128_32(
-			             _mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
-
-   P3 = _mm512_xor_si512( P3, X );
+			             _mm512_aesenc_epi128( K3, zero ) ), K2 );
+   P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
 
    K4 = _mm512_xor_si512( mm512_shuflr128_32(
-			             _mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
+			             _mm512_aesenc_epi128( K4, zero ) ), K3 );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
    K5 = _mm512_xor_si512( mm512_shuflr128_32(
-			             _mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
-   K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
-   K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5,  mm512_swap64_32( 
-              _mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
+			             _mm512_aesenc_epi128( K5, zero ) ), K4 );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
+   K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) );
+   K6 = mm512_xor3( K6, K5, mm512_swap64_32(
+        _mm512_mask_ternarylogic_epi32( count, 0x4444, count, count, 1 ) ) );
+   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
    K7= _mm512_xor_si512( mm512_shuflr128_32(
-			             _mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
-   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
-
-   P1 = _mm512_xor_si512( P1, X );
+			             _mm512_aesenc_epi128( K7, zero ) ), K6 );
+   P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
 
    H[0] = _mm512_xor_si512( H[0], P2 );
    H[1] = _mm512_xor_si512( H[1], P3 );
diff --git a/algo/shavite/shavite.c b/algo/shavite/shavite.c
deleted file mode 100644
index 9ad9844..0000000
--- a/algo/shavite/shavite.c
+++ /dev/null
@@ -1,159 +0,0 @@
-#include "miner.h"
-#include "algo-gate-api.h"
-#include <string.h>
-#include <stdint.h>
-
-#include "sph_shavite.h"
-
-extern void inkhash(void *state, const void *input)
-{
-    sph_shavite512_context	 ctx_shavite;
-    uint32_t hash[16];
-	
-    sph_shavite512_init(&ctx_shavite);
-    sph_shavite512 (&ctx_shavite, (const void*) input, 80);
-    sph_shavite512_close(&ctx_shavite, (void*) hash);
-    
-    sph_shavite512_init(&ctx_shavite);
-    sph_shavite512(&ctx_shavite, (const void*) hash, 64);
-    sph_shavite512_close(&ctx_shavite, (void*) hash);
-
-    memcpy(state, hash, 32);
-
-/*	
-	int ii;
-	printf("result: ");
-	for (ii=0; ii < 32; ii++)
-	{
-		printf ("%.2x",((uint8_t*)state)[ii]);
-	};
-	printf ("\n");	
-*/	
-}
-
-int scanhash_ink( struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
-{
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;
-
-	uint32_t n = pdata[19] - 1;
-	const uint32_t first_nonce = pdata[19];
-	//const uint32_t Htarg = ptarget[7];
-
-	uint32_t _ALIGN(32) hash64[8];
-	uint32_t endiandata[32];
-	
-	//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
-	
-	//we need bigendian data...
-	//lessons learned: do NOT endianchange directly in pdata, this will all proof-of-works be considered as stale from minerd.... 
-	int kk=0;
-	for (; kk < 32; kk++)
-	{
-		be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
-	};
-
-//	if (opt_debug) 
-//	{
-//		applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
-//	}
-	
-	/* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
-	/* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
-	if (ptarget[7]==0) {
-		do {
-			pdata[19] = ++n;
-			be32enc(&endiandata[19], n); 
-			inkhash(hash64, endiandata);
-			if (((hash64[7]&0xFFFFFFFF)==0) && 
-					fulltest(hash64, ptarget)) {
-				*hashes_done = n - first_nonce + 1;
-				return true;
-			}
-		} while (n < max_nonce && !work_restart[thr_id].restart);	
-	} 
-	else if (ptarget[7]<=0xF) 
-	{
-		do {
-			pdata[19] = ++n;
-			be32enc(&endiandata[19], n); 
-			inkhash(hash64, endiandata);
-			if (((hash64[7]&0xFFFFFFF0)==0) && 
-					fulltest(hash64, ptarget)) {
-				*hashes_done = n - first_nonce + 1;
-				return true;
-			}
-		} while (n < max_nonce && !work_restart[thr_id].restart);	
-	} 
-	else if (ptarget[7]<=0xFF) 
-	{
-		do {
-			pdata[19] = ++n;
-			be32enc(&endiandata[19], n); 
-			inkhash(hash64, endiandata);
-			if (((hash64[7]&0xFFFFFF00)==0) && 
-					fulltest(hash64, ptarget)) {
-				*hashes_done = n - first_nonce + 1;
-				return true;
-			}
-		} while (n < max_nonce && !work_restart[thr_id].restart);	
-	} 
-	else if (ptarget[7]<=0xFFF) 
-	{
-		do {
-			pdata[19] = ++n;
-			be32enc(&endiandata[19], n); 
-			inkhash(hash64, endiandata);
-			if (((hash64[7]&0xFFFFF000)==0) && 
-					fulltest(hash64, ptarget)) {
-				*hashes_done = n - first_nonce + 1;
-				return true;
-			}
-		} while (n < max_nonce && !work_restart[thr_id].restart);	
-
-	} 
-	else if (ptarget[7]<=0xFFFF) 
-	{
-		do {
-			pdata[19] = ++n;
-			be32enc(&endiandata[19], n); 
-			inkhash(hash64, endiandata);
-			if (((hash64[7]&0xFFFF0000)==0) && 
-					fulltest(hash64, ptarget)) {
-				*hashes_done = n - first_nonce + 1;
-				return true;
-			}
-		} while (n < max_nonce && !work_restart[thr_id].restart);	
-
-	} 
-	else 
-	{
-		do {
-			pdata[19] = ++n;
-			be32enc(&endiandata[19], n); 
-			inkhash(hash64, endiandata);
-			if (fulltest(hash64, ptarget)) {
-				*hashes_done = n - first_nonce + 1;
-				return true;
-			}
-		} while (n < max_nonce && !work_restart[thr_id].restart);	
-	}
-	
-	
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-	return 0;
-}
-
-bool register_shavite_algo( algo_gate_t* gate )
-{
-    algo_not_implemented();
-    return false;
-
-//    gate->scanhash = (void*)&scanhash_ink;
-//    gate->hash     = (void*)&inkhash;
-//    return true;
-};
-
diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c
index e9f5894..6cd6250 100644
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -50,7 +50,8 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif
 
-static const sph_u32 IV512[] = {
+static const sph_u32 IV512[] =
+{
 	0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
 	0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
 	0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
@@ -71,38 +72,26 @@ c512( sph_shavite_big_context *sc, const void *msg )
    p2 = h[2];
    p3 = h[3];   
 
-   // round
-
    k00 = m[0];
-   x = v128_xor( p1, k00 );
-   x = v128_aesenc_nokey( x );
-
    k01 = m[1];
-   x = v128_xor( x, k01 );
-   x = v128_aesenc_nokey( x );
    k02 = m[2];
-   x = v128_xor( x, k02 );
-   x = v128_aesenc_nokey( x );
    k03 = m[3];
-   x = v128_xor( x, k03 );
-   x = v128_aesenc_nokey( x );
-
-   p0 = v128_xor( p0, x );
-
    k10 = m[4];
-   x = v128_xor( p3, k10 );
-   x = v128_aesenc_nokey( x );
    k11 = m[5];
-   x = v128_xor( x, k11 );
-   x = v128_aesenc_nokey( x );
    k12 = m[6];
-   x = v128_xor( x, k12 );
-   x = v128_aesenc_nokey( x );
    k13 = m[7];
-   x = v128_xor( x, k13 );
-   x = v128_aesenc_nokey( x );
 
-   p2 = v128_xor( p2, x );
+   // round 0
+   
+   x = v128_xoraesenc( p1, k00 );
+   x = v128_xoraesenc( x, k01 );
+   x = v128_xoraesenc( x, k02 );
+   p0 = v128_xoraesencxor( x, k03, p0 );
+
+   x = v128_xoraesenc( p3, k10 );
+   x = v128_xoraesenc( x, k11 );
+   x = v128_xoraesenc( x, k12 );
+   p2 = v128_xoraesencxor( x, k13, p2 );
 
    for ( r = 0; r < 3; r ++ )
    {
@@ -113,198 +102,165 @@ c512( sph_shavite_big_context *sc, const void *msg )
       if ( r == 0 )
          k00 = v128_xor( k00, v128_set32(
                   ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); 
+      x = v128_xoraesenc( p0, k00 );
 
-      x = v128_xor( p0, k00 );
-      x = v128_aesenc_nokey( x );
       k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
       k01 = v128_xor( k01, k00 );
 
       if ( r == 1 )
          k01 = v128_xor( k01, v128_set32(
                   ~sc->count0, sc->count1, sc->count2, sc->count3 ) );
+      x = v128_xoraesenc( x, k01 );
 
-      x = v128_xor( x, k01 );
-      x = v128_aesenc_nokey( x );
       k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
       k02 = v128_xor( k02, k01 );
-      x = v128_xor( x, k02 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( x, k02 );
+
       k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
       k03 = v128_xor( k03, k02 );
-      x = v128_xor( x, k03 );
-      x = v128_aesenc_nokey( x );
-
-      p3 = v128_xor( p3, x );
+      p3 = v128_xoraesencxor( x, k03, p3 );
 
       k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
       k10 = v128_xor( k10, k03 );
+      x = v128_xoraesenc( p2, k10 );
 
-      x = v128_xor( p2, k10 );
-      x = v128_aesenc_nokey( x );
       k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
       k11 = v128_xor( k11, k10 );
-      x = v128_xor( x, k11 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( x, k11 );
+
       k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
       k12 = v128_xor( k12, k11 );
-      x = v128_xor( x, k12 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( x, k12 );
+
       k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
       k13 = v128_xor( k13, k12 );
 
       if ( r == 2 )
          k13 = v128_xor( k13, v128_set32(
                   ~sc->count1, sc->count0, sc->count3, sc->count2 ) );
-
-      x = v128_xor( x, k13 );
-      x = v128_aesenc_nokey( x );
-      p1 = v128_xor( p1, x );
+      p1 = v128_xoraesencxor( x, k13, p1 );
 
       // round 2, 6, 10
 
       k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
-      x = v128_xor( p3, k00 );
-      x = v128_aesenc_nokey( x );
-      k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
-      x = v128_xor( x, k01 );
-      x = v128_aesenc_nokey( x );
-      k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
-      x = v128_xor( x, k02 );
-      x = v128_aesenc_nokey( x );
-      k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
-      x = v128_xor( x, k03 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( p3, k00 );
 
-      p2 = v128_xor( p2, x );
+      k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
+      x = v128_xoraesenc( x, k01 );
+
+      k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
+      x = v128_xoraesenc( x, k02 );
+
+      k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
+      p2 = v128_xoraesencxor( x, k03, p2 );
 
       k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
-      x = v128_xor( p1, k10 );
-      x = v128_aesenc_nokey( x );
-      k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
-      x = v128_xor( x, k11 );
-      x = v128_aesenc_nokey( x );
-      k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
-      x = v128_xor( x, k12 );
-      x = v128_aesenc_nokey( x );
-      k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
-      x = v128_xor( x, k13 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( p1, k10 );
 
-      p0 = v128_xor( p0, x );
+      k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
+      x = v128_xoraesenc( x, k11 );
+
+      k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
+      x = v128_xoraesenc( x, k12 );
+
+      k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
+      p0 = v128_xoraesencxor( x, k13, p0 );
 
       // round 3, 7, 11
 
       k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
       k00 = v128_xor( k00, k13 );
-      x = v128_xor( p2, k00 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( p2, k00 );
+
       k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
       k01 = v128_xor( k01, k00 );
-      x = v128_xor( x, k01 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( x, k01 );
+
       k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
       k02 = v128_xor( k02, k01 );
-      x = v128_xor( x, k02 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( x, k02 );
+
       k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
       k03 = v128_xor( k03, k02 );
-      x = v128_xor( x, k03 );
-      x = v128_aesenc_nokey( x );
-
-      p1 = v128_xor( p1, x );
+      p1 = v128_xoraesencxor( x, k03, p1 );
 
       k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
       k10 = v128_xor( k10, k03 );
-      x = v128_xor( p0, k10 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( p0, k10 );
+
       k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
       k11 = v128_xor( k11, k10 );
-      x = v128_xor( x, k11 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( x, k11 );
+
       k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
       k12 = v128_xor( k12, k11 );
-      x = v128_xor( x, k12 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( x, k12 );
+
       k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
       k13 = v128_xor( k13, k12 );
-      x = v128_xor( x, k13 );
-      x = v128_aesenc_nokey( x );
-
-      p3 = v128_xor( p3, x );
+      p3 = v128_xoraesencxor( x, k13, p3 );
 
       // round 4, 8, 12
 
       k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
-      x = v128_xor( p1, k00 );
-      x = v128_aesenc_nokey( x );
-      k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
-      x = v128_xor( x, k01 );
-      x = v128_aesenc_nokey( x );
-      k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
-      x = v128_xor( x, k02 );
-      x = v128_aesenc_nokey( x );
-      k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
-      x = v128_xor( x, k03 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( p1, k00 );
 
-      p0 = v128_xor( p0, x );
+      k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
+      x = v128_xoraesenc( x, k01 );
+
+      k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
+      x = v128_xoraesenc( x, k02 );
+
+      k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
+      p0 = v128_xoraesencxor( x, k03, p0 );
 
       k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
-      x = v128_xor( p3, k10 );
-      x = v128_aesenc_nokey( x );
-      k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
-      x = v128_xor( x, k11 );
-      x = v128_aesenc_nokey( x );
-      k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
-      x = v128_xor( x, k12 );
-      x = v128_aesenc_nokey( x );
-      k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
-      x = v128_xor( x, k13 );
-      x = v128_aesenc_nokey( x );
+      x = v128_xoraesenc( p3, k10 );
 
-      p2 = v128_xor( p2, x );
+      k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
+      x = v128_xoraesenc( x, k11 );
+
+      k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
+      x = v128_xoraesenc( x, k12 );
+
+      k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
+      p2 = v128_xoraesencxor( x, k13, p2 );
    }
 
    // round 13
 
    k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
    k00 = v128_xor( k00, k13 );
-   x = v128_xor( p0, k00 );
-   x = v128_aesenc_nokey( x );
+   x = v128_xoraesenc( p0, k00 );
+
    k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) ); 
    k01 = v128_xor( k01, k00 );
-   x = v128_xor( x, k01 );
-   x = v128_aesenc_nokey( x );
+   x = v128_xoraesenc( x, k01 );
+
    k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
    k02 = v128_xor( k02, k01 );
-   x = v128_xor( x, k02 );
-   x = v128_aesenc_nokey( x );
+   x = v128_xoraesenc( x, k02 );
+
    k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
    k03 = v128_xor( k03, k02 );
-   x = v128_xor( x, k03 );
-   x = v128_aesenc_nokey( x );
-
-   p3 = v128_xor( p3, x );
+   p3 = v128_xoraesencxor( x, k03, p3 );
 
    k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
    k10 = v128_xor( k10, k03 );
-   x = v128_xor( p2, k10 );
-   x = v128_aesenc_nokey( x );
+   x = v128_xoraesenc( p2, k10 );
+
    k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
    k11 = v128_xor( k11, k10 );
-   x = v128_xor( x, k11 );
-   x = v128_aesenc_nokey( x );
+   x = v128_xoraesenc( x, k11 );
+
    k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
    k12 = v128_xor( k12, v128_xor( k11, v128_set32(
                ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
-   x = v128_xor( x, k12 );
-   x = v128_aesenc_nokey( x );
+   x = v128_xoraesenc( x, k12 );
+
    k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
    k13 = v128_xor( k13, k12 );
-   x = v128_xor( x, k13 );
-   x = v128_aesenc_nokey( x );
-
-   p1 = v128_xor( p1, x );
+   p1 = v128_xoraesencxor( x, k13, p1 );
 
    h[0] = v128_xor( h[0], p2 );
    h[1] = v128_xor( h[1], p3 );
diff --git a/compat/aes_helper.c b/compat/aes_helper.c
index 3006344..36bb2d3 100644
--- a/compat/aes_helper.c
+++ b/compat/aes_helper.c
@@ -108,7 +108,24 @@ extern "C"{
 	} while (0)
 
 #define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
-	AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
+do { /* single-statement form, same wrapper as AES_ROUND_LE */ \
+      (Y0) = AES0[(X0) & 0xFF] \
+         ^ AES1[((X1) >> 8) & 0xFF] \
+         ^ AES2[((X2) >> 16) & 0xFF] \
+         ^ AES3[((X3) >> 24) & 0xFF]; \
+      (Y1) = AES0[(X1) & 0xFF] \
+         ^ AES1[((X2) >> 8) & 0xFF] \
+         ^ AES2[((X3) >> 16) & 0xFF] \
+         ^ AES3[((X0) >> 24) & 0xFF]; \
+      (Y2) = AES0[(X2) & 0xFF] \
+         ^ AES1[((X3) >> 8) & 0xFF] \
+         ^ AES2[((X0) >> 16) & 0xFF] \
+         ^ AES3[((X1) >> 24) & 0xFF]; \
+      (Y3) = AES0[(X3) & 0xFF] \
+         ^ AES1[((X0) >> 8) & 0xFF] \
+         ^ AES2[((X1) >> 16) & 0xFF] \
+         ^ AES3[((X2) >> 24) & 0xFF]; \
+} while (0)
 
 #endif
 
diff --git a/configure b/configure
index e60f1b4..1289149 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.5.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.6.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='25.5'
-PACKAGE_STRING='cpuminer-opt 25.5'
+PACKAGE_VERSION='25.6'
+PACKAGE_STRING='cpuminer-opt 25.6'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1431,7 +1431,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 25.5:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
    esac
   cat <<\_ACEOF
 
@@ -1536,7 +1536,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 25.5
+cpuminer-opt configure 25.6
 generated by GNU Autoconf 2.71
 
 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt $as_me 25.5, which was
+It was created by cpuminer-opt $as_me 25.6, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   $ $0$ac_configure_args_raw
@@ -3591,7 +3591,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='25.5'
+ VERSION='25.6'
 
 
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 25.5, which was
+This file was extended by cpuminer-opt $as_me 25.6, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 25.5
+cpuminer-opt config.status 25.6
 configured by $0, generated by GNU Autoconf 2.71,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 87e84ed..ad237ab 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [25.5])
+AC_INIT([cpuminer-opt], [25.6])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/configure~ b/configure~
index 0be54fb..aba559b 100755
--- a/configure~
+++ b/configure~
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.5.
+# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.6.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='25.5'
-PACKAGE_STRING='cpuminer-opt 25.5'
+PACKAGE_VERSION='25.6'
+PACKAGE_STRING='cpuminer-opt 25.6'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-'configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems.
+'configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1424,7 +1424,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 25.5:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
    esac
   cat <<\_ACEOF
 
@@ -1528,7 +1528,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 25.5
+cpuminer-opt configure 25.6
 generated by GNU Autoconf 2.72
 
 Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt $as_me 25.5, which was
+It was created by cpuminer-opt $as_me 25.6, which was
 generated by GNU Autoconf 2.72.  Invocation command line was
 
   $ $0$ac_configure_args_raw
@@ -3065,7 +3065,7 @@ ac_config_headers="$ac_config_headers cpuminer-config.h"
 
 
 
-am__api_version='1.17'
+am__api_version='1.18'
 
 
   # Find a good install program.  We prefer a C program (faster),
@@ -3334,10 +3334,14 @@ am_lf='
 '
 case `pwd` in
   *[\\\"\#\$\&\'\`$am_lf]*)
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
     as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;;
 esac
 case $srcdir in
   *[\\\"\#\$\&\'\`$am_lf\ \	]*)
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
     as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;;
 esac
 
@@ -3764,7 +3768,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='25.5'
+ VERSION='25.6'
 
 
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -3802,9 +3806,133 @@ AMTAR='$${TAR-tar}'
 
 
 # We'll loop over all known methods to create a tar archive until one works.
-_am_tools='gnutar  pax cpio none'
+_am_tools='gnutar plaintar pax cpio none'
 
-am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
+# The POSIX 1988 'ustar' format is defined with fixed-size fields.
+      # There is notably a 21 bits limit for the UID and the GID.  In fact,
+      # the 'pax' utility can hang on bigger UID/GID (see automake bug#8343
+      # and bug#13588).
+      am_max_uid=2097151 # 2^21 - 1
+      am_max_gid=$am_max_uid
+      # The $UID and $GID variables are not portable, so we need to resort
+      # to the POSIX-mandated id(1) utility.  Errors in the 'id' calls
+      # below are definitely unexpected, so allow the users to see them
+      # (that is, avoid stderr redirection).
+      am_uid=`id -u || echo unknown`
+      am_gid=`id -g || echo unknown`
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether UID '$am_uid' is supported by ustar format" >&5
+printf %s "checking whether UID '$am_uid' is supported by ustar format... " >&6; }
+      if test x$am_uid = xunknown; then
+        { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&5
+printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&2;}
+      elif test $am_uid -le $am_max_uid; then
+        { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+      else
+        { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+        _am_tools=none
+      fi
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GID '$am_gid' is supported by ustar format" >&5
+printf %s "checking whether GID '$am_gid' is supported by ustar format... " >&6; }
+      if test x$am_gid = xunknown; then # 'id -g' failed; skip the numeric comparison
+        { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&5
+printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&2;}
+      elif test $am_gid -le $am_max_gid; then
+        { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+      else
+        { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+        _am_tools=none
+      fi
+
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking how to create a ustar tar archive" >&5
+printf %s "checking how to create a ustar tar archive... " >&6; }
+
+  # Go ahead even if we have the value already cached.  We do so because we
+  # need to set the values for the 'am__tar' and 'am__untar' variables.
+  _am_tools=${am_cv_prog_tar_ustar-$_am_tools}
+
+  for _am_tool in $_am_tools; do
+    case $_am_tool in
+    gnutar)
+      for _am_tar in tar gnutar gtar; do
+        { echo "$as_me:$LINENO: $_am_tar --version" >&5
+   ($_am_tar --version) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); } && break
+      done
+      am__tar="$_am_tar --format=ustar -chf - "'"$$tardir"'
+      am__tar_="$_am_tar --format=ustar -chf - "'"$tardir"'
+      am__untar="$_am_tar -xf -"
+      ;;
+    plaintar)
+      # Must skip GNU tar: if it does not support --format= it doesn't create
+      # ustar tarball either.
+      (tar --version) >/dev/null 2>&1 && continue
+      am__tar='tar chf - "$$tardir"'
+      am__tar_='tar chf - "$tardir"'
+      am__untar='tar xf -'
+      ;;
+    pax)
+      am__tar='pax -L -x ustar -w "$$tardir"'
+      am__tar_='pax -L -x ustar -w "$tardir"'
+      am__untar='pax -r'
+      ;;
+    cpio)
+      am__tar='find "$$tardir" -print | cpio -o -H ustar -L'
+      am__tar_='find "$tardir" -print | cpio -o -H ustar -L'
+      am__untar='cpio -i -H ustar -d'
+      ;;
+    none)
+      am__tar=false
+      am__tar_=false
+      am__untar=false
+      ;;
+    esac
+
+    # If the value was cached, stop now.  We just wanted to have am__tar
+    # and am__untar set.
+    test -n "${am_cv_prog_tar_ustar}" && break
+
+    # tar/untar a dummy directory, and stop if the command works.
+    rm -rf conftest.dir
+    mkdir conftest.dir
+    echo GrepMe > conftest.dir/file
+    { echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5
+   (tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); }
+    rm -rf conftest.dir
+    if test -s conftest.tar; then
+      { echo "$as_me:$LINENO: $am__untar <conftest.tar" >&5
+   ($am__untar <conftest.tar) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); }
+      { echo "$as_me:$LINENO: cat conftest.dir/file" >&5
+   (cat conftest.dir/file) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); }
+      grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
+    fi
+  done
+  rm -rf conftest.dir
+
+  if test ${am_cv_prog_tar_ustar+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) am_cv_prog_tar_ustar=$_am_tool ;;
+esac
+fi
+
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_tar_ustar" >&5
+printf "%s\n" "$am_cv_prog_tar_ustar" >&6; }
 
 
 
@@ -4986,7 +5114,10 @@ _ACEOF
       break
     fi
   done
-  rm -f core conftest*
+  # aligned with autoconf, so not including core; see bug#72225.
+  rm -f -r a.out a.exe b.out conftest.$ac_ext conftest.$ac_objext \
+    conftest.dSYM conftest1.$ac_ext conftest1.$ac_objext conftest1.dSYM \
+    conftest2.$ac_ext conftest2.$ac_objext conftest2.dSYM
   unset am_i ;;
 esac
 fi
@@ -7450,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 25.5, which was
+This file was extended by cpuminer-opt $as_me 25.6, which was
 generated by GNU Autoconf 2.72.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -7518,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 25.5
+cpuminer-opt config.status 25.6
 configured by $0, generated by GNU Autoconf 2.72,
   with options \\"\$ac_cs_config\\"
 
diff --git a/miner.h b/miner.h
index 7bd508f..db0c123 100644
--- a/miner.h
+++ b/miner.h
@@ -582,6 +582,8 @@ enum algos {
         ALGO_ANIME,
         ALGO_ARGON2D250,
         ALGO_ARGON2D500,
+        ALGO_ARGON2D1000,
+        ALGO_ARGON2D16000,
         ALGO_ARGON2D4096,
         ALGO_AXIOM,       
         ALGO_BLAKE,       
@@ -677,6 +679,8 @@ static const char* const algo_names[] = {
         "anime",
         "argon2d250",
         "argon2d500",
+        "argon2d1000",
+        "argon2d16000",
         "argon2d4096",
         "axiom",
         "blake",
@@ -837,6 +841,8 @@ Options:\n\
                           anime         Animecoin (ANI)\n\
                           argon2d250\n\
                           argon2d500\n\
+                          argon2d1000\n\
+                          argon2d16000\n\
                           argon2d4096\n\
                           axiom         Shabal-256 MemoHash\n\
                           blake         blake256r14 (SFR)\n\
diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h
index 8a5f908..3b7d56d 100644
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -137,10 +137,24 @@
 #define v128_unpackhi8                 _mm_unpackhi_epi8
 
 // AES
-// Nokey means nothing on x86_64 but it saves an instruction and a register
-// on ARM.
-#define v128_aesenc                    _mm_aesenc_si128
+
+// xor key with result after encryption, x86_64 format.
+#define v128_aesencxor                 _mm_aesenc_si128
+// default is x86_64 format.
+#define v128_aesenc                    v128_aesencxor
+
+// xor key with v before encryption, arm64 format.
+#define v128_xoraesenc( v, k ) \
+   _mm_aesenc_si128( v128_xor( v, k ), v128_zero )
+
+// xor v with k_in before encryption then xor the result with k_out afterward.
+// Uses the applicable optimization based on the target.
+#define v128_xoraesencxor( v, k_in, k_out ) \
+   _mm_aesenc_si128( v128_xor( v, k_in ), k_out )
+
+// Nokey form: no gain on x86_64, saves an instruction and a register on arm64.
 #define v128_aesenc_nokey(v)           _mm_aesenc_si128( v, v128_zero )
+
 #define v128_aesenclast                _mm_aesenclast_si128
 #define v128_aesenclast_nokey(v)       _mm_aesenclast_si128( v, v128_zero )
 #define v128_aesdec                    _mm_aesdec_si128
diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h
index 7063036..a1816fc 100644
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -187,9 +187,21 @@
 // vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
 
 // AES
-// consistent with Intel AES intrinsics, break up for optimizing
-#define v128_aesenc( v, k ) \
-   v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) )
+
+// xor key with result after encryption, x86_64 format.
+#define v128_aesencxor( v, k ) \
+   v128_xor( vaesmcq_u8( vaeseq_u8( v, v128_zero ) ), k )
+// default is x86_64 format.
+#define v128_aesenc v128_aesencxor
+
+// xor key with v before encryption, arm64 format.
+#define v128_xoraesenc( v, k ) \
+   vaesmcq_u8( vaeseq_u8( v, k ) )
+
+// xor v with k_in before encryption then xor the result with k_out afterward.
+// Uses the applicable optimization based on the target.
+#define v128_xoraesencxor( v, k_in, k_out ) \
+   v128_xor( v128_xoraesenc( v, k_in ), k_out )
 
 #define v128_aesenc_nokey( v ) \
    vaesmcq_u8( vaeseq_u8( v, v128_zero ) )