v23.11

2025-09-17 23:44:27 +00:00 · 2023-11-17 14:39:26 -05:00
47 changed files with 3512 additions and 2600 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -250,7 +250,6 @@ cpuminer_SOURCES = \
  algo/x16/x16rt.c \
  algo/x16/x16rt-4way.c \
  algo/x16/hex.c \
-  algo/x16/x20r.c \
  algo/x16/x21s-4way.c \
  algo/x16/x21s.c \
  algo/x16/minotaur.c \
--- a/README.md
+++ b/README.md
@@ -87,6 +87,7 @@ Supported Algorithms
                          groestl       Groestl coin
                          hex           x16r-hex
                          hmq1725       
+                          hodl          Hodlcoin
                          jha           Jackpotcoin
                          keccak        Maxcoin
                          keccakc       Creative coin
@@ -114,11 +115,9 @@ Supported Algorithms
                          scrypt:N      scrypt(N, 1, 1)
                          scryptn2      scrypt(1048576, 1, 1)
                          sha256d       Double SHA-256
-                          sha256dt
                          sha256q       Quad SHA-256
                          sha256t       Triple SHA-256
                          sha3d         Double keccak256 (BSHA3)
-                          sha512256d
                          skein         Skein+Sha (Skeincoin)
                          skein2        Double Skein (Woodcoin)
                          skunk         Signatum (SIGT)
@@ -146,7 +145,6 @@ Supported Algorithms
                          x16rt-veil    veil
                          x16s          
                          x17
-                          x20r
                          x21s
                          x22i
                          x25x
--- a/16
+++ b/16
@@ -75,22 +75,6 @@ If not what makes it happen or not happen?
 Change Log
 ----------

-v23.14
-
-ARM: Groestl AES optimizations enabled.
-All: Small optimization to Shabal 4way.
-x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
-All: deleted some unused files.
-
-v23.13
-
-Added x20r algo.
-Eliminated redundant hash order calculations for x16r family.
-
-v23.12
-
-Several bugs fixes and speed improvements for x16r family for all CPU architectures.
-
 v23.11

 This is a release candidate for full AArch64 support, marking the end of the Beta phase.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -368,7 +368,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_X16RT_VEIL:   rc = register_x16rt_veil_algo    ( gate ); break;
    case ALGO_X16S:         rc = register_x16s_algo          ( gate ); break;
    case ALGO_X17:          rc = register_x17_algo           ( gate ); break;
-    case ALGO_X20R:         rc = register_x20r_algo          ( gate ); break;
    case ALGO_X21S:         rc = register_x21s_algo          ( gate ); break;
    case ALGO_X22I:         rc = register_x22i_algo          ( gate ); break;
    case ALGO_X25X:         rc = register_x25x_algo          ( gate ); break;
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
   blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );

   do {
-      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      blakehash_4way( hash, vdata );

--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
   blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );

   do {
-      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_4way_hash( hash, vdata );

--- a/algo/groestl/aes_ni/groestl-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl-intr-aes.h
@@ -60,17 +60,54 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };

 #if defined(__ARM_NEON)

-static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
-   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
+// No fast shuffle on NEON
+//static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };  
+static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };

-#define gr_shuffle32(v)       vqtbl1q_u8( v, gr_mask ) 
+#define gr_shuffle32( v )      v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
+
+/*
+#define TRANSP_MASK \
+     0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
+#define SUBSH_MASK0 \
+     0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
+#define SUBSH_MASK1 \
+     0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
+#define SUBSH_MASK2 \
+     0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
+#define SUBSH_MASK3 \
+     0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
+#define SUBSH_MASK4  \
+     0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
+#define SUBSH_MASK5 \
+     0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
+#define SUBSH_MASK6 \
+     0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
+#define SUBSH_MASK7 \
+     0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
+
+//#define gr_shuffle8( v, c )    v128_shullfev8( v, c )
+
+
+#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
+                        c07, c06, c05, c04, c03, c02, c01, c00 ) \
+  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
+  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
+  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
+  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
+    v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
+       11, v, c11 ), 10, v, c10 ),  9, v, c09 ),  8, v, c08 ), \
+        7, v, c07 ),  6, v, c06 ),  5, v, c05 ),  4, v, c04 ), \
+        3, v, c03 ),  2, v, c02 ),  1, v, c01 ),  0, v, c00 )
+*/

 #else

-#define gr_shuffle32(v)       _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32( v )       _mm_shuffle_epi32( v, 0xd8 )

 #endif

+
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -297,16 +334,17 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
 */
 #define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* SubBytes */\
-  a0 = v128_aesenclast_nokey( a0 ); \
-  a1 = v128_aesenclast_nokey( a1 ); \
-  a2 = v128_aesenclast_nokey( a2 ); \
-  a3 = v128_aesenclast_nokey( a3 ); \
-  a4 = v128_aesenclast_nokey( a4 ); \
-  a5 = v128_aesenclast_nokey( a5 ); \
-  a6 = v128_aesenclast_nokey( a6 ); \
-  a7 = v128_aesenclast_nokey( a7 ); \
+  b0 = v128_xor(b0, b0);\
+  a0 = v128_aesenclast(a0, b0);\
+  a1 = v128_aesenclast(a1, b0);\
+  a2 = v128_aesenclast(a2, b0);\
+  a3 = v128_aesenclast(a3, b0);\
+  a4 = v128_aesenclast(a4, b0);\
+  a5 = v128_aesenclast(a5, b0);\
+  a6 = v128_aesenclast(a6, b0);\
+  a7 = v128_aesenclast(a7, b0);\
  /* MixBytes */\
-  MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
+  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
 }

 #define ROUNDS_P(){\
@@ -324,9 +362,10 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
    xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
    xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
    xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
-     /* SubBytes + MixBytes */\
+    /* SubBytes + MixBytes */\
    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7 ); \
+    \
    /* AddRoundConstant P1024 */\
    xmm0 = v128_xor( xmm0, \
             casti_v128( round_const_p, round_counter+1 ) ); \
@@ -428,6 +467,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
  t1 = v128_unpackhi16(t1, i3);\
  i2 = v128_unpacklo16(i2, i3);\
  i0 = v128_unpacklo16(i0, i1);\
+\
  /* shuffle with immediate */\
  t0 = gr_shuffle32( t0 ); \
  t1 = gr_shuffle32( t1 ); \
@@ -437,6 +477,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
  i2 = gr_shuffle32( i2 ); \
  i4 = gr_shuffle32( i4 ); \
  i6 = gr_shuffle32( i6 ); \
+\
  /* continue with unpack */\
  t4 = i0;\
  i0 = v128_unpacklo32(i0, i2);\
@@ -543,8 +584,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
  /* transpose done */\
 }/**/

-#if 0
-// not used
+
 void INIT( v128_t* chaining )
 {
  static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -573,7 +613,6 @@ void INIT( v128_t* chaining )
  chaining[6] = xmm14;
  chaining[7] = xmm15;
 }
-#endif

 void TF1024( v128_t* chaining, const v128_t* message )
 {
--- a/algo/groestl/aes_ni/groestl256-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl256-intr-aes.h
@@ -1,6 +1,3 @@
-#if !defined GROESTL256_INTR_AES_H__
-#define GROESTL256_INTR_AES_H__
-
 /* groestl-intr-aes.h     Aug 2011
 *
 * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -53,17 +50,18 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };

 #if defined(__ARM_NEON)

-static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
-   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
+// No fast shuffle on NEON
+static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };

-#define gr_shuffle32(v)       vqtbl1q_u8( v, gr_mask ) 
+#define gr_shuffle32( v )       v128_shufflev32( v, vmask_d8 )

 #else

-#define gr_shuffle32(v)       _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32( v )       _mm_shuffle_epi32( v, 0xd8 )

 #endif

+
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -600,4 +598,4 @@ void OF512( v128_t* chaining )
  chaining[3] = xmm11;
 }

-#endif
+
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
   const int hash_offset = SIZE512 - hashlen_m128i;
   uint64_t blocks = len / SIZE512;
   v128_t* in = (v128_t*)input;
-   
+
   // digest any full blocks, process directly from input 
   for ( i = 0; i < blocks; i++ )
      TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,7 +181,6 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );
-
   OF1024( ctx->chaining );

   // store hash result in output 
--- a/algo/groestl/aes_ni/hash-groestl.h
+++ b/algo/groestl/aes_ni/hash-groestl.h
@@ -87,7 +87,6 @@ int final_groestl( hashState_groestl*, void* );
 int update_and_final_groestl( hashState_groestl*,  void*, const void*, int );
 int groestl512( hashState_groestl*,  void*, const void*, uint64_t );
 #define groestl512_full   groestl512
-#define groestl512_ctx    groestl512


 #endif /* __hash_h */
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =

 #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
+  b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
  a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
  a1 = _mm256_xor_si256( a1, b1 );\
  a2 = _mm256_xor_si256( a2, b1 );\
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,

   v128_bswap32_intrlv80_4x32( vdata, pdata );
   do {
-      *noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );

      myriad_4way_hash( hash, vdata );
      pdata[19] = n;
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -465,8 +465,12 @@ typedef union
 {
   keccak256_2x64_context    keccak;
   cubehashParam             cube;
+//#if defined(__x86_64__)
   skein256_2x64_context     skein;
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+//#else
+//   sph_skein512_context      skein;
+//#endif
+#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
   hashState_groestl256      groestl;
 #else
   sph_groestl256_context     groestl;
@@ -512,6 +516,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );

+//#if defined(__x86_64__)
   intrlv_2x64( vhashA, hash0, hash1, 256 );
   skein256_2x64_init( &ctx.skein );
   skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -522,8 +527,23 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
   skein256_2x64_update( &ctx.skein, vhashA, 32 );
   skein256_2x64_close( &ctx.skein, vhashA );
   dintrlv_2x64( hash2, hash3, vhashA, 256 );
-
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+/*
+#else
+    sph_skein256_init( &ctx.skein );
+    sph_skein256( &ctx.skein, hash0, 32 );
+    sph_skein256_close( &ctx.skein, hash0 );
+    sph_skein256_init( &ctx.skein );
+    sph_skein256( &ctx.skein, hash1, 32 );
+    sph_skein256_close( &ctx.skein, hash1 );
+    sph_skein256_init( &ctx.skein );
+    sph_skein256( &ctx.skein, hash2, 32 );
+    sph_skein256_close( &ctx.skein, hash2 );
+    sph_skein256_init( &ctx.skein );
+    sph_skein256( &ctx.skein, hash3, 32 );
+    sph_skein256_close( &ctx.skein, hash3 );
+#endif
+*/
+#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
   groestl256_full( &ctx.groestl, hash0, hash0, 256 );
   groestl256_full( &ctx.groestl, hash1, hash1, 256 );
   groestl256_full( &ctx.groestl, hash2, hash2, 256 );
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
   lyra2h_4way_midstate( vdata );

   do {
-     *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      lyra2h_4way_hash( hash, vdata );

      for ( int i = 0; i < 4; i++ )
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,

   do
   {
-      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      lyra2rev2_4way_hash( hash, vdata );

--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -7,15 +7,15 @@
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #if defined(__AES__)
+  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/fugue/fugue-aesni.h"
 #else
+  #include "algo/groestl/sph_groestl.h"
  #include "algo/fugue/sph_fugue.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
-  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #else
-  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/jh/sph_jh.h"
@@ -33,18 +33,18 @@

 union _hmq1725_ctx_holder
 {
-   blake512_context        blake;
+   blake512_context    blake;
   sph_bmw512_context      bmw;
 #if defined(__AES__)
+   hashState_groestl       groestl;
   hashState_fugue         fugue;
 #else
+   sph_groestl512_context  groestl;
   sph_fugue512_context    fugue;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
-   hashState_groestl       groestl;
   hashState_echo          echo;
 #else
-   sph_groestl512_context  groestl;
   sph_echo512_context     echo;
 #endif
   sph_skein512_context    skein;
@@ -62,6 +62,9 @@ union _hmq1725_ctx_holder
 };
 typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;

+//static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
+//static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
+
 extern void hmq1725hash(void *state, const void *input)
 {
    const uint32_t mask = 24;
@@ -79,7 +82,7 @@ extern void hmq1725hash(void *state, const void *input)

    if ( hashB[0] & mask )   //1
    {
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__)
       groestl512_full( &ctx.groestl, hashA, hashB, 512 );
 #else
       sph_groestl512_init( &ctx.groestl );
@@ -223,7 +226,7 @@ extern void hmq1725hash(void *state, const void *input)
       sph_sha512_close( &ctx.sha, hashA );
    }

-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__)
    groestl512_full( &ctx.groestl, hashB, hashA, 512 );
 #else
    sph_groestl512_init( &ctx.groestl );
--- a/algo/sha/sha256-hash.c
+++ b/algo/sha/sha256-hash.c
@@ -587,8 +587,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
    // Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
    TMSG0_X = casti_m128i( msg_X, 0 );
    TMSG0_Y = casti_m128i( msg_Y, 0 );
-    TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
-    TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
+    TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 );
+    TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 );
    STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
    STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );

--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -34,6 +34,8 @@
 #include <string.h>
 #include "shabal-hash-4way.h"

+//#if defined(__SSE4_1__) || defined(__ARM_NEON)
+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define DECL_STATE16   \
@@ -45,6 +47,8 @@
           C8, C9, CA, CB, CC, CD, CE, CF; \
   __m512i M0, M1, M2, M3, M4, M5, M6, M7, \
           M8, M9, MA, MB, MC, MD, ME, MF; \
+   const __m512i FIVE  = v512_32( 5 ); \
+   const __m512i THREE = v512_32( 3 ); \
   uint32_t Wlow, Whigh;

 #define READ_STATE16(state) do \
@@ -288,21 +292,11 @@ do { \
    mm512_swap1024_512( BF, CF ); \
 } while (0)

-static inline __m512i v512_mult_x3( const __m512i x )
-{
-   return _mm512_add_epi32( x, _mm512_slli_epi32( x, 1 ) );
-}
-
-static inline __m512i v512_mult_x5( const __m512i x )
-{
-   return _mm512_add_epi32( x, _mm512_slli_epi32( x, 2 ) );
-}
-
 #define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
   xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
-           v512_mult_x3( mm512_xor3( xa0, xc, \
-              v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
+           _mm512_mullo_epi32( mm512_xor3( xa0, xc, \
+              _mm512_mullo_epi32( mm512_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
           xb3, xb2 ) ); \
   xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
 } while (0)
@@ -650,6 +644,8 @@ shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
           C8, C9, CA, CB, CC, CD, CE, CF; \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7, \
           M8, M9, MA, MB, MC, MD, ME, MF; \
+   const __m256i FIVE  = v256_32( 5 ); \
+   const __m256i THREE = v256_32( 3 ); \
   uint32_t Wlow, Whigh;

 #define READ_STATE8(state) do \
@@ -893,21 +889,11 @@ do { \
    mm256_swap512_256( BF, CF ); \
 } while (0)

-static inline __m256i v256_mult_x3( const __m256i x )
-{
-   return _mm256_add_epi32( x, _mm256_slli_epi32( x, 1 ) );
-}
-
-static inline __m256i v256_mult_x5( const __m256i x )
-{
-   return _mm256_add_epi32( x, _mm256_slli_epi32( x, 2 ) );
-}
-
 #define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
   xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
-           v256_mult_x3( mm256_xor3( xa0, xc, \
-              v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
+           _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
+              _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
           xb3, xb2 ) ); \
   xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
 } while (0)
@@ -1240,13 +1226,15 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #endif  // AVX2

-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE4_1__) || defined(__ARM_NEON)

 #define DECL_STATE   \
 	v128u32_t A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB; \
 	v128u32_t B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \
 	v128u32_t C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \
 	v128u32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \
+   const v128u32_t FIVE  = v128_32( 5 ); \
+   const v128u32_t THREE = v128_32( 3 ); \
   uint32_t Wlow, Whigh;

 #define READ_STATE( state ) \
@@ -1491,22 +1479,12 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
    v128_swap256_128( BF, CF ); \
 }

-static inline v128_t v128_mult_x3( const v128_t x )
-{
-   return v128_add32( x, v128_sl32( x, 1 ) );
-}
-
-static inline v128_t v128_mult_x5( const v128_t x )
-{
-   return v128_add32( x, v128_sl32( x, 2 ) );
-}
-
 #define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 { \
   xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
-                               v128_mult_x3( v128_xor3( xa0, xc, \
-                                   v128_mult_x5( v128_rol32( xa1, 15 ) ) ) ), \
-                               xb3, xb2 ) ); \
+           v128_mul32( v128_xor3( xa0, xc, \
+              v128_mul32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
+           xb3, xb2 ) ); \
   xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
 }

--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -62,7 +62,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,

 #endif

-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE4_1__) || defined(__ARM_NEON)

 typedef struct {
 	v128_t buf[16] __attribute__ ((aligned (64)));
--- a/algo/swifftx/Swifftx_sha3.cpp
+++ b/algo/swifftx/Swifftx_sha3.cpp
@@ -0,0 +1,369 @@
+#include "Swifftx_sha3.h"
+extern "C" {
+#include "SWIFFTX.h"
+}
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+// The default salt value.
+// Taken from the decimal expansion of e (Euler's number): the 19 digits that follow "2.71",
+// i.e. 8281828459045235360.
+// That number written in base 256, most significant byte first (0x72EEF71AC01CAAA0):
+BitSequence SWIF_saltValueChar[SWIF_HAIFA_SALT_SIZE] = {114, 238, 247, 26, 192, 28, 170, 160};
+
+// All the IVs below were produced from the decimal digits of e's expansion.
+// The generating code can be found in 'ProduceRandomIV.c'.
+// The initial value for the 224-bit digest size (65 bytes are listed per IV).
+const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
+{37, 242, 132,   2, 167,  81, 158, 237, 113,  77, 162,  60,  65, 236, 108, 246,
+101,  72, 190, 109,  58, 205,  99,   6, 114, 169, 104, 114,  38, 146, 121, 142,
+ 59,  98, 233,  84,  72, 227,  22, 199,  17, 102, 198, 145,  24, 178,  37,   1,
+215, 245,  66, 120, 230, 193, 113, 253, 165, 218,  66, 134,  49, 231, 124, 204,
+  0};
+
+// The initial value for the 256-bit digest size.
+const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
+{250,  50,  42,  40,  14, 233,  53,  48, 227,  42, 237, 187, 211, 120, 209, 234,
+  27, 144,   4,  61, 243, 244,  29, 247,  37, 162,  70,  11, 231, 196,  53,   6,
+ 193, 240,  94, 126, 204, 132, 104,  46, 114,  29,   3, 104, 118, 184, 201,   3,
+  57,  77,  91, 101,  31, 155,  84, 199, 228,  39, 198,  42, 248, 198, 201, 178,
+   8};
+
+// The initial value for the 384-bit digest size.
+const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
+{40, 145, 193, 100, 205, 171,  47,  76, 254,  10, 196,  41, 165, 207, 200,  79,
+109,  13,  75, 201,  17, 172,  64, 162, 217,  22,  88,  39,  51,  30, 220, 151,
+133,  73, 216, 233, 184, 203,  77,   0, 248,  13,  28, 199,  30, 147, 232, 242,
+227, 124, 169, 174,  14,  45,  27,  87, 254,  73,  68, 136, 135, 159,  83, 152,
+  0};
+
+// The initial value for the 512-bit digest size.
+const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
+{195, 126, 197, 167, 157, 114,  99, 126, 208, 105, 200,  90,  71, 195, 144, 138,
+ 142, 122, 123, 116,  24, 214, 168, 173, 203, 183, 194, 210, 102, 117, 138,  42,
+ 114, 118, 132,  33,  35, 149, 143, 163, 163, 183, 243, 175,  72,  22, 201, 255,
+ 102, 243,  22, 187, 211, 167, 239,  76, 164,  70,  80, 182, 181, 212,   9, 185,
+   0};
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+// NIST API implementation portion.
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+int Swifftx::Init(int hashbitlen)  // Resets swifftxState for a 224/256/384/512-bit digest; returns SUCCESS or BAD_HASHBITLEN.
+{
+	switch(hashbitlen)
+	{
+	case 224:
+		swifftxState.hashbitlen = hashbitlen;
+		// Seed the chaining value (h_0 in HAIFA) with the IV that matches the digest size:
+		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_224, SWIFFTX_OUTPUT_BLOCK_SIZE);
+		break;
+	case 256:
+		swifftxState.hashbitlen = hashbitlen;
+		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_256, SWIFFTX_OUTPUT_BLOCK_SIZE);
+		break;
+	case 384:
+		swifftxState.hashbitlen = hashbitlen;
+		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_384, SWIFFTX_OUTPUT_BLOCK_SIZE);
+		break;
+	case 512:
+		swifftxState.hashbitlen = hashbitlen;
+		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_512, SWIFFTX_OUTPUT_BLOCK_SIZE);
+		break;
+	default:  // Any other digest size is rejected here, before the state below is touched.
+		return BAD_HASHBITLEN;
+	}
+	
+	swifftxState.wasUpdated = false;  // No Update() call has been seen since this reset.
+	swifftxState.remainingSize = 0;  // No buffered input is pending.
+	memset(swifftxState.remaining, 0, SWIF_HAIFA_INPUT_BLOCK_SIZE);
+	memset(swifftxState.numOfBitsChar, 0, SWIF_HAIFA_NUM_OF_BITS_SIZE);  // Processed-bits counter starts at zero.
+	// Initialize the salt with the default value (SWIF_saltValueChar).
+	memcpy(swifftxState.salt, SWIF_saltValueChar, SWIF_HAIFA_SALT_SIZE);
+
+	InitializeSWIFFTX();  // NOTE(review): declared in SWIFFTX.h; presumably builds the SWIFFTX lookup tables - confirm.
+
+	return SUCCESS;
+}
+
+int Swifftx::Update(const BitSequence *data, DataLength databitlen)  // Absorbs databitlen bits from data; returns SUCCESS or INPUT_DATA_NOT_ALIGNED.
+{
+	// Number of bytes taken from 'data' for the current block, i.e. what still fits after the
+	// bytes left over from the previous invocation.
+	int sizeOfInputAfterRemaining = 0;
+	// The input block to compression function of SWIFFTX:
+	BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
+	// Whether at least one full block was compressed during this call.
+	bool wasSingleBlockHandled = false;
+
+	swifftxState.wasUpdated = true;  // Set even if this call returns early or fails below.
+
+	// Handle an empty message as required by NIST. Since 'Final()' is oblivious to the input
+	// (but of course uses the output of the compression function from the previous round,
+	// which is called h_{i-1} in HAIFA article), we have to do nothing here.
+	if (databitlen == 0)
+		return SUCCESS;
+
+    // If an earlier call left a bit count that is not a multiple of 8, return an error.
+    if (swifftxState.remainingSize % 8)
+	{
+    	return INPUT_DATA_NOT_ALIGNED;
+    }
+
+    // Convert remaining size from bits to bytes (it is kept in bits between calls).
+    swifftxState.remainingSize /= 8;
+
+	// As long as we have enough data combined from (remaining + data) to fill input block
+	// (round setup; translated from the Slovak note "NASTAVENIE RUND")
+	while (((databitlen / 8) + swifftxState.remainingSize) >= SWIF_HAIFA_INPUT_BLOCK_SIZE)
+	{
+		// Fill the input block with data:
+		// 1. The output of the previous block:
+		memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
+		// 2. The input part of the block:
+		// 2a. The remaining data from the previous 'Update()' call:
+		if (swifftxState.remainingSize)
+			memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining, 
+				   swifftxState.remainingSize);
+		// 2b. The input data that we have room for after the 'remaining' bytes:
+		sizeOfInputAfterRemaining = SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE 
+								  - ((int) swifftxState.remainingSize) - SWIF_HAIFA_NUM_OF_BITS_SIZE 
+								  - SWIF_HAIFA_SALT_SIZE;
+		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize, 
+			   data, sizeOfInputAfterRemaining);
+
+		// 3. The #bits part of the block (bit count before this block is added):
+		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize 
+			 + sizeOfInputAfterRemaining,
+			   swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
+		// 4. The salt part of the block:
+		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize 
+			 + sizeOfInputAfterRemaining + SWIF_HAIFA_NUM_OF_BITS_SIZE,
+			   swifftxState.salt, SWIF_HAIFA_SALT_SIZE);
+
+		ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, false);
+
+		// Advance the #bits field by one block's worth of bits (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8).
+		AddToCurrInBase256(swifftxState.numOfBitsChar, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
+		wasSingleBlockHandled = true;
+		data += sizeOfInputAfterRemaining;
+		databitlen -= (sizeOfInputAfterRemaining * 8);
+   		swifftxState.remainingSize = 0;  // Leftover bytes were consumed by this block.
+	}
+
+	// Update the swifftxState.remaining and swifftxState.remainingSize.
+    // remainingSize will be in bits after exiting 'Update()'.
+	if (wasSingleBlockHandled)
+	{		
+		swifftxState.remainingSize = (unsigned int) databitlen; // now remaining size is in bits.
+        if (swifftxState.remainingSize)
+			memcpy(swifftxState.remaining, data, (swifftxState.remainingSize + 7) / 8);
+	}
+	else  // No full block was compressed: append the new bytes after those already pending.
+	{
+		memcpy(swifftxState.remaining + swifftxState.remainingSize, data, 
+			   (size_t) (databitlen + 7) / 8);
+		swifftxState.remainingSize = (swifftxState.remainingSize * 8) + (unsigned short) databitlen;  // back to bits
+	}
+
+	return SUCCESS;
+}
+
+int Swifftx::Final(BitSequence *hashval)
+{
+    int i;
+    // Whether to add one last block. True if the padding appended to the last block overflows
+	// the block size.
+    bool toAddFinalBlock = false;
+    bool toPutOneInFinalBlock = false;
+    unsigned short oneShift = 0;
+   	// The size of the last input block before the zeroes padding. We add 1 here because we
+    // include the final '1' bit in the calculation and 7 as we round the length to bytes.
+	unsigned short sizeOfLastInputBlock = (swifftxState.remainingSize + 1 + 7) / 8;
+    // The number of bytes of zero in the padding part.
+	// The padding contains:
+	// 1. A single 1 bit.
+	// 2. As many zeroes as needed.
+	// 3. The message length in bits. Occupies SWIF_HAIFA_NUM_OF_BITS_SIZE bytes.
+	// 4. The digest size. Maximum is 512, so we need 2 bytes.
+	// If the total number achieved is negative, add an additional block, as HAIFA specifies.
+	short numOfZeroBytesInPadding = (short) SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE 
+								  - sizeOfLastInputBlock - (2 * SWIF_HAIFA_NUM_OF_BITS_SIZE) - 2 
+								  - SWIF_HAIFA_SALT_SIZE;
+   	// The input block to compression function of SWIFFTX:
+	BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
+	// The message length in base 256.
+	BitSequence messageLengthChar[SWIF_HAIFA_NUM_OF_BITS_SIZE] = {0};
+   	// The digest size used for padding:
+	unsigned char digestSizeLSB = swifftxState.hashbitlen % 256;
+	unsigned char digestSizeMSB = (swifftxState.hashbitlen - digestSizeLSB) / 256;
+
+	if (numOfZeroBytesInPadding < 1)
+		toAddFinalBlock = true;
+
+	// Fill the input block with data:
+	// 1. The output of the previous block:
+	memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
+	// 2a. The input part of the block, which is the remaining data from the previous 'Update()'
+    //     call, if exists and an extra '1' bit (maybe all we have is this extra 1):
+
+    // Add the last 1 in big-endian convention ...
+    if (swifftxState.remainingSize % 8 == 0)
+	{
+       swifftxState.remaining[sizeOfLastInputBlock - 1] = 0x80;
+    }
+    else 
+	{
+       swifftxState.remaining[sizeOfLastInputBlock - 1] |= (1 << (7 - (swifftxState.remainingSize % 8)));
+    }
+
+	if (sizeOfLastInputBlock)
+		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining, 
+			   sizeOfLastInputBlock);
+    
+   	// Compute the message length in base 256:
+	for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
+        messageLengthChar[i] = swifftxState.numOfBitsChar[i];
+    if (sizeOfLastInputBlock)
+		AddToCurrInBase256(messageLengthChar, sizeOfLastInputBlock * 8);
+
+	if (!toAddFinalBlock)
+	{
+		// 2b. Put the zeroes:
+		memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
+			   0, numOfZeroBytesInPadding);
+		// 2c. Pad the message length:
+		for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
+			currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock 
+						 + numOfZeroBytesInPadding + i] = messageLengthChar[i];
+		// 2d. Pad the digest size:
+		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock 
+					 + numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE] = digestSizeMSB;
+		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock 
+					 + numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE + 1] = digestSizeLSB;
+	}
+	else
+	{
+		// 2b. Put the zeroes, if at all:
+		if ((SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock) > 0)
+		{
+			 memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
+					0, SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock);
+		}
+	}
+
+   	// 3. The #bits part of the block: 
+	memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE, 
+           swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
+	// 4. The salt part of the block:
+	memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE 
+		 + SWIF_HAIFA_NUM_OF_BITS_SIZE, 
+           swifftxState.salt, 
+		   SWIF_HAIFA_SALT_SIZE);
+
+    ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, !toAddFinalBlock); 
+
+	// If we have to add one more block, it is now:
+	if (toAddFinalBlock)
+	{
+		// 1. The previous output block, as usual.
+		memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
+
+		// 2a. Instead of the input, zeroes:
+		memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE , 0, 
+			   SWIF_HAIFA_INPUT_BLOCK_SIZE - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2);
+		// 2b. Instead of the input, the message length:
+		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE 
+			 - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2,
+			   messageLengthChar,
+			   SWIF_HAIFA_NUM_OF_BITS_SIZE);
+		// 2c. Instead of the input, the digest size:
+		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 2] = digestSizeMSB;
+		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 1] = digestSizeLSB;
+		// 3. The #bits part of the block, which is zero in case of additional block:
+		memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
+			   0,
+			   SWIF_HAIFA_NUM_OF_BITS_SIZE);
+		// 4. The salt part of the block:
+		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE 
+			 + SWIF_HAIFA_NUM_OF_BITS_SIZE, 
+               swifftxState.salt, 
+			   SWIF_HAIFA_SALT_SIZE);
+
+        ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, true); 
+	}
+
+	// Finally, copy the result into 'hashval'. In case the digest size is not 512bit, copy the
+	// first hashbitlen of them:
+    for (i = 0; i < (swifftxState.hashbitlen / 8); ++i)
+		hashval[i] = swifftxState.currOutputBlock[i];
+
+	return SUCCESS;
+}
+
+int Swifftx::Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, 
+				BitSequence *hashval)
+{
+	// One-shot hashing: Init(), then Update() over all of 'data', then Final() into 'hashval'.
+	// The first result other than SUCCESS reported by any of these steps is returned as is.
+	const DataLength fullBlockBits = SWIF_HAIFA_INPUT_BLOCK_SIZE * 8;
+	// Offset, in bytes, of the part of 'data' that has not been passed to Update() yet.
+	DataLength consumedBytes = 0;
+	int status = Swifftx::Init(hashbitlen);
+
+	if (status != SUCCESS)
+		return status;
+
+	// Pass on one full block at a time for as long as more than one block's worth of whole
+	// bytes is left; whatever remains afterwards goes to the closing Update() below.
+	while ((databitlen / 8) > SWIF_HAIFA_INPUT_BLOCK_SIZE)
+	{
+		status = Swifftx::Update(data + consumedBytes, fullBlockBits);
+		if (status != SUCCESS)
+			return status;
+
+		consumedBytes += SWIF_HAIFA_INPUT_BLOCK_SIZE;
+		databitlen -= fullBlockBits;
+	}
+
+	// The tail, which may be shorter than (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8) bits.
+	status = Swifftx::Update(data + consumedBytes, databitlen);
+
+	return (status == SUCCESS) ? Swifftx::Final(hashval) : status;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+// Helper function implementation portion.
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+// Adds 'toAdd' in place to 'value', a number stored as SWIF_HAIFA_NUM_OF_BITS_SIZE base-256
+// digits with the most significant digit first (value[0]).
+//
+// Parameters:
+// - value: the digits of the number to increase; overwritten with the sum.
+// - toAdd: the amount to add.
+//
+// A carry out of value[0] is dropped, i.e. the sum wraps modulo
+// 256^SWIF_HAIFA_NUM_OF_BITS_SIZE, and nothing outside 'value' is ever written.
+// Only digits that can still change are visited: the loop stops once no carry is left.
+void Swifftx::AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], 
+							   unsigned short toAdd)
+{
+	// The amount still to be added at the current digit: all of 'toAdd' at first, then the
+	// carry handed over by the previous digit. It never exceeds 65535 + 255, so an
+	// unsigned int holds it without wrapping (a single-byte carry digit could not).
+	unsigned int pending = toAdd;
+	int i;
+
+	// Go from the least significant digit (last index) towards value[0]; the explicit
+	// 'i >= 0' bound is what discards a carry out of the most significant digit.
+	for (i = SWIF_HAIFA_NUM_OF_BITS_SIZE - 1; (i >= 0) && (pending != 0); --i)
+	{
+		pending += value[i];
+		// Keep the low byte as this digit ...
+		value[i] = (BitSequence) (pending % 256);
+		// ... and pass everything above it on to the next, more significant digit.
+		pending /= 256;
+	}
+}
--- a/algo/swifftx/Swifftx_sha3.h
+++ b/algo/swifftx/Swifftx_sha3.h
@@ -0,0 +1,79 @@
+#ifndef SWIFFTX_SHA3_H
+#define SWIFFTX_SHA3_H
+
+#include "sha3_interface.h"
+#include "stdbool.h"
+#include "stdint.h"
+
+class Swifftx : public SHA3 { // Incremental (Init/Update/Final) and one-shot (Hash) hashing.
+// NOTE(review): sha3_interface.h as added here declares sha3::sha3_interface, not 'SHA3' - confirm.
+#define SWIFFTX_INPUT_BLOCK_SIZE 256
+#define SWIFFTX_OUTPUT_BLOCK_SIZE 65
+#define SWIF_HAIFA_SALT_SIZE 8
+#define SWIF_HAIFA_NUM_OF_BITS_SIZE 8
+#define SWIF_HAIFA_INPUT_BLOCK_SIZE (SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE \
+							  - SWIF_HAIFA_NUM_OF_BITS_SIZE - SWIF_HAIFA_SALT_SIZE)
+// The sizes above are used as element counts of BitSequence (unsigned char) arrays.
+	typedef unsigned char BitSequence;
+//const DataLength SWIF_SALT_VALUE;
+
+#define SWIF_HAIFA_IV 0
+
+/*const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE];
+const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE];
+const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE];
+const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE];*/
+// Result codes; SUCCESS is the value Hash() tests every step against.
+typedef enum 
+{ 
+	SUCCESS = 0,
+	FAIL = 1,
+	BAD_HASHBITLEN = 2,
+	BAD_SALT_SIZE = 3,
+	SET_SALT_VALUE_FAILED = 4,
+	INPUT_DATA_NOT_ALIGNED = 5
+} HashReturn;
+// The hashing state that Final() reads and updates.
+typedef struct hashState {
+	unsigned short hashbitlen; // Digest length in bits; Final() copies hashbitlen / 8 bytes out.
+
+	// The data remaining after the most recent call to 'Update()'.
+	BitSequence remaining[SWIF_HAIFA_INPUT_BLOCK_SIZE + 1];
+
+	// The size of the remaining data in bits.
+	// Is 0 in case there is no remaining data at all.
+	unsigned int remainingSize;
+
+	// The current output of the compression function. At the end will contain the final digest
+	// (which may be needed to be truncated, depending on hashbitlen).
+	BitSequence currOutputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE];
+
+	// The value of '#bits hashed so far' field in HAIFA, in base 256.
+	BitSequence numOfBitsChar[SWIF_HAIFA_NUM_OF_BITS_SIZE];
+
+	// The salt value currently in use:
+	BitSequence salt[SWIF_HAIFA_SALT_SIZE];
+
+	// Indicates whether a single 'Update()' occurred.
+	// After a call to 'Update()' the key and the salt values cannot be changed.
+	bool wasUpdated;
+} hashState;
+
+private:
+int swifftxNumRounds;
+hashState swifftxState;
+
+
+public:
+int Init(int hashbitlen);
+int Update(const BitSequence *data, DataLength databitlen);
+int Final(BitSequence *hashval);
+int Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, 
+				BitSequence *hashval);
+// Helper: adds 'toAdd' in place to the base-256 number stored in 'value'.
+private:
+static void AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], unsigned short toAdd);
+
+};
+
+#endif
--- a/algo/swifftx/hash_interface.h
+++ b/algo/swifftx/hash_interface.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <cstdint>
+
+namespace hash {
+
+// Element type of data/digest buffers, and the type used for lengths given in bits.
+typedef unsigned char BitSequence;
+typedef unsigned long long DataLength;
+
+// Abstract hashing interface: every operation is pure virtual and returns an int.
+struct hash_interface {
+    virtual ~hash_interface() = default;
+    virtual int Init(int hash_bitsize) = 0;
+    virtual int Update(const BitSequence *data, DataLength data_bitsize) = 0;
+    virtual int Final(BitSequence *hash) = 0;
+    // Takes the parameters of Init(), Update() and Final() together in a single call.
+    virtual int Hash(int hash_bitsize, const BitSequence *data, DataLength data_bitsize,
+                     BitSequence *hash) = 0;
+};
+} // namespace hash
--- a/algo/swifftx/sha3_interface.h
+++ b/algo/swifftx/sha3_interface.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cstdint>
+//#include <streams/hash/hash_interface.h>
+#include "hash_interface.h"
+
+namespace sha3 {
+// Make the shared hash:: types available under the sha3 namespace as well.
+typedef hash::BitSequence BitSequence;
+typedef hash::DataLength DataLength;
+// Derives from hash::hash_interface without adding any member of its own.
+struct sha3_interface : hash::hash_interface {};
+
+} // namespace sha3
--- a/algo/x16/hex.c
+++ b/algo/x16/hex.c
@@ -23,12 +23,13 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
   *sptr = '\0';
 }

+static __thread x16r_context_overlay hex_ctx;

 int hex_hash( void* output, const void* input, int thrid )
 {
   uint32_t _ALIGN(128) hash[16];
   x16r_context_overlay ctx;
-   memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
+   memcpy( &ctx, &hex_ctx, sizeof(ctx) );
   void *in = (void*) input;
   int size = 80;

@@ -86,7 +87,7 @@ int hex_hash( void* output, const void* input, int thrid )
         case LUFFA:
            if ( i == 0 )
            {
-              update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
+              update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
            }
            else
            {
@@ -96,7 +97,7 @@ int hex_hash( void* output, const void* input, int thrid )
            break;
         case CUBEHASH:
            if ( i == 0 )
-               cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
+               cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
            else
            {
               cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -204,32 +205,32 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
   switch ( algo )
   {
      case JH:
-         sph_jh512_init( &x16r_ref_ctx.jh );
-         sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
+         sph_jh512_init( &hex_ctx.jh );
+         sph_jh512( &hex_ctx.jh, edata, 64 );
      break;
      case SKEIN:
-         sph_skein512_init( &x16r_ref_ctx.skein );
-         sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
+         sph_skein512_init( &hex_ctx.skein );
+         sph_skein512( &hex_ctx.skein, edata, 64 );
      break;
      case LUFFA:
-         init_luffa( &x16r_ref_ctx.luffa, 512 );
-         update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
+         init_luffa( &hex_ctx.luffa, 512 );
+         update_luffa( &hex_ctx.luffa, edata, 64 );
      break;
      case CUBEHASH:
-         cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
+         cubehashInit( &hex_ctx.cube, 512, 16, 32 );
+         cubehashUpdate( &hex_ctx.cube, edata, 64 );
      break;
      case HAMSI:
-         sph_hamsi512_init( &x16r_ref_ctx.hamsi );
-         sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 64 );
+         sph_hamsi512_init( &hex_ctx.hamsi );
+         sph_hamsi512( &hex_ctx.hamsi, edata, 64 );
      break;
      case SHABAL:
-         sph_shabal512_init( &x16r_ref_ctx.shabal );
-         sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
+         sph_shabal512_init( &hex_ctx.shabal );
+         sph_shabal512( &hex_ctx.shabal, edata, 64 );
      break;
      case WHIRLPOOL:
-         sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
-         sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
+         sph_whirlpool_init( &hex_ctx.whirlpool );
+         sph_whirlpool( &hex_ctx.whirlpool, edata, 64 );
      break;
   }
   
--- a/algo/x16/minotaur.c
+++ b/algo/x16/minotaur.c
@@ -27,7 +27,7 @@
 #else
  #include "algo/echo/sph_echo.h"
 #endif
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #else
  #include "algo/groestl/sph_groestl.h"
@@ -50,7 +50,7 @@ typedef struct TortureGarden TortureGarden;
 // Graph of hash algos plus SPH contexts
 struct TortureGarden
 {
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
   hashState_groestl       groestl;
 #else
   sph_groestl512_context  groestl;
@@ -123,7 +123,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
 #endif
 	         break;
        case 5:
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
            groestl512_full( &garden->groestl, hash, input, 512 );
 #else
            sph_groestl512_init( &garden->groestl) ;
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -19,12 +19,12 @@
 // Perform midstate prehash of hash functions with block size <= 72 bytes,
 // 76 bytes for hash functions that operate on 32 bit data.

-void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
+void x16r_8way_prehash( void *vdata, void *pdata )
 {
   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));

-   const char elem = hash_order[0];
+   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

   switch ( algo )
@@ -110,8 +110,7 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
 // Called by wrapper hash function to optionally continue hashing and
 // convert to final hash.

-int x16r_8way_hash_generic( void* output, const void* input, int thrid,
-     const char *hash_order, const int func_count )
+int x16r_8way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*8] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (16)));
@@ -137,9 +136,9 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 input, 640 );

-   for ( int i = 0; i < func_count; i++ )
+   for ( int i = 0; i < 16; i++ )
   {
-      const char elem = hash_order[i];
+      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
@@ -475,8 +474,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
 int x16r_8way_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*8] __attribute__ ((aligned (128)));
-   if ( !x16r_8way_hash_generic( hash, input, thrid, x16r_hash_order, 
-                                 X16R_HASH_FUNC_COUNT ) )
+   if ( !x16r_8way_hash_generic( hash, input, thrid ) )
      return 0;

   memcpy( output,     hash,     32 );
@@ -497,6 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -509,18 +508,21 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,

   if ( bench )   ptarget[7] = 0x0cff;

-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      saved_height = work->height;
-      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x16r_hash_order );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+
+      if ( opt_debug && !thr_id )
+          applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
   }

-   x16r_8way_prehash( vdata, pdata, x16r_hash_order );
+   x16r_8way_prehash( vdata, pdata );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
@@ -544,12 +546,12 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,

 #elif defined (X16R_4WAY)

-void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
+void x16r_4way_prehash( void *vdata, void *pdata )
 {
   uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));

-   const char elem = hash_order[0];
+   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

   switch ( algo )
@@ -625,8 +627,7 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
   }
 }

-int x16r_4way_hash_generic( void* output, const void* input, int thrid,
-                            const char *hash_order, const int func_count )
+int x16r_4way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*4] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -643,9 +644,9 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,

   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );

-   for ( int i = 0; i < func_count; i++ )
+   for ( int i = 0; i < 16; i++ )
   {
-      const char elem = hash_order[i];
+      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
@@ -907,8 +908,7 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
 int x16r_4way_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*4] __attribute__ ((aligned (64)));
-   if ( !x16r_4way_hash_generic( hash, input, thrid, x16r_hash_order,
-                                 X16R_HASH_FUNC_COUNT ) )
+   if ( !x16r_4way_hash_generic( hash, input, thrid ) )
      return 0;

   memcpy( output,     hash,     32 );
@@ -924,6 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -936,18 +937,20 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0cff;

-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      saved_height = work->height;
-      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x16r_hash_order );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
   }

-   x16r_4way_prehash( vdata, pdata, x16r_hash_order );
+   x16r_4way_prehash( vdata, pdata );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
@@ -970,10 +973,10 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,

 #elif defined (X16R_2WAY)

-void x16r_2x64_prehash( void *vdata, void *pdata, const char *hash_order )
+void x16r_2x64_prehash( void *vdata, void *pdata )
 {
   uint32_t edata[20] __attribute__ ((aligned (64)));
-   const char elem = hash_order[0];
+   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

   switch ( algo )
@@ -1009,7 +1012,7 @@ void x16r_2x64_prehash( void *vdata, void *pdata, const char *hash_order )
      }
      break;
      case HAMSI:
-#if defined(__SSE4_2__) || defined(__ARM_NEON)
+#if defined(__SSE4_2__)
         v128_bswap32_intrlv80_2x64( vdata, pdata );
         hamsi512_2x64_init( &x16r_ctx.hamsi );
         hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 );
@@ -1048,8 +1051,7 @@ void x16r_2x64_prehash( void *vdata, void *pdata, const char *hash_order )
   }
 }

-int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
-                            const char *hash_order, const int func_count )
+int x16r_2x64_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*2] __attribute__ ((aligned (64)));
   uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -1062,9 +1064,9 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,

   dintrlv_2x64( hash0, hash1, input, 640 );

-   for ( int i = 0; i < func_count; i++ )
+   for ( int i = 0; i < 16; i++ )
   {
-      const char elem = hash_order[i];
+      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
@@ -1092,7 +1094,7 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case GROESTL:
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__)  // || defined(__ARM_FEATURE_AES)
            groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
            groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
 #else
@@ -1142,7 +1144,6 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
            if ( i == 0 )
            {
               update_and_final_luffa( &ctx.luffa, hash0, in0 + 64, 16 );
-               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
               update_and_final_luffa( &ctx.luffa, hash1, in1 + 64, 16 );
            }
            else
@@ -1155,7 +1156,6 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
            if ( i == 0 )
            {
               cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
-               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
               cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
            }
            else
@@ -1173,7 +1173,7 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
            simd512_ctx( &ctx.simd, hash1, in1, size );
         break;
         case ECHO:
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__)
            echo_full( &ctx.echo, hash0, 512, in0, size );
            echo_full( &ctx.echo, hash1, 512, in1, size );
 #else
@@ -1311,8 +1311,7 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
 int x16r_2x64_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*2] __attribute__ ((aligned (64)));
-   if ( !x16r_2x64_hash_generic( hash, input, thrid, x16r_hash_order,
-                                 X16R_HASH_FUNC_COUNT ) )
+   if ( !x16r_2x64_hash_generic( hash, input, thrid ) )
      return 0;

   memcpy( output,     hash,     32 );
@@ -1326,6 +1325,7 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*2] __attribute__ ((aligned (64)));
   uint32_t vdata[20*2] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -1338,18 +1338,20 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0cff;

-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      saved_height = work->height;
-      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x16r_hash_order );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
   }

-   x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
+   x16r_2x64_prehash( vdata, pdata );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -5,15 +5,15 @@ __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };

 void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;

-#if defined(X16R_8WAY)
+#if defined (X16R_8WAY)

 __thread x16r_8way_context_overlay x16r_ctx;

-#elif defined(X16R_4WAY)
+#elif defined (X16R_4WAY)

 __thread x16r_4way_context_overlay x16r_ctx;

-#elif defined(X16R_2WAY)
+#elif defined (X16R_2WAY)

 __thread x16r_2x64_context_overlay x16r_ctx;

@@ -55,13 +55,13 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )

 bool register_x16r_algo( algo_gate_t* gate )
 {
-#if defined(X16R_8WAY)
+#if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16r_8way;
  gate->hash      = (void*)&x16r_8way_hash;
-#elif defined(X16R_4WAY)
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
-#elif defined(X16R_2WAY)
+#elif defined (X16R_2WAY)
  gate->scanhash  = (void*)&scanhash_x16r_2x64;
  gate->hash      = (void*)&x16r_2x64_hash;
 #else
@@ -77,13 +77,13 @@ bool register_x16r_algo( algo_gate_t* gate )

 bool register_x16rv2_algo( algo_gate_t* gate )
 {
-#if defined(X16RV2_8WAY)
+#if defined (X16RV2_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_8way;
  gate->hash      = (void*)&x16rv2_8way_hash;
-#elif defined(X16RV2_4WAY)
+#elif defined (X16RV2_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
  gate->hash      = (void*)&x16rv2_4way_hash;
-#elif defined(X16RV2_2WAY)
+#elif defined (X16RV2_2WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_2x64;
  gate->hash      = (void*)&x16rv2_2x64_hash;
 #else
@@ -99,13 +99,13 @@ bool register_x16rv2_algo( algo_gate_t* gate )

 bool register_x16s_algo( algo_gate_t* gate )
 {
-#if defined(X16R_8WAY)
+#if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16r_8way;
  gate->hash      = (void*)&x16r_8way_hash;
-#elif defined(X16R_4WAY)
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
-#elif defined(X16R_2WAY)
+#elif defined (X16R_2WAY)
  gate->scanhash  = (void*)&scanhash_x16r_2x64;
  gate->hash      = (void*)&x16r_2x64_hash;
 #else
@@ -235,13 +235,13 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_x16rt_algo( algo_gate_t* gate )
 {
-#if defined(X16RT_8WAY)
+#if defined (X16RT_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16r_8way_hash;
-#elif defined(X16RT_4WAY)
+#elif defined (X16RT_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16r_4way_hash;
-#elif defined(X16RT_2WAY)
+#elif defined (X16RT_2WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_2x64;
  gate->hash      = (void*)&x16r_2x64_hash;
 #else
@@ -256,13 +256,13 @@ bool register_x16rt_algo( algo_gate_t* gate )

 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
-#if defined(X16RT_8WAY)
+#if defined (X16RT_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16r_8way_hash;
-#elif defined(X16RT_4WAY)
+#elif defined (X16RT_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16r_4way_hash;
-#elif defined(X16RT_2WAY)
+#elif defined (X16RT_2WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_2x64;
  gate->hash      = (void*)&x16r_2x64_hash;
 #else
@@ -296,15 +296,15 @@ bool register_hex_algo( algo_gate_t* gate )

 bool register_x21s_algo( algo_gate_t* gate )
 {
-#if defined(X21S_8WAY)
+#if defined (X21S_8WAY)
  gate->scanhash          = (void*)&scanhash_x21s_8way;
  gate->hash              = (void*)&x21s_8way_hash;
  gate->miner_thread_init = (void*)&x21s_8way_thread_init;
-#elif defined(X21S_4WAY)
+#elif defined (X21S_4WAY)
  gate->scanhash          = (void*)&scanhash_x21s_4way;
  gate->hash              = (void*)&x21s_4way_hash;
  gate->miner_thread_init = (void*)&x21s_4way_thread_init;
-#elif defined(X21S_2WAY)
+#elif defined (X21S_2WAY)
  gate->scanhash          = (void*)&scanhash_x21s_2x64;
  gate->hash              = (void*)&x21s_2x64_hash;
  gate->miner_thread_init = (void*)&x21s_2x64_thread_init;
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -55,7 +55,7 @@
  #define X16R_8WAY   1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X16R_4WAY   1
-#elif defined(__SSE2__) || defined(__ARM_NEON)
+#elif defined(__SSE2__) || defined(__ARM_NEON__)
  #define X16R_2WAY   1
 #endif

@@ -63,7 +63,7 @@
  #define X16RV2_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X16RV2_4WAY 1
-#elif defined(__SSE2__) || defined(__ARM_NEON)
+#elif defined(__SSE2__) || defined(__ARM_NEON__)
  #define X16RV2_2WAY 1
 #endif

@@ -72,7 +72,7 @@
  #define X16RT_8WAY  1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X16RT_4WAY  1
-#elif defined(__SSE2__) || defined(__ARM_NEON)
+#elif defined(__SSE2__) || defined(__ARM_NEON__)
  #define X16RT_2WAY  1
 #endif

@@ -80,7 +80,7 @@
  #define X21S_8WAY   1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X21S_4WAY   1
-#elif defined(__SSE2__) || defined(__ARM_NEON)
+#elif defined(__SSE2__) || defined(__ARM_NEON__)
  #define X21S_2WAY   1
 #endif

@@ -149,23 +149,18 @@ union _x16r_8way_context_overlay
    hashState_echo          echo;
 #endif
 } __attribute__ ((aligned (64)));
-#define  _x16r_8x64_context_overlay _x16r_8way_context_overlay

 typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
-#define  x16r_8x64_context_overlay x16r_8way_context_overlay

 extern __thread x16r_8way_context_overlay x16r_ctx;

-void x16r_8way_prehash( void *, void *, const char * );
-int x16r_8way_hash_generic( void *, const void *, int, const char*, const int );
+void x16r_8way_prehash( void *, void * );
+int x16r_8way_hash_generic( void *, const void *, int );
 int x16r_8way_hash( void *, const void *, int );
 int scanhash_x16r_8way( struct work *, uint32_t ,
                        uint64_t *, struct thr_info * );
+extern __thread x16r_8way_context_overlay x16r_ctx;

-#define x16r_8x64_prehash         x16r_8way_prehash
-#define x16r_8x64_hash_generic    x16r_8way_hash_generic
-#define x16r_8x64_hash            x16r_8way_hash
-#define scanhash_x16r_8x64        scanhash_x16r_8x64

 #elif defined(X16R_4WAY)

@@ -194,23 +189,17 @@ union _x16r_4way_context_overlay
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
 } __attribute__ ((aligned (64)));
-#define  _x16r_4x64_context_overlay _x16r_4way_context_overlay

 typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
-#define  x16r_4x64_context_overlay x16r_4way_context_overlay

 extern __thread x16r_4way_context_overlay x16r_ctx;

-void x16r_4way_prehash( void *, void *, const char * );
-int x16r_4way_hash_generic( void *, const void *, int, const char*, const int );
+void x16r_4way_prehash( void *, void * );
+int x16r_4way_hash_generic( void *, const void *, int );
 int x16r_4way_hash( void *, const void *, int );
 int scanhash_x16r_4way( struct work *, uint32_t,
                        uint64_t *, struct thr_info * );
-
-#define x16r_4x64_prehash         x16r_4way_prehash
-#define x16r_4x64_hash_generic    x16r_4way_hash_generic
-#define x16r_4x64_hash            x16r_4way_hash
-#define scanhash_x16r_4x64        scanhash_x16r_4x64
+extern __thread x16r_4way_context_overlay x16r_ctx;

 #elif defined(X16R_2WAY)

@@ -218,7 +207,7 @@ union _x16r_2x64_context_overlay
 {
    blake512_2x64_context   blake;
    bmw512_2x64_context     bmw;
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
    hashState_groestl       groestl;
 #else
    sph_groestl512_context  groestl;
@@ -252,8 +241,8 @@ union _x16r_2x64_context_overlay

 typedef union _x16r_2x64_context_overlay x16r_2x64_context_overlay;

-void x16r_2x64_prehash( void *, void *, const char * );
-int x16r_2x64_hash_generic( void *, const void *, int, const char*, const int );
+void x16r_2x64_prehash( void *, void * );
+int x16r_2x64_hash_generic( void *, const void *, int );
 int x16r_2x64_hash( void *, const void *, int );
 int scanhash_x16r_2x64( struct work *, uint32_t,
                        uint64_t *, struct thr_info * );
@@ -261,7 +250,6 @@ extern __thread x16r_2x64_context_overlay x16r_ctx;

 #endif

-// need a reference, add hooks for SSE2.
 // needed for hex
 union _x16r_context_overlay
 {
@@ -284,7 +272,11 @@ union _x16r_context_overlay
 #else
        sph_echo512_context     echo;
 #endif
+#if defined(__SSE4_2__) || defined(__ARM_NEON)
+        hamsi_2x64_context      hamsi;
+#else
        sph_hamsi512_context    hamsi;
+#endif
 #if defined(__AES__)
        hashState_fugue         fugue;
 #else
@@ -299,8 +291,8 @@ typedef union _x16r_context_overlay x16r_context_overlay;

 extern __thread x16r_context_overlay x16r_ref_ctx;

-void x16r_prehash( void *, void *, const char * );
-int x16r_hash_generic( void *, const void *, int, const char*, const int );
+void x16r_prehash( void *, void * );
+int x16r_hash_generic( void *, const void *, int );
 int x16r_hash( void *, const void *, int );
 int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );

--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -10,9 +10,9 @@
 #include <stdlib.h>
 #include <string.h>

-void x16r_prehash( void *edata, void *pdata, const char *hash_order )
+void x16r_prehash( void *edata, void *pdata )
 {
-   const char elem = hash_order[0];
+   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

   switch ( algo )
@@ -52,18 +52,17 @@ void x16r_prehash( void *edata, void *pdata, const char *hash_order )
   }
 }

-int x16r_hash_generic( void* output, const void* input, int thrid, 
-                       const char *hash_order, const int func_count )
+int x16r_hash_generic( void* output, const void* input, int thrid )
 {
-   uint32_t _ALIGN(32) hash[16];
+   uint32_t _ALIGN(128) hash[16];
   x16r_context_overlay ctx;
   memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
   void *in = (void*) input;
   int size = 80;

-   for ( int i = 0; i < func_count; i++ )
+   for ( int i = 0; i < 16; i++ )
   {
-      const char elem = hash_order[i];
+      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
@@ -75,8 +74,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid,
         break;
         case BMW:
            sph_bmw512_init( &ctx.bmw );
-            sph_bmw512( &ctx.bmw, in, size );
-            sph_bmw512_close( &ctx.bmw, hash );
+            sph_bmw512(&ctx.bmw, in, size);
+            sph_bmw512_close(&ctx.bmw, hash);
         break;
         case GROESTL:
 #if defined(__AES__)  // || defined(__ARM_FEATURE_AES)
@@ -89,13 +88,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid,
         break;
         case JH:
            if ( i == 0 )
-               sph_jh512( &ctx.jh, in+64, 16 );
+               sph_jh512(&ctx.jh, in+64, 16 );
            else
            {
               sph_jh512_init( &ctx.jh );
-               sph_jh512( &ctx.jh, in, size );
+               sph_jh512(&ctx.jh, in, size );
            }
-            sph_jh512_close( &ctx.jh, hash );
+            sph_jh512_close(&ctx.jh, hash );
         break;
         case KECCAK:
            if ( i == 0 )
@@ -109,7 +108,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid,
         break;
         case SKEIN:
            if ( i == 0 )
-               sph_skein512( &ctx.skein, in+64, 16 );
+               sph_skein512(&ctx.skein, in+64, 16 );
            else
            {
               sph_skein512_init( &ctx.skein );
@@ -134,8 +133,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid,
         break;
         case SIMD:
            sph_simd512_init( &ctx.simd );
-            sph_simd512( &ctx.simd, hash, size );
-            sph_simd512_close( &ctx.simd, hash );
+            sph_simd512(&ctx.simd, hash, 64);
+            sph_simd512_close(&ctx.simd, hash);
         break;
         case ECHO:
 #if defined(__AES__)
@@ -148,7 +147,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid,
         break;
         case HAMSI:
            if ( i == 0 )
-               sph_hamsi512( &ctx.hamsi, in+72, 8 );
+               sph_hamsi512( &ctx.hamsi, in+64, 16 );
            else
            {
               sph_hamsi512_init( &ctx.hamsi );
@@ -197,8 +196,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid,
 int x16r_hash( void* output, const void* input, int thrid )
 {  
   uint8_t hash[64] __attribute__ ((aligned (64)));
-   if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order, 
-                            X16R_HASH_FUNC_COUNT ) )
+   if ( !x16r_hash_generic( hash, input, thrid ) )
      return 0;
   
    memcpy( output, hash, 32 );
@@ -208,8 +206,8 @@ int x16r_hash( void* output, const void* input, int thrid )
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t _ALIGN(32) hash32[8];
-   uint32_t _ALIGN(32) edata[20];
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -231,7 +229,7 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
           applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

-   x16r_prehash( edata, pdata, x16r_hash_order );
+   x16r_prehash( edata, pdata );

   do
   {
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -30,12 +30,12 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
-      if ( !opt_quiet && !thr_id )
-          applog( LOG_INFO, "Hash order %s, Ntime %08x",
-                            x16r_hash_order, bswap_32( pdata[17] ) );
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }

-   x16r_8way_prehash( vdata, pdata, x16r_hash_order );
+   x16r_8way_prehash( vdata, pdata );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
@@ -84,12 +84,12 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
-      if ( !opt_quiet && !thr_id )
-          applog( LOG_INFO, "Hash order %s, Ntime %08x",
-                            x16r_hash_order, bswap_32( pdata[17] ) );
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }

-   x16r_4way_prehash( vdata, pdata, x16r_hash_order );
+   x16r_4way_prehash( vdata, pdata );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
@@ -137,12 +137,12 @@ int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
-      if ( !opt_quiet && !thr_id )
-          applog( LOG_INFO, "Hash order %s, Ntime %08x",
-                            x16r_hash_order, bswap_32( pdata[17] ) );
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }

-   x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
+   x16r_2x64_prehash( vdata, pdata );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -31,7 +31,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                        x16r_hash_order, swab32( pdata[17] ), timeHash );
   }
   
-   x16r_prehash( edata, pdata, x16r_hash_order );
+   x16r_prehash( edata, pdata );
   
   do
   {
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -395,7 +395,7 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
         break;
         case HAMSI:
            if ( i == 0 )
-               hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
+               hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -409,43 +409,14 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
                          hash7, vhash );
         break;
         case FUGUE:
-            if ( i == 0 )
-            {
-               fugue512_update( &ctx.fugue, in0 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash0 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
-               fugue512_update( &ctx.fugue, in1 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash1 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
-               fugue512_update( &ctx.fugue, in2 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash2 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
-               fugue512_update( &ctx.fugue, in3 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash3 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) ); 
-               fugue512_update( &ctx.fugue, in4 + 76, 4 ); 
-               fugue512_final( &ctx.fugue, hash4 ); 
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) ); 
-               fugue512_update( &ctx.fugue, in5 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash5 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
-               fugue512_update( &ctx.fugue, in6 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash6 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
-               fugue512_update( &ctx.fugue, in7 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash7 );
-            }
-            else
-            {
-               fugue512_full( &ctx.fugue, hash0, hash0, size );
-               fugue512_full( &ctx.fugue, hash1, hash1, size );
-               fugue512_full( &ctx.fugue, hash2, hash2, size );
-               fugue512_full( &ctx.fugue, hash3, hash3, size );
-               fugue512_full( &ctx.fugue, hash4, hash4, size );
-               fugue512_full( &ctx.fugue, hash5, hash5, size );
-               fugue512_full( &ctx.fugue, hash6, hash6, size );
-               fugue512_full( &ctx.fugue, hash7, hash7, size );
-            }
+            fugue512_full( &ctx.fugue, hash0, in0, size );
+            fugue512_full( &ctx.fugue, hash1, in1, size );
+            fugue512_full( &ctx.fugue, hash2, in2, size );
+            fugue512_full( &ctx.fugue, hash3, in3, size );
+            fugue512_full( &ctx.fugue, hash4, in4, size );
+            fugue512_full( &ctx.fugue, hash5, in5, size );
+            fugue512_full( &ctx.fugue, hash6, in6, size );
+            fugue512_full( &ctx.fugue, hash7, in7, size );
         break;
         case SHABAL:
            intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -593,6 +564,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -605,15 +577,19 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,

   if ( bench ) ptarget[7] = 0x0cff;

-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      saved_height = work->height;
-      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x16r_hash_order );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

   // Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -650,14 +626,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
      case HAMSI:
         mm512_bswap32_intrlv80_8x64( vdata, pdata );
         hamsi512_8way_init( &x16rv2_ctx.hamsi );
-         hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
-      break;
-      case FUGUE:
-         v128_bswap32_80( edata, pdata );
-         fugue512_init( &x16rv2_ctx.fugue );
-         fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
+         hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
      break;
      case SHABAL:
         mm256_bswap32_intrlv80_8x32( vdata2, pdata );
@@ -855,8 +824,8 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               skein512_4way_init( &ctx.skein );
               skein512_4way_update( &ctx.skein, vhash, size );
-               skein512_4way_close( &ctx.skein, vhash );
            }
+            skein512_4way_close( &ctx.skein, vhash );
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case LUFFA:
@@ -976,7 +945,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
         break;
         case HAMSI:
            if ( i == 0 )
-               hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
+               hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -987,27 +956,10 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case FUGUE:
-            if ( i == 0 )
-            {
-               fugue512_update( &ctx.fugue, in0 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash0 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
-               fugue512_update( &ctx.fugue, in1 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash1 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
-               fugue512_update( &ctx.fugue, in2 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash2 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
-               fugue512_update( &ctx.fugue, in3 + 76, 4 );
-               fugue512_final( &ctx.fugue, hash3 );
-            }
-            else
-            {
-               fugue512_full( &ctx.fugue, hash0, hash0, size );
-               fugue512_full( &ctx.fugue, hash1, hash1, size );
-               fugue512_full( &ctx.fugue, hash2, hash2, size );
-               fugue512_full( &ctx.fugue, hash3, hash3, size );
-            }
+            fugue512_full( &ctx.fugue, hash0, in0, size );
+            fugue512_full( &ctx.fugue, hash1, in1, size );
+            fugue512_full( &ctx.fugue, hash2, in2, size );
+            fugue512_full( &ctx.fugue, hash3, in3, size );
         break;
         case SHABAL:
             intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
@@ -1103,6 +1055,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
   uint32_t edata[20];
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -1115,15 +1068,17 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0fff;
   
-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32(pdata[17]);
+   if ( s_ntime != ntime )
   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      saved_height = work->height;
-      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x16r_hash_order );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

   // Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -1146,7 +1101,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
      break;
      case SKEIN:
         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
+         skein512_4way_prehash64( &x16r_ctx.skein, vdata );
      break;
      case CUBEHASH:
         v128_bswap32_80( edata, pdata );
@@ -1157,13 +1112,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
      case HAMSI:
         mm256_bswap32_intrlv80_4x64( vdata, pdata );
         hamsi512_4way_init( &x16rv2_ctx.hamsi );
-         hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
-      break;
-      case FUGUE:
-         v128_bswap32_80( edata, pdata );
-         fugue512_init( &x16rv2_ctx.fugue );
-         fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
      break;
      case SHABAL:
         v128_bswap32_intrlv80_4x32( vdata32, pdata );
@@ -1208,7 +1157,7 @@ union _x16rv2_2x64_context_overlay
 {
    blake512_2x64_context   blake;
    bmw512_2x64_context     bmw;
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
    hashState_groestl       groestl;
 #else
    sph_groestl512_context  groestl;
@@ -1294,7 +1243,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case GROESTL:
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__)
            groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
            groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
 #else
@@ -1308,7 +1257,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
         break;
         case JH:
            if ( i == 0 )
-               jh512_2x64_update( &ctx.jh, input + (64<<1), 16 );
+               jh512_2x64_update( &ctx.jh, input + (64<<2), 16 );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
@@ -1347,12 +1296,14 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
         break;
         case SKEIN:
            if ( i == 0 )
-               skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
+               skein512_2x64_final16( &ctx.skein, vhash, input + (64*4) );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
-               skein512_2x64_full( &ctx.skein, vhash, vhash,  size );
+               skein512_2x64_init( &ctx.skein );
+               skein512_2x64_update( &ctx.skein, vhash, size );
            }
+            skein512_2x64_close( &ctx.skein, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case LUFFA:
@@ -1375,14 +1326,13 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
            }
            for ( int i = (24/4); i < (64/4); i++ )
                hash0[i] = hash1[i] = 0;
-            luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
-            luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
+            luffa_full( &ctx.luffa, hash0, 512, hash0, size );
+            luffa_full( &ctx.luffa, hash1, 512, hash1, size );
         break;
         case CUBEHASH:
            if ( i == 0 )
            {
               cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
               cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
            }
            else
@@ -1400,7 +1350,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
            simd512_ctx( &ctx.simd, hash1, in1, size );
         break;
         case ECHO:
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__)
            echo_full( &ctx.echo, hash0, 512, in0, size );
            echo_full( &ctx.echo, hash1, 512, in1, size );
 #else
@@ -1429,7 +1379,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
            {
               sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
               sph_hamsi512_close( &ctx.hamsi, hash0 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
+               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
               sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
               sph_hamsi512_close( &ctx.hamsi, hash1 );
            }
@@ -1450,7 +1400,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
            {
               fugue512_update( &ctx.fugue, in0 + 76, 4 );
               fugue512_final( &ctx.fugue, hash0 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
+               memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in1 + 76, 4 );
               fugue512_final( &ctx.fugue, hash1 );
            }
@@ -1464,7 +1414,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
            {
               sph_fugue512( &ctx.fugue, in0 + 76, 4 );
               sph_fugue512_close( &ctx.fugue, hash0 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(sph_fugue512_context) );
+               memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) );
               sph_fugue512( &ctx.fugue, in1 + 76, 4 );
               sph_fugue512_close( &ctx.fugue, hash1 );
            }
@@ -1480,7 +1430,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
            {
               sph_shabal512( &ctx.shabal, in0 + 64, 16 );
               sph_shabal512_close( &ctx.shabal, hash0 );
-               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
+               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
               sph_shabal512( &ctx.shabal, in1 + 64, 16 );
               sph_shabal512_close( &ctx.shabal, hash1 );
            }
@@ -1542,6 +1492,7 @@ int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
   uint32_t hash[2*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t edata[20];
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -1554,15 +1505,17 @@ int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0fff;

-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32(pdata[17]);
+   if ( s_ntime != ntime )
   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      saved_height = work->height;
-      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x16r_hash_order );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

   // Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -1585,7 +1538,7 @@ int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
      break;
      case SKEIN:
         v128_bswap32_intrlv80_2x64( vdata, pdata );
-         skein512_2x64_prehash64( &x16rv2_ctx.skein, vdata );
+         skein512_2x64_prehash64( &x16r_ctx.skein, vdata );
      break;
      case CUBEHASH:
         v128_bswap32_80( edata, pdata );
@@ -1594,32 +1547,32 @@ int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      case HAMSI:
-#if defined(__SSE4_2__) || defined(__ARM_NEON)
+#if defined(__SSE4_2__)
         v128_bswap32_intrlv80_2x64( vdata, pdata );
-         hamsi512_2x64_init( &x16rv2_ctx.hamsi );
-         hamsi512_2x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
+         hamsi512_2x64_init( &x16r_ctx.hamsi );
+         hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 );
 #else
         v128_bswap32_80( edata, pdata );
-         sph_hamsi512_init( &x16rv2_ctx.hamsi );
-         sph_hamsi512( &x16rv2_ctx.hamsi, edata, 72 );
+         sph_hamsi512_init( &x16r_ctx.hamsi );
+         sph_hamsi512( &x16r_ctx.hamsi, edata, 72 );
         intrlv_2x64( vdata, edata, edata, 640 );
 #endif
      break;
      case FUGUE:
         v128_bswap32_80( edata, pdata );
 #if defined(__AES__)
-         fugue512_init( &x16rv2_ctx.fugue );
-         fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
+         fugue512_init( &x16r_ctx.fugue );
+         fugue512_update( &x16r_ctx.fugue, edata, 76 );
 #else
-         sph_fugue512_init( &x16rv2_ctx.fugue );
-         sph_fugue512( &x16rv2_ctx.fugue, edata, 76 );
+         sph_fugue512_init( &x16r_ctx.fugue );
+         sph_fugue512( &x16r_ctx.fugue, edata, 76 );
 #endif
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      case SHABAL:
         v128_bswap32_80( edata, pdata );
-         sph_shabal512_init( &x16rv2_ctx.shabal );
-         sph_shabal512( &x16rv2_ctx.shabal, edata, 64);
+         sph_shabal512_init( &x16r_ctx.shabal );
+         sph_shabal512( &x16r_ctx.shabal, edata, 64);
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      default:
--- a/algo/x16/x20r.c
+++ b/algo/x16/x20r.c
@@ -1,362 +0,0 @@
-#include "miner.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "algo/blake/sph_blake.h"
-#include "algo/bmw/sph_bmw.h"
-#include "algo/groestl/sph_groestl.h"
-#include "algo/jh/sph_jh.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
-#include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/sha/sph_sha2.h"
-#include "x16r-gate.h"
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X20R_8WAY   1
-#elif defined(__AVX2__) && defined(__AES__)
-  #define X20R_4WAY   1
-#elif defined(__SSE2__) || defined(__ARM_NEON)
-  #define X20R_2WAY   1
-#endif
-
-// X20R is not what it seems. It does not permute 20 functions over 20 rounds,
-// it only permutes 16 of them. The last 4 functions are victims of trying to
-// fit 20 elements in the space for only 16. Arithmetic overflow recycles the
-// first 4 functions.  Otherwise it's identical to X16R. 
-// Welcome to the real X20R.
-
-#define X20R_HASH_FUNC_COUNT 20
-/*
-enum x20r_algo
-{
-	BLAKE = 0,
-	BMW,
-	GROESTL,
-	JH,
-	KECCAK,
-	SKEIN,
-	LUFFA,
-	CUBEHASH,
-	SHAVITE,
-	SIMD,
-	ECHO,
-	HAMSI,
-	FUGUE,
-	SHABAL,
-	WHIRLPOOL,
-	SHA512,
-	HAVAL,       // Last 4 names are meaningless and not used
-	GOST,
-	RADIOGATUN,
-	PANAMA,   
-	X20R_HASH_FUNC_COUNT
-};
-*/
-static __thread char x20r_hash_order[ X20R_HASH_FUNC_COUNT + 1 ] = {0};
-
-static void x20r_getAlgoString(const uint8_t* prevblock, char *output)
-{
-	char *sptr = output;
-
-	for (int j = 0; j < X20R_HASH_FUNC_COUNT; j++) {
-		uint8_t b = (19 - j) >> 1; // 16 ascii hex chars, reversed
-		uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
-		if (algoDigit >= 10)
-			sprintf(sptr, "%c", 'A' + (algoDigit - 10));
-		else
-			sprintf(sptr, "%u", (uint32_t) algoDigit);
-		sptr++;
-	}
-	*sptr = '\0';
-}
-
-#if defined(X20R_8WAY)
-
-int x20r_8x64_hash( void* output, const void* input, int thrid )
-{
-   uint8_t hash[64*8] __attribute__ ((aligned (128)));
-   if ( !x16r_8x64_hash_generic( hash, input, thrid, x20r_hash_order,
-                                 X20R_HASH_FUNC_COUNT ) )
-      return 0;
-
-   memcpy( output,     hash,     32 );
-   memcpy( output+32,  hash+64,  32 );
-   memcpy( output+64,  hash+128, 32 );
-   memcpy( output+96,  hash+192, 32 );
-   memcpy( output+128, hash+256, 32 );
-   memcpy( output+160, hash+320, 32 );
-   memcpy( output+192, hash+384, 32 );
-   memcpy( output+224, hash+448, 32 );
-
-   return 1;
-}
-
-int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr)
-{
-   uint32_t hash[16*8] __attribute__ ((aligned (128)));
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 8;
-   uint32_t n = first_nonce;
-    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-   const int thr_id = mythr->id;
-   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-   const bool bench = opt_benchmark;
-
-   if ( bench )   ptarget[7] = 0x0cff;
-
-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
-   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      vdata[3] = bswap_32( pdata[3] );
-      saved_height = work->height;
-      x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x20r_hash_order );
-   }
-
-   x16r_8x64_prehash( vdata, pdata, x20r_hash_order );
-   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
-                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
-   do
-   {
-      if( x20r_8x64_hash( hash, vdata, thr_id ) );
-      for ( int i = 0; i < 8; i++ )
-      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
-      {
-         pdata[19] = bswap_32( n+i );
-         submit_solution( work, hash+(i<<3), mythr );
-      }
-      *noncev = _mm512_add_epi32( *noncev,
-                                  _mm512_set1_epi64( 0x0000000800000000 ) );
-      n += 8;
-   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
-   pdata[19] = n;
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-
-#elif defined(X20R_4WAY)
-
-int x20r_4x64_hash( void* output, const void* input, int thrid )
-{
-   uint8_t hash[64*4] __attribute__ ((aligned (64)));
-   if ( !x16r_4x64_hash_generic( hash, input, thrid, x20r_hash_order,
-                                 X20R_HASH_FUNC_COUNT ) )
-      return 0;
-
-   memcpy( output,     hash,     32 );
-   memcpy( output+32,  hash+64,  32 );
-   memcpy( output+64,  hash+128, 32 );
-   memcpy( output+96,  hash+192, 32 );
-
-   return 1;
-}
-
-int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr)
-{
-   uint32_t hash[16*4] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 4;
-   uint32_t n = first_nonce;
-    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-   const int thr_id = mythr->id;
-   const bool bench = opt_benchmark;
-   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-
-   if ( bench )  ptarget[7] = 0x0cff;
-
-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
-   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      vdata[3] = bswap_32( pdata[3] );
-      saved_height = work->height;
-      x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x20r_hash_order );
-   }
-   
-   x16r_4x64_prehash( vdata, pdata, x20r_hash_order );
-   *noncev = mm256_intrlv_blend_32(
-                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
-   do
-   {
-      if ( x20r_4x64_hash( hash, vdata, thr_id ) );
-      for ( int i = 0; i < 4; i++ )
-      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
-      {
-         pdata[19] = bswap_32( n+i );
-         submit_solution( work, hash+(i<<3), mythr );
-      }
-      *noncev = _mm256_add_epi32( *noncev,
-                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
-      n += 4;
-   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
-   pdata[19] = n;
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-#elif defined(X20R_2WAY)
-
-int x20r_2x64_hash( void* output, const void* input, int thrid )
-{
-   uint8_t hash[64*2] __attribute__ ((aligned (64)));
-   if ( !x16r_2x64_hash_generic( hash, input, thrid, x20r_hash_order,
-                                 X20R_HASH_FUNC_COUNT ) )
-      return 0;
-
-   memcpy( output,     hash,     32 );
-   memcpy( output+32,  hash+64,  32 );
-
-   return 1;
-}
-
-int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr)
-{
-   uint32_t hash[16*2] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*2] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 2;
-   uint32_t n = first_nonce;
-   v128_t *noncev = (v128_t*)vdata + 9;
-   const int thr_id = mythr->id;
-   const bool bench = opt_benchmark;
-   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-
-   if ( bench )  ptarget[7] = 0x0cff;
-
-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
-   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      vdata[3] = bswap_32( pdata[3] );
-      saved_height = work->height;
-      x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x20r_hash_order );
-   }
-   
-   x16r_2x64_prehash( vdata, pdata, x20r_hash_order );
-   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
-   do
-   {
-      if ( x20r_2x64_hash( hash, vdata, thr_id ) );
-      for ( int i = 0; i < 2; i++ )
-      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
-      {
-         pdata[19] = bswap_32( n+i );
-         submit_solution( work, hash+(i<<3), mythr );
-      }
-      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
-      n += 2;
-   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
-   pdata[19] = n;
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-#else
-
-int x20r_hash( void* output, const void* input, int thrid )
-{
-   uint8_t hash[64] __attribute__ ((aligned (64)));
-   if ( !x16r_hash_generic( hash, input, thrid, x20r_hash_order, 
-                            X20R_HASH_FUNC_COUNT ) )
-      return 0;
-
-    memcpy( output, hash, 32 );
-    return 1;
-}
-
-int scanhash_x20r( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t _ALIGN(32) hash32[8];
-   uint32_t _ALIGN(32) edata[20];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   const int thr_id = mythr->id;
-   uint32_t nonce = first_nonce;
-   volatile uint8_t *restart = &( work_restart[thr_id].restart );
-   const bool bench = opt_benchmark;
-   if ( bench )  ptarget[7] = 0x0cff;
-
-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
-   {
-      edata[1] = bswap_32( pdata[1] );
-      edata[2] = bswap_32( pdata[2] );
-      edata[3] = bswap_32( pdata[3] );
-      saved_height = work->height;
-      x20r_getAlgoString( (const uint8_t*)(&edata[1]), x20r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x20r_hash_order );
-   }
-
-   x16r_prehash( edata, pdata, x20r_hash_order );
-
-   do
-   {
-      edata[19] = nonce;
-      if ( x20r_hash( hash32, edata, thr_id ) )
-      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
-      {
-         pdata[19] = bswap_32( nonce );
-         submit_solution( work, hash32, mythr );
-      }
-      nonce++;
-   } while ( nonce < max_nonce && !(*restart) );
-   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce;
-   return 0;
-}
-
-#endif
-
-bool register_x20r_algo( algo_gate_t* gate )
-{
-#if defined (X20R_8WAY)
-  gate->scanhash          = (void*)&scanhash_x20r_8x64;
-#elif defined (X20R_4WAY)
-  gate->scanhash          = (void*)&scanhash_x20r_4x64;
-#elif defined (X20R_2WAY)
-  gate->scanhash          = (void*)&scanhash_x20r_2x64;
-#else
-  gate->scanhash          = (void*)&scanhash_x20r;
-#endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
-                      | NEON_OPT;
-  opt_target_factor = 256.0;
-  return true;
-};
-
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -43,8 +43,7 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
   uint32_t *hash7 = (uint32_t*)( shash+448 );
   x21s_8way_context_overlay ctx;

-   if ( !x16r_8way_hash_generic( shash, input, thrid, x16r_hash_order, 
-                                 X16R_HASH_FUNC_COUNT ) )
+   if ( !x16r_8way_hash_generic( shash, input, thrid ) )
      return 0;

   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
@@ -136,6 +135,7 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &hash[7<<3];
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -149,18 +149,20 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,

   if ( bench )   ptarget[7] = 0x0cff;

-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      saved_height = work->height;
-      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x16r_hash_order );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

-   x16r_8way_prehash( vdata, pdata, x16r_hash_order );
+   x16r_8way_prehash( vdata, pdata );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
@@ -222,8 +224,7 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
   uint32_t *hash2 = (uint32_t*)( shash+128 );
   uint32_t *hash3 = (uint32_t*)( shash+192 );

-   if ( !x16r_4way_hash_generic( shash, input, thrid, x16r_hash_order,
-                                 X16R_HASH_FUNC_COUNT ) )
+   if ( !x16r_4way_hash_generic( shash, input, thrid ) )
      return 0;

   intrlv_4x32( vhash, hash0, hash1, hash2, hash3,  512 );
@@ -294,6 +295,7 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -306,18 +308,20 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0cff;
 
-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      saved_height = work->height;
-      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x16r_hash_order );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

-   x16r_4way_prehash( vdata, pdata, x16r_hash_order );
+   x16r_4way_prehash( vdata, pdata );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
@@ -368,8 +372,7 @@ int x21s_2x64_hash( void* output, const void* input, int thrid )
   uint32_t *hash0 = (uint32_t*)  shash;
   uint32_t *hash1 = (uint32_t*)( shash+64  );

-   if ( !x16r_2x64_hash_generic( shash, input, thrid, x16r_hash_order, 
-                                 X16R_HASH_FUNC_COUNT ) )
+   if ( !x16r_2x64_hash_generic( shash, input, thrid ) )
      return 0;

   sph_haval256_5_init( &ctx.haval );
@@ -409,6 +412,7 @@ int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*2] __attribute__ ((aligned (64)));
   uint32_t vdata[20*2] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -421,18 +425,20 @@ int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0cff;

-   static __thread uint32_t saved_height = UINT32_MAX;
-   if ( work->height != saved_height )
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
   {
-      vdata[1] = bswap_32( pdata[1] );
-      vdata[2] = bswap_32( pdata[2] );
-      saved_height = work->height;
-      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-      if ( !opt_quiet && !thr_id )
-           applog( LOG_INFO, "hash order %s", x16r_hash_order );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

-   x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
+   x16r_2x64_prehash( vdata, pdata );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
--- a/algo/x16/x21s.c
+++ b/algo/x16/x21s.c
@@ -33,8 +33,7 @@ int x21s_hash( void* output, const void* input, int thrid )
   uint32_t _ALIGN(128) hash[16];
   x21s_context_overlay ctx;

-   if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order,
-                            X16R_HASH_FUNC_COUNT ) )
+   if ( !x16r_hash_generic( hash, input, thrid ) )
      return 0;

   sph_haval256_5_init( &ctx.haval );
@@ -85,7 +84,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
          applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

-   x16r_prehash( edata, pdata, x16r_hash_order );
+   x16r_prehash( edata, pdata );

   do
   {
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -938,7 +938,7 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
 #endif
 #include "algo/shabal/sph_shabal.h"
 #include "algo/haval/sph-haval.h"
-#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
+#if !( defined(__AES__) ) //|| defined(__ARM_FEATURE_AES) )
  #include "algo/groestl/sph_groestl.h"
 #endif
 #if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
@@ -950,7 +950,7 @@ union _x17_context_overlay
 {
        blake512_2x64_context   blake;
        bmw512_2x64_context     bmw;
-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
        hashState_groestl       groestl;
 #else
        sph_groestl512_context  groestl;
@@ -1000,7 +1000,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )

    dintrlv_2x64( hash0, hash1, vhash, 512 );

-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
    groestl512_full( &ctx.groestl, hash0, hash0, 512 );
    groestl512_full( &ctx.groestl, hash1, hash1, 512 );
 #else
@@ -1133,12 +1133,14 @@ int scanhash_x17_2x64( struct work *work, uint32_t max_nonce,
      {
         if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
         {
+applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,0);
              pdata[19] = bswap_32( n );
 //            pdata[19] = n;
            submit_solution( work, hash, mythr );
         }
         if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
         {
+applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,1);            
            pdata[19] = bswap_32( n+1 );
            submit_solution( work, hash+8, mythr );
         }
--- a/algo/x22/x22i.c
+++ b/algo/x22/x22i.c
@@ -5,15 +5,15 @@
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #if defined(__AES__)
+  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/fugue/fugue-aesni.h"
 #else
+  #include "algo/groestl/sph_groestl.h"
  #include "algo/fugue/sph_fugue.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
-  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #else
-  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/skein/sph_skein.h"
@@ -39,15 +39,15 @@ union _x22i_context_overlay
        blake512_context       blake;
        sph_bmw512_context     bmw;
 #if defined(__AES__)
+        hashState_groestl       groestl;
        hashState_fugue         fugue;
 #else
+        sph_groestl512_context  groestl;
        sph_fugue512_context    fugue;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
-        hashState_groestl       groestl;
        hashState_echo          echo;
 #else
-        sph_groestl512_context  groestl;
        sph_echo512_context     echo;
 #endif
        sph_jh512_context       jh;
@@ -81,7 +81,7 @@ int x22i_hash( void *output, const void *input, int thrid )
   sph_bmw512(&ctx.bmw, (const void*) hash, 64);
   sph_bmw512_close(&ctx.bmw, hash);

-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__)
   groestl512_full( &ctx.groestl, hash, hash, 512 );
 #else
   sph_groestl512_init( &ctx.groestl );
--- a/algo/x22/x25x.c
+++ b/algo/x22/x25x.c
@@ -5,15 +5,15 @@
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #if defined(__AES__)
+  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/fugue/fugue-aesni.h"
 #else
+  #include "algo/groestl/sph_groestl.h"
  #include "algo/fugue/sph_fugue.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
-  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #else
-  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/skein/sph_skein.h"
@@ -42,15 +42,15 @@ union _x25x_context_overlay
        blake512_context        blake;
        sph_bmw512_context      bmw;
 #if defined(__AES__)
+        hashState_groestl       groestl;
        hashState_fugue         fugue;
 #else
+        sph_groestl512_context  groestl;
        sph_fugue512_context    fugue;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
-        hashState_groestl       groestl;
        hashState_echo          echo;
 #else
-        sph_groestl512_context  groestl;
        sph_echo512_context     echo;
 #endif
        sph_jh512_context       jh;
@@ -86,7 +86,7 @@ int x25x_hash( void *output, const void *input, int thrid )
   sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64);
   sph_bmw512_close(&ctx.bmw, &hash[1]);

-#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+#if defined(__AES__)
   groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 );
 #else
   sph_groestl512_init( &ctx.groestl );
@@ -119,7 +119,7 @@ int x25x_hash( void *output, const void *input, int thrid )
   simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 ); 

 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
-   echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 );
+    echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 );
 #else
   sph_echo512_init( &ctx.echo );
   sph_echo512( &ctx.echo, &hash[9], 64 );
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.14.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.11.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='23.14'
-PACKAGE_STRING='cpuminer-opt 23.14'
+PACKAGE_VERSION='23.11'
+PACKAGE_STRING='cpuminer-opt 23.11'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 23.11 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1432,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 23.14:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 23.11:";;
   esac
  cat <<\_ACEOF

@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 23.14
+cpuminer-opt configure 23.11
 generated by GNU Autoconf 2.71

 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 23.14, which was
+It was created by cpuminer-opt $as_me 23.11, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='23.14'
+ VERSION='23.11'


 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 23.14, which was
+This file was extended by cpuminer-opt $as_me 23.11, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 23.14
+cpuminer-opt config.status 23.11
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [23.14])
+AC_INIT([cpuminer-opt], [23.11])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/4343
+++ b/4343
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -2837,6 +2837,15 @@ static void show_credits()
 #define check_cpu_capability() cpu_capability( false )
 #define display_cpu_capability() cpu_capability( true )

+#if defined(__aarch64__)
+
+#define XSTR(x) STR(x)
+#define STR(x) #x
+
+//#pragma message "Building for armv" XSTR(__ARM_ARCH)  
+
+#endif
+
 static bool cpu_capability( bool display_only )
 {
     char cpu_brand[0x40];
@@ -3666,6 +3675,11 @@ static int thread_create(struct thr_info *thr, void* func)

 void get_defconfig_path(char *out, size_t bufsize, char *argv0);

+
+#include "simd-utils.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "compat/aes_helper.c"
+
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
--- a/miner.h
+++ b/miner.h
@@ -672,7 +672,6 @@ enum algos {
        ALGO_X16RT_VEIL,
        ALGO_X16S,
        ALGO_X17,
-        ALGO_X20R,
        ALGO_X21S,
        ALGO_X22I,
        ALGO_X25X,
@@ -768,7 +767,6 @@ static const char* const algo_names[] = {
        "x16rt-veil",
        "x16s",
        "x17",
-        "x20r",
        "x21s",
        "x22i",
        "x25x",
@@ -932,7 +930,6 @@ Options:\n\
                          x16rt-veil    Veil (VEIL)\n\
                          x16s\n\
                          x17\n\
-                          x20r\n\
                          x21s\n\
                          x22i\n\
                          x25x\n\
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -381,7 +381,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
   d0[15] = s[ 60];   d1[15] = s[ 61];    d2[15] = s[ 62];   d3[15] = s[ 63];
 }

-#endif   // SSE4_1 or NEON else SSE2
+#endif   // SSE4_1 else SSE2 or NEON

 static inline void extr_lane_4x32( void *d, const void *s,
                                   const int lane, const int bit_len )
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -322,7 +322,6 @@ static inline __m128i v128_neg1_fn()
 #define mm128_xim_32( v1, v0, c ) \
   _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
                                    _mm_castsi128_ps( v0 ), c ) )
-#define v128_xim32 mm128_xim_32

 // Examples of simple operations using xim:
 /*
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -40,7 +40,7 @@
 #define v128u8_load( p )              vld1q_u16( (uint8_t*)(p) )
 #define v128u8_store( p, v )          vst1q_u16( (uint8_t*)(p), v )

-// load & set1 combined, doesn't work
+// load & set1 combined
 #define v128_load1_64(p)              vld1q_dup_u64( (uint64_t*)(p) )
 #define v128_load1_32(p)              vld1q_dup_u32( (uint32_t*)(p) )
 #define v128_load1_16(p)              vld1q_dup_u16( (uint16_t*)(p) )
@@ -68,7 +68,7 @@
 #define v128_mul32                    vmulq_u32
 #define v128_mul16                    vmulq_u16

-// Widening, shuffle high element to align with Intel
+// slow, tested with argon2d
 static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 {
   return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
@@ -86,7 +86,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )

 // Not yet needed
 //#define v128_cmpeq1
-// Signed
+
 #define v128_cmpgt64( v1, v0 )        vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
 #define v128_cmpgt32( v1, v0 )        vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
 #define v128_cmpgt16( v1, v0 )        vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
@@ -406,15 +406,34 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 v1 = vorrq_u32( v1, t1 ); \
 }

-// vector mask, use as last resort. prefer tbl, rev, alignr, etc
+// Cross lane shuffles, no programmable shuffle in NEON
+
+// vector mask, use as last resort. prefer rev, alignr, etc
 #define v128_shufflev32( v, vmask ) \
  v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \

+// compatible with x86_64, but very slow, avoid
 #define v128_shuffle8( v, vmask ) \
-     vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask );
+   v128_set8( ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[15] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[14] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[13] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[12] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[11] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[10] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 9] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 8] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 7] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 6] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 5] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 4] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 3] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 2] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
+              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )
+

 // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
 // Bit rotation already promotes faster widths. Usage is context sensitive.
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -930,9 +930,7 @@ static inline void cpu_brand_string( char* s )

 #elif defined(__arm__) || defined(__aarch64__)

-    unsigned int cpu_info[4] = { 0 };
-    cpuid( 0, 0, cpu_info );
-    sprintf( s, "ARM 64 bit CPU, HWCAP %08x", cpu_info[0] );
+    sprintf( s, "ARM 64 bit CPU" );

 #else