v23.14

2026-02-22 16:33:08 +00:00 · 2023-11-28 00:58:43 -05:00
parent 045b42babf
commit 4e3f1b926f
35 changed files with 144 additions and 678 deletions
--- a/README.md
+++ b/README.md
@@ -87,7 +87,6 @@ Supported Algorithms
                          groestl       Groestl coin
                          hex           x16r-hex
                          hmq1725       
-                          hodl          Hodlcoin
                          jha           Jackpotcoin
                          keccak        Maxcoin
                          keccakc       Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
                          scrypt:N      scrypt(N, 1, 1)
                          scryptn2      scrypt(1048576, 1, 1)
                          sha256d       Double SHA-256
+                          sha256dt
                          sha256q       Quad SHA-256
                          sha256t       Triple SHA-256
                          sha3d         Double keccak256 (BSHA3)
+                          sha512256d
                          skein         Skein+Sha (Skeincoin)
                          skein2        Double Skein (Woodcoin)
                          skunk         Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
                          x16rt-veil    veil
                          x16s          
                          x17
+                          x20r
                          x21s
                          x22i
                          x25x
--- a/7
+++ b/7
@@ -75,6 +75,13 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v23.14
+
+ARM: Groestl AES optimizations enabled.
+All: Small optimization to Shabal 4way.
+x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
+All: deleted some unused files.
+
 v23.13

 Added x20r algo.
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
   blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );

   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      blakehash_4way( hash, vdata );

--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
   blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );

   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_4way_hash( hash, vdata );

--- a/algo/groestl/aes_ni/groestl-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl-intr-aes.h
@@ -60,54 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };

 #if defined(__ARM_NEON)

-// No fast shuffle on NEON
-//static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };  
-static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };

-#define gr_shuffle32( v )      v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
-
-/*
-#define TRANSP_MASK \
-     0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
-#define SUBSH_MASK0 \
-     0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
-#define SUBSH_MASK1 \
-     0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
-#define SUBSH_MASK2 \
-     0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
-#define SUBSH_MASK3 \
-     0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
-#define SUBSH_MASK4  \
-     0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
-#define SUBSH_MASK5 \
-     0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
-#define SUBSH_MASK6 \
-     0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
-#define SUBSH_MASK7 \
-     0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
-
-//#define gr_shuffle8( v, c )    v128_shullfev8( v, c )
-
-
-#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
-                        c07, c06, c05, c04, c03, c02, c01, c00 ) \
-  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
-  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
-  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
-  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
-    v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
-       11, v, c11 ), 10, v, c10 ),  9, v, c09 ),  8, v, c08 ), \
-        7, v, c07 ),  6, v, c06 ),  5, v, c05 ),  4, v, c04 ), \
-        3, v, c03 ),  2, v, c02 ),  1, v, c01 ),  0, v, c00 )
-*/
+#define gr_shuffle32(v)       vqtbl1q_u8( v, gr_mask ) 

 #else

-#define gr_shuffle32( v )       _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32(v)       _mm_shuffle_epi32( v, 0xd8 )

 #endif

-
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -334,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
 */
 #define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* SubBytes */\
-  b0 = v128_xor(b0, b0);\
-  a0 = v128_aesenclast(a0, b0);\
-  a1 = v128_aesenclast(a1, b0);\
-  a2 = v128_aesenclast(a2, b0);\
-  a3 = v128_aesenclast(a3, b0);\
-  a4 = v128_aesenclast(a4, b0);\
-  a5 = v128_aesenclast(a5, b0);\
-  a6 = v128_aesenclast(a6, b0);\
-  a7 = v128_aesenclast(a7, b0);\
+  a0 = v128_aesenclast_nokey( a0 ); \
+  a1 = v128_aesenclast_nokey( a1 ); \
+  a2 = v128_aesenclast_nokey( a2 ); \
+  a3 = v128_aesenclast_nokey( a3 ); \
+  a4 = v128_aesenclast_nokey( a4 ); \
+  a5 = v128_aesenclast_nokey( a5 ); \
+  a6 = v128_aesenclast_nokey( a6 ); \
+  a7 = v128_aesenclast_nokey( a7 ); \
  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
+  MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
 }

 #define ROUNDS_P(){\
@@ -365,7 +327,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
     /* SubBytes + MixBytes */\
    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7 ); \
-    \
    /* AddRoundConstant P1024 */\
    xmm0 = v128_xor( xmm0, \
             casti_v128( round_const_p, round_counter+1 ) ); \
@@ -467,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
  t1 = v128_unpackhi16(t1, i3);\
  i2 = v128_unpacklo16(i2, i3);\
  i0 = v128_unpacklo16(i0, i1);\
-\
  /* shuffle with immediate */\
  t0 = gr_shuffle32( t0 ); \
  t1 = gr_shuffle32( t1 ); \
@@ -477,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
  i2 = gr_shuffle32( i2 ); \
  i4 = gr_shuffle32( i4 ); \
  i6 = gr_shuffle32( i6 ); \
-\
  /* continue with unpack */\
  t4 = i0;\
  i0 = v128_unpacklo32(i0, i2);\
@@ -584,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
  /* transpose done */\
 }/**/

-
+#if 0
+// not used
 void INIT( v128_t* chaining )
 {
  static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -613,6 +573,7 @@ void INIT( v128_t* chaining )
  chaining[6] = xmm14;
  chaining[7] = xmm15;
 }
+#endif

 void TF1024( v128_t* chaining, const v128_t* message )
 {
--- a/algo/groestl/aes_ni/groestl256-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl256-intr-aes.h
@@ -1,3 +1,6 @@
+#if !defined GROESTL256_INTR_AES_H__
+#define GROESTL256_INTR_AES_H__
+
 /* groestl-intr-aes.h     Aug 2011
 *
 * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };

 #if defined(__ARM_NEON)

-// No fast shuffle on NEON
-static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };

-#define gr_shuffle32( v )       v128_shufflev32( v, vmask_d8 )
+#define gr_shuffle32(v)       vqtbl1q_u8( v, gr_mask ) 

 #else

-#define gr_shuffle32( v )       _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32(v)       _mm_shuffle_epi32( v, 0xd8 )

 #endif

-
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
  chaining[3] = xmm11;
 }

-
+#endif
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );
+
   OF1024( ctx->chaining );

   // store hash result in output 
--- a/algo/groestl/aes_ni/hash-groestl.h
+++ b/algo/groestl/aes_ni/hash-groestl.h
@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
 int update_and_final_groestl( hashState_groestl*,  void*, const void*, int );
 int groestl512( hashState_groestl*,  void*, const void*, uint64_t );
 #define groestl512_full   groestl512
+#define groestl512_ctx    groestl512


 #endif /* __hash_h */
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =

 #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
+  b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
  a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
  a1 = _mm256_xor_si256( a1, b1 );\
  a2 = _mm256_xor_si256( a2, b1 );\
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,

   v128_bswap32_intrlv80_4x32( vdata, pdata );
   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );

      myriad_4way_hash( hash, vdata );
      pdata[19] = n;
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -465,12 +465,8 @@ typedef union
 {
   keccak256_2x64_context    keccak;
   cubehashParam             cube;
-//#if defined(__x86_64__)
   skein256_2x64_context     skein;
-//#else
-//   sph_skein512_context      skein;
-//#endif
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_groestl256      groestl;
 #else
   sph_groestl256_context     groestl;
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );

-//#if defined(__x86_64__)
   intrlv_2x64( vhashA, hash0, hash1, 256 );
   skein256_2x64_init( &ctx.skein );
   skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
   skein256_2x64_update( &ctx.skein, vhashA, 32 );
   skein256_2x64_close( &ctx.skein, vhashA );
   dintrlv_2x64( hash2, hash3, vhashA, 256 );
-/*
-#else
-    sph_skein256_init( &ctx.skein );
-    sph_skein256( &ctx.skein, hash0, 32 );
-    sph_skein256_close( &ctx.skein, hash0 );
-    sph_skein256_init( &ctx.skein );
-    sph_skein256( &ctx.skein, hash1, 32 );
-    sph_skein256_close( &ctx.skein, hash1 );
-    sph_skein256_init( &ctx.skein );
-    sph_skein256( &ctx.skein, hash2, 32 );
-    sph_skein256_close( &ctx.skein, hash2 );
-    sph_skein256_init( &ctx.skein );
-    sph_skein256( &ctx.skein, hash3, 32 );
-    sph_skein256_close( &ctx.skein, hash3 );
-#endif
-*/
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   groestl256_full( &ctx.groestl, hash0, hash0, 256 );
   groestl256_full( &ctx.groestl, hash1, hash1, 256 );
   groestl256_full( &ctx.groestl, hash2, hash2, 256 );
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
   lyra2h_4way_midstate( vdata );

   do {
-     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+     *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      lyra2h_4way_hash( hash, vdata );

      for ( int i = 0; i < 4; i++ )
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,

   do
   {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      lyra2rev2_4way_hash( hash, vdata );

--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -7,15 +7,15 @@
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #if defined(__AES__)
-  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/fugue/fugue-aesni.h"
 #else
-  #include "algo/groestl/sph_groestl.h"
  #include "algo/fugue/sph_fugue.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
+  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #else
+  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/jh/sph_jh.h"
@@ -36,15 +36,15 @@ union _hmq1725_ctx_holder
   blake512_context        blake;
   sph_bmw512_context      bmw;
 #if defined(__AES__)
-   hashState_groestl       groestl;
   hashState_fugue         fugue;
 #else
-   sph_groestl512_context  groestl;
   sph_fugue512_context    fugue;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   hashState_groestl       groestl;
   hashState_echo          echo;
 #else
+   sph_groestl512_context  groestl;
   sph_echo512_context     echo;
 #endif
   sph_skein512_context    skein;
@@ -62,9 +62,6 @@ union _hmq1725_ctx_holder
 };
 typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;

-//static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
-//static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
-
 extern void hmq1725hash(void *state, const void *input)
 {
    const uint32_t mask = 24;
@@ -82,7 +79,7 @@ extern void hmq1725hash(void *state, const void *input)

    if ( hashB[0] & mask )   //1
    {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
       groestl512_full( &ctx.groestl, hashA, hashB, 512 );
 #else
       sph_groestl512_init( &ctx.groestl );
@@ -226,7 +223,7 @@ extern void hmq1725hash(void *state, const void *input)
       sph_sha512_close( &ctx.sha, hashA );
    }

-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    groestl512_full( &ctx.groestl, hashB, hashA, 512 );
 #else
    sph_groestl512_init( &ctx.groestl );
--- a/algo/sha/sha256-hash.c
+++ b/algo/sha/sha256-hash.c
@@ -587,8 +587,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
    // Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
    TMSG0_X = casti_m128i( msg_X, 0 );
    TMSG0_Y = casti_m128i( msg_Y, 0 );
-    TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 );
-    TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 );
+    TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
+    TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
    STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
    STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );

--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -34,8 +34,6 @@
 #include <string.h>
 #include "shabal-hash-4way.h"

-//#if defined(__SSE4_1__) || defined(__ARM_NEON)
-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define DECL_STATE16   \
@@ -47,8 +45,6 @@
           C8, C9, CA, CB, CC, CD, CE, CF; \
   __m512i M0, M1, M2, M3, M4, M5, M6, M7, \
           M8, M9, MA, MB, MC, MD, ME, MF; \
-   const __m512i FIVE  = v512_32( 5 ); \
-   const __m512i THREE = v512_32( 3 ); \
   uint32_t Wlow, Whigh;

 #define READ_STATE16(state) do \
@@ -292,11 +288,21 @@ do { \
    mm512_swap1024_512( BF, CF ); \
 } while (0)

+static inline __m512i v512_mult_x3( const __m512i x )
+{
+   return _mm512_add_epi32( x, _mm512_slli_epi32( x, 1 ) );
+}
+
+static inline __m512i v512_mult_x5( const __m512i x )
+{
+   return _mm512_add_epi32( x, _mm512_slli_epi32( x, 2 ) );
+}
+
 #define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
   xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
-           _mm512_mullo_epi32( mm512_xor3( xa0, xc, \
-              _mm512_mullo_epi32( mm512_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
+           v512_mult_x3( mm512_xor3( xa0, xc, \
+              v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
           xb3, xb2 ) ); \
   xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
 } while (0)
@@ -644,8 +650,6 @@ shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
           C8, C9, CA, CB, CC, CD, CE, CF; \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7, \
           M8, M9, MA, MB, MC, MD, ME, MF; \
-   const __m256i FIVE  = v256_32( 5 ); \
-   const __m256i THREE = v256_32( 3 ); \
   uint32_t Wlow, Whigh;

 #define READ_STATE8(state) do \
@@ -889,11 +893,21 @@ do { \
    mm256_swap512_256( BF, CF ); \
 } while (0)

+static inline __m256i v256_mult_x3( const __m256i x )
+{
+   return _mm256_add_epi32( x, _mm256_slli_epi32( x, 1 ) );
+}
+
+static inline __m256i v256_mult_x5( const __m256i x )
+{
+   return _mm256_add_epi32( x, _mm256_slli_epi32( x, 2 ) );
+}
+
 #define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
   xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
-           _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
-              _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
+           v256_mult_x3( mm256_xor3( xa0, xc, \
+              v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
           xb3, xb2 ) ); \
   xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
 } while (0)
@@ -1226,15 +1240,13 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #endif  // AVX2

-#if defined(__SSE4_1__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON)

 #define DECL_STATE   \
 	v128u32_t A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB; \
 	v128u32_t B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \
 	v128u32_t C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \
 	v128u32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \
-   const v128u32_t FIVE  = v128_32( 5 ); \
-   const v128u32_t THREE = v128_32( 3 ); \
   uint32_t Wlow, Whigh;

 #define READ_STATE( state ) \
@@ -1479,11 +1491,21 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
    v128_swap256_128( BF, CF ); \
 }

+static inline v128_t v128_mult_x3( const v128_t x )
+{
+   return v128_add32( x, v128_sl32( x, 1 ) );
+}
+
+static inline v128_t v128_mult_x5( const v128_t x )
+{
+   return v128_add32( x, v128_sl32( x, 2 ) );
+}
+
 #define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 { \
   xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
-           v128_mul32( v128_xor3( xa0, xc, \
-              v128_mul32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
+                               v128_mult_x3( v128_xor3( xa0, xc, \
+                                   v128_mult_x5( v128_rol32( xa1, 15 ) ) ) ), \
                               xb3, xb2 ) ); \
   xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
 }
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -62,7 +62,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,

 #endif

-#if defined(__SSE4_1__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON)

 typedef struct {
 	v128_t buf[16] __attribute__ ((aligned (64)));
--- a/algo/swifftx/Swifftx_sha3.cpp
+++ b/algo/swifftx/Swifftx_sha3.cpp
@@ -1,369 +0,0 @@
-#include "Swifftx_sha3.h"
-extern "C" {
-#include "SWIFFTX.h"
-}
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-// The default salt value.
-// This is the expansion of e (Euler's number) - the 19 digits after 2.71:
-// 8281828459045235360.
-// The above in base 256, from MSB to LSB:
-BitSequence SWIF_saltValueChar[SWIF_HAIFA_SALT_SIZE] = {114, 238, 247, 26, 192, 28, 170, 160};
-
-// All the IVs here below were produced from the decimal digits of e's expansion.
-// The code can be found in 'ProduceRandomIV.c'.
-// The initial value for 224 digest size.
-const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
-{37, 242, 132,   2, 167,  81, 158, 237, 113,  77, 162,  60,  65, 236, 108, 246,
-101,  72, 190, 109,  58, 205,  99,   6, 114, 169, 104, 114,  38, 146, 121, 142,
- 59,  98, 233,  84,  72, 227,  22, 199,  17, 102, 198, 145,  24, 178,  37,   1,
-215, 245,  66, 120, 230, 193, 113, 253, 165, 218,  66, 134,  49, 231, 124, 204,
-  0};
-
-// The initial value for 256 digest size.
-const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
-{250,  50,  42,  40,  14, 233,  53,  48, 227,  42, 237, 187, 211, 120, 209, 234,
-  27, 144,   4,  61, 243, 244,  29, 247,  37, 162,  70,  11, 231, 196,  53,   6,
- 193, 240,  94, 126, 204, 132, 104,  46, 114,  29,   3, 104, 118, 184, 201,   3,
-  57,  77,  91, 101,  31, 155,  84, 199, 228,  39, 198,  42, 248, 198, 201, 178,
-   8};
-
-// The initial value for 384 digest size.
-const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
-{40, 145, 193, 100, 205, 171,  47,  76, 254,  10, 196,  41, 165, 207, 200,  79,
-109,  13,  75, 201,  17, 172,  64, 162, 217,  22,  88,  39,  51,  30, 220, 151,
-133,  73, 216, 233, 184, 203,  77,   0, 248,  13,  28, 199,  30, 147, 232, 242,
-227, 124, 169, 174,  14,  45,  27,  87, 254,  73,  68, 136, 135, 159,  83, 152,
-  0};
-
-// The initial value for 512 digest size.
-const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
-{195, 126, 197, 167, 157, 114,  99, 126, 208, 105, 200,  90,  71, 195, 144, 138,
- 142, 122, 123, 116,  24, 214, 168, 173, 203, 183, 194, 210, 102, 117, 138,  42,
- 114, 118, 132,  33,  35, 149, 143, 163, 163, 183, 243, 175,  72,  22, 201, 255,
- 102, 243,  22, 187, 211, 167, 239,  76, 164,  70,  80, 182, 181, 212,   9, 185,
-   0};
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-// NIST API implementation portion.
-///////////////////////////////////////////////////////////////////////////////////////////////
-
-int Swifftx::Init(int hashbitlen)
-{
-	switch(hashbitlen)
-	{
-	case 224:
-		swifftxState.hashbitlen = hashbitlen;
-		// Initializes h_0 in HAIFA:
-		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_224, SWIFFTX_OUTPUT_BLOCK_SIZE);
-		break;
-	case 256:
-		swifftxState.hashbitlen = hashbitlen;
-		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_256, SWIFFTX_OUTPUT_BLOCK_SIZE);
-		break;
-	case 384:
-		swifftxState.hashbitlen = hashbitlen;
-		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_384, SWIFFTX_OUTPUT_BLOCK_SIZE);
-		break;
-	case 512:
-		swifftxState.hashbitlen = hashbitlen;
-		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_512, SWIFFTX_OUTPUT_BLOCK_SIZE);
-		break;
-	default:
-		return BAD_HASHBITLEN;
-	}
-	
-	swifftxState.wasUpdated = false;
-	swifftxState.remainingSize = 0;
-	memset(swifftxState.remaining, 0, SWIF_HAIFA_INPUT_BLOCK_SIZE);
-	memset(swifftxState.numOfBitsChar, 0, SWIF_HAIFA_NUM_OF_BITS_SIZE);
-	// Initialize the salt with the default value.
-	memcpy(swifftxState.salt, SWIF_saltValueChar, SWIF_HAIFA_SALT_SIZE);
-
-	InitializeSWIFFTX();
-
-	return SUCCESS;
-}
-
-int Swifftx::Update(const BitSequence *data, DataLength databitlen)
-{
-	// The size of input in bytes after putting the remaining data from previous invocation.
-	int sizeOfInputAfterRemaining = 0;
-	// The input block to compression function of SWIFFTX:
-	BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
-	// Whether we handled a single block.
-	bool wasSingleBlockHandled = false;
-
-	swifftxState.wasUpdated = true;
-
-	// Handle an empty message as required by NIST. Since 'Final()' is oblivious to the input
-	// (but of course uses the output of the compression function from the previous round, 
-	// which is called h_{i-1} in HAIFA article), we have to do nothing here.
-	if (databitlen == 0)
-		return SUCCESS;
-
-    // If we had before an input with unaligned length, return an error
-    if (swifftxState.remainingSize % 8)
-	{
-    	return INPUT_DATA_NOT_ALIGNED;
-    }
-
-    // Convert remaining size to bytes.
-    swifftxState.remainingSize /= 8;
-
-	// As long as we have enough data combined from (remaining + data) to fill input block
-	//NASTAVENIE RUND
-	while (((databitlen / 8) + swifftxState.remainingSize) >= SWIF_HAIFA_INPUT_BLOCK_SIZE)
-	{
-		// Fill the input block with data:
-		// 1. The output of the previous block:
-		memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
-		// 2. The input part of the block:
-		// 2a. The remaining data from the previous 'Update()' call:
-		if (swifftxState.remainingSize)
-			memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining, 
-				   swifftxState.remainingSize);
-		// 2b. The input data that we have place for after the 'remaining':
-		sizeOfInputAfterRemaining = SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE 
-								  - ((int) swifftxState.remainingSize) - SWIF_HAIFA_NUM_OF_BITS_SIZE 
-								  - SWIF_HAIFA_SALT_SIZE;
-		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize, 
-			   data, sizeOfInputAfterRemaining);
-
-		// 3. The #bits part of the block:
-		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize 
-			 + sizeOfInputAfterRemaining,
-			   swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
-		// 4. The salt part of the block:
-		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize 
-			 + sizeOfInputAfterRemaining + SWIF_HAIFA_NUM_OF_BITS_SIZE,
-			   swifftxState.salt, SWIF_HAIFA_SALT_SIZE);
-
-		ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, false);
-
-		// Update the #bits field with SWIF_HAIFA_INPUT_BLOCK_SIZE.
-		AddToCurrInBase256(swifftxState.numOfBitsChar, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
-		wasSingleBlockHandled = true;
-		data += sizeOfInputAfterRemaining;
-		databitlen -= (sizeOfInputAfterRemaining * 8);
-   		swifftxState.remainingSize = 0;
-	}
-
-	// Update the swifftxState.remaining and swifftxState.remainingSize.
-    // remainingSize will be in bits after exiting 'Update()'.
-	if (wasSingleBlockHandled)
-	{		
-		swifftxState.remainingSize = (unsigned int) databitlen; // now remaining size is in bits.
-        if (swifftxState.remainingSize)
-			memcpy(swifftxState.remaining, data, (swifftxState.remainingSize + 7) / 8);
-	}
-	else
-	{
-		memcpy(swifftxState.remaining + swifftxState.remainingSize, data, 
-			   (size_t) (databitlen + 7) / 8);
-		swifftxState.remainingSize = (swifftxState.remainingSize * 8) + (unsigned short) databitlen;
-	}
-
-	return SUCCESS;
-}
-
-int Swifftx::Final(BitSequence *hashval)
-{
-    int i;
-    // Whether to add one last block. True if the padding appended to the last block overflows
-	// the block size.
-    bool toAddFinalBlock = false;
-    bool toPutOneInFinalBlock = false;
-    unsigned short oneShift = 0;
-   	// The size of the last input block before the zeroes padding. We add 1 here because we
-    // include the final '1' bit in the calculation and 7 as we round the length to bytes.
-	unsigned short sizeOfLastInputBlock = (swifftxState.remainingSize + 1 + 7) / 8;
-    // The number of bytes of zero in the padding part.
-	// The padding contains:
-	// 1. A single 1 bit.
-	// 2. As many zeroes as needed.
-	// 3. The message length in bits. Occupies SWIF_HAIFA_NUM_OF_BITS_SIZE bytes.
-	// 4. The digest size. Maximum is 512, so we need 2 bytes.
-	// If the total number achieved is negative, add an additional block, as HAIFA specifies.
-	short numOfZeroBytesInPadding = (short) SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE 
-								  - sizeOfLastInputBlock - (2 * SWIF_HAIFA_NUM_OF_BITS_SIZE) - 2 
-								  - SWIF_HAIFA_SALT_SIZE;
-   	// The input block to compression function of SWIFFTX:
-	BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
-	// The message length in base 256.
-	BitSequence messageLengthChar[SWIF_HAIFA_NUM_OF_BITS_SIZE] = {0};
-   	// The digest size used for padding:
-	unsigned char digestSizeLSB = swifftxState.hashbitlen % 256;
-	unsigned char digestSizeMSB = (swifftxState.hashbitlen - digestSizeLSB) / 256;
-
-	if (numOfZeroBytesInPadding < 1)
-		toAddFinalBlock = true;
-
-	// Fill the input block with data:
-	// 1. The output of the previous block:
-	memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
-	// 2a. The input part of the block, which is the remaining data from the previous 'Update()'
-    //     call, if exists and an extra '1' bit (maybe all we have is this extra 1):
-
-    // Add the last 1 in big-endian convention ...
-    if (swifftxState.remainingSize % 8 == 0)
-	{
-       swifftxState.remaining[sizeOfLastInputBlock - 1] = 0x80;
-    }
-    else 
-	{
-       swifftxState.remaining[sizeOfLastInputBlock - 1] |= (1 << (7 - (swifftxState.remainingSize % 8)));
-    }
-
-	if (sizeOfLastInputBlock)
-		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining, 
-			   sizeOfLastInputBlock);
-    
-   	// Compute the message length in base 256:
-	for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
-        messageLengthChar[i] = swifftxState.numOfBitsChar[i];
-    if (sizeOfLastInputBlock)
-		AddToCurrInBase256(messageLengthChar, sizeOfLastInputBlock * 8);
-
-	if (!toAddFinalBlock)
-	{
-		// 2b. Put the zeroes:
-		memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
-			   0, numOfZeroBytesInPadding);
-		// 2c. Pad the message length:
-		for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
-			currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock 
-						 + numOfZeroBytesInPadding + i] = messageLengthChar[i];
-		// 2d. Pad the digest size:
-		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock 
-					 + numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE] = digestSizeMSB;
-		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock 
-					 + numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE + 1] = digestSizeLSB;
-	}
-	else
-	{
-		// 2b. Put the zeroes, if at all:
-		if ((SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock) > 0)
-		{
-			 memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
-					0, SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock);
-		}
-	}
-
-   	// 3. The #bits part of the block: 
-	memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE, 
-           swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
-	// 4. The salt part of the block:
-	memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE 
-		 + SWIF_HAIFA_NUM_OF_BITS_SIZE, 
-           swifftxState.salt, 
-		   SWIF_HAIFA_SALT_SIZE);
-
-    ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, !toAddFinalBlock); 
-
-	// If we have to add one more block, it is now:
-	if (toAddFinalBlock)
-	{
-		// 1. The previous output block, as usual.
-		memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
-
-		// 2a. Instead of the input, zeroes:
-		memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE , 0, 
-			   SWIF_HAIFA_INPUT_BLOCK_SIZE - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2);
-		// 2b. Instead of the input, the message length:
-		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE 
-			 - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2,
-			   messageLengthChar,
-			   SWIF_HAIFA_NUM_OF_BITS_SIZE);
-		// 2c. Instead of the input, the digest size:
-		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 2] = digestSizeMSB;
-		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 1] = digestSizeLSB;
-		// 3. The #bits part of the block, which is zero in case of additional block:
-		memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
-			   0,
-			   SWIF_HAIFA_NUM_OF_BITS_SIZE);
-		// 4. The salt part of the block:
-		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE 
-			 + SWIF_HAIFA_NUM_OF_BITS_SIZE, 
-               swifftxState.salt, 
-			   SWIF_HAIFA_SALT_SIZE);
-
-        ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, true); 
-	}
-
-	// Finally, copy the result into 'hashval'. In case the digest size is not 512bit, copy the
-	// first hashbitlen of them:
-    for (i = 0; i < (swifftxState.hashbitlen / 8); ++i)
-		hashval[i] = swifftxState.currOutputBlock[i];
-
-	return SUCCESS;
-}
-
-int Swifftx::Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, 
-				BitSequence *hashval)
-{
-	int result;
-	//hashState state;
-   	// The pointer to the current place in the input we take into the compression function.
-	DataLength currInputIndex = 0;
-
-    result = Swifftx::Init(hashbitlen);
-
-	if (result != SUCCESS)
-		return result;
-
-	for ( ; (databitlen / 8) >  SWIF_HAIFA_INPUT_BLOCK_SIZE; 
-         currInputIndex += SWIF_HAIFA_INPUT_BLOCK_SIZE, databitlen -= (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8))
-	{
-		result = Swifftx::Update(data + currInputIndex, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8); 
-		if (result != SUCCESS)
-			return result;
-	}
-
-	// The length of the last block may be shorter than (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8)
-	result = Swifftx::Update(data + currInputIndex, databitlen); 
-	if (result != SUCCESS)
-	{
-		return result;
-	}
-
-    return Swifftx::Final(hashval);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-// Helper fuction implementation portion.
-///////////////////////////////////////////////////////////////////////////////////////////////
-
-void Swifftx::AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], 
-							   unsigned short toAdd)
-{
-	unsigned char remainder = 0;
-	short i;
-	BitSequence currValueInBase256[8] = {0};
-	unsigned short currIndex = 7;
-	unsigned short temp = 0;
-
-	do
-	{
-		remainder = toAdd % 256;
-		currValueInBase256[currIndex--] = remainder;
-		toAdd -= remainder;
-		toAdd /= 256;
-	}
-	while(toAdd != 0);
-
-	for (i = 7; i >= 0; --i)
-	{
-		temp = value[i] + currValueInBase256[i];
-		if (temp > 255)
-		{
-			value[i] = temp % 256;
-			currValueInBase256[i - 1]++;
-		}
-		else
-			value[i] = (unsigned char) temp;
-	}
-}
--- a/algo/swifftx/Swifftx_sha3.h
+++ b/algo/swifftx/Swifftx_sha3.h
@@ -1,79 +0,0 @@
-#ifndef SWIFFTX_SHA3_H
-#define SWIFFTX_SHA3_H
-
-#include "sha3_interface.h"
-#include "stdbool.h"
-#include "stdint.h"
-
-class Swifftx : public SHA3 {
-
-#define SWIFFTX_INPUT_BLOCK_SIZE 256
-#define SWIFFTX_OUTPUT_BLOCK_SIZE 65
-#define SWIF_HAIFA_SALT_SIZE 8
-#define SWIF_HAIFA_NUM_OF_BITS_SIZE 8
-#define SWIF_HAIFA_INPUT_BLOCK_SIZE (SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE \
-							  - SWIF_HAIFA_NUM_OF_BITS_SIZE - SWIF_HAIFA_SALT_SIZE)
-
-	typedef unsigned char BitSequence;
-//const DataLength SWIF_SALT_VALUE;
-
-#define SWIF_HAIFA_IV 0
-
-/*const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE];
-const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE];
-const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE];
-const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE];*/
-
-typedef enum 
-{ 
-	SUCCESS = 0,
-	FAIL = 1,
-	BAD_HASHBITLEN = 2,
-	BAD_SALT_SIZE = 3,
-	SET_SALT_VALUE_FAILED = 4,
-	INPUT_DATA_NOT_ALIGNED = 5
-} HashReturn;
-
-typedef struct hashState {
-	unsigned short hashbitlen;
-
-	// The data remained after the recent call to 'Update()'. 
-	BitSequence remaining[SWIF_HAIFA_INPUT_BLOCK_SIZE + 1];
-
-	// The size of the remaining data in bits.
-	// Is 0 in case there is no remaning data at all.
-	unsigned int remainingSize;
-
-	// The current output of the compression function. At the end will contain the final digest
-	// (which may be needed to be truncated, depending on hashbitlen).
-	BitSequence currOutputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE];
-
-	// The value of '#bits hashed so far' field in HAIFA, in base 256.
-	BitSequence numOfBitsChar[SWIF_HAIFA_NUM_OF_BITS_SIZE];
-
-	// The salt value currently in use:
-	BitSequence salt[SWIF_HAIFA_SALT_SIZE];
-
-	// Indicates whether a single 'Update()' occured. 
-	// Ater a call to 'Update()' the key and the salt values cannot be changed.
-	bool wasUpdated;
-} hashState;
-
-private:
-int swifftxNumRounds;
-hashState swifftxState;
-
-
-public:
-int Init(int hashbitlen);
-int Update(const BitSequence *data, DataLength databitlen);
-int Final(BitSequence *hashval);
-int Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, 
-				BitSequence *hashval);
-
-private:
-static void AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], unsigned short toAdd);
-
-};
-
-#endif
--- a/algo/swifftx/hash_interface.h
+++ b/algo/swifftx/hash_interface.h
@@ -1,21 +0,0 @@
-#pragma once
-
-#include <cstdint>
-
-namespace hash {
-
-using BitSequence = unsigned char;
-using DataLength = unsigned long long;
-
-struct hash_interface {
-    virtual ~hash_interface() = default;
-
-    virtual int Init(int hash_bitsize) = 0;
-    virtual int Update(const BitSequence *data, DataLength data_bitsize) = 0;
-    virtual int Final(BitSequence *hash) = 0;
-
-    virtual int
-    Hash(int hash_bitsize, const BitSequence *data, DataLength data_bitsize, BitSequence *hash) = 0;
-};
-
-} // namespace hash
--- a/algo/swifftx/sha3_interface.h
+++ b/algo/swifftx/sha3_interface.h
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <cstdint>
-//#include <streams/hash/hash_interface.h>
-#include "hash_interface.h"
-
-namespace sha3 {
-
-using BitSequence = hash::BitSequence;
-using DataLength = hash::DataLength;
-
-struct sha3_interface : hash::hash_interface {};
-
-} // namespace sha3
--- a/algo/x16/minotaur.c
+++ b/algo/x16/minotaur.c
@@ -27,7 +27,7 @@
 #else
  #include "algo/echo/sph_echo.h"
 #endif
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #else
  #include "algo/groestl/sph_groestl.h"
@@ -50,7 +50,7 @@ typedef struct TortureGarden TortureGarden;
 // Graph of hash algos plus SPH contexts
 struct TortureGarden
 {
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_groestl       groestl;
 #else
   sph_groestl512_context  groestl;
@@ -123,7 +123,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
 #endif
 	         break;
        case 5:
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            groestl512_full( &garden->groestl, hash, input, 512 );
 #else
            sph_groestl512_init( &garden->groestl) ;
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -1092,7 +1092,7 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case GROESTL:
-#if defined(__AES__)  // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
            groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
 #else
@@ -1173,7 +1173,7 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
            simd512_ctx( &ctx.simd, hash1, in1, size );
         break;
         case ECHO:
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            echo_full( &ctx.echo, hash0, 512, in0, size );
            echo_full( &ctx.echo, hash1, 512, in1, size );
 #else
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -218,7 +218,7 @@ union _x16r_2x64_context_overlay
 {
    blake512_2x64_context   blake;
    bmw512_2x64_context     bmw;
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_groestl       groestl;
 #else
    sph_groestl512_context  groestl;
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -1208,7 +1208,7 @@ union _x16rv2_2x64_context_overlay
 {
    blake512_2x64_context   blake;
    bmw512_2x64_context     bmw;
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_groestl       groestl;
 #else
    sph_groestl512_context  groestl;
@@ -1294,7 +1294,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case GROESTL:
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
            groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
 #else
@@ -1400,7 +1400,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
            simd512_ctx( &ctx.simd, hash1, in1, size );
         break;
         case ECHO:
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            echo_full( &ctx.echo, hash0, 512, in0, size );
            echo_full( &ctx.echo, hash1, 512, in1, size );
 #else
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -294,7 +294,6 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -938,7 +938,7 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
 #endif
 #include "algo/shabal/sph_shabal.h"
 #include "algo/haval/sph-haval.h"
-#if !( defined(__AES__) ) //|| defined(__ARM_FEATURE_AES) )
+#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
  #include "algo/groestl/sph_groestl.h"
 #endif
 #if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
@@ -950,7 +950,7 @@ union _x17_context_overlay
 {
        blake512_2x64_context   blake;
        bmw512_2x64_context     bmw;
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_groestl       groestl;
 #else
        sph_groestl512_context  groestl;
@@ -1000,7 +1000,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )

    dintrlv_2x64( hash0, hash1, vhash, 512 );

-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    groestl512_full( &ctx.groestl, hash0, hash0, 512 );
    groestl512_full( &ctx.groestl, hash1, hash1, 512 );
 #else
--- a/algo/x22/x22i.c
+++ b/algo/x22/x22i.c
@@ -5,15 +5,15 @@
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #if defined(__AES__)
-  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/fugue/fugue-aesni.h"
 #else
-  #include "algo/groestl/sph_groestl.h"
  #include "algo/fugue/sph_fugue.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
+  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #else
+  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/skein/sph_skein.h"
@@ -39,15 +39,15 @@ union _x22i_context_overlay
        blake512_context       blake;
        sph_bmw512_context     bmw;
 #if defined(__AES__)
-        hashState_groestl       groestl;
        hashState_fugue         fugue;
 #else
-        sph_groestl512_context  groestl;
        sph_fugue512_context    fugue;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
+        hashState_groestl       groestl;
        hashState_echo          echo;
 #else
+        sph_groestl512_context  groestl;
        sph_echo512_context     echo;
 #endif
        sph_jh512_context       jh;
@@ -81,7 +81,7 @@ int x22i_hash( void *output, const void *input, int thrid )
   sph_bmw512(&ctx.bmw, (const void*) hash, 64);
   sph_bmw512_close(&ctx.bmw, hash);

-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   groestl512_full( &ctx.groestl, hash, hash, 512 );
 #else
   sph_groestl512_init( &ctx.groestl );
--- a/algo/x22/x25x.c
+++ b/algo/x22/x25x.c
@@ -5,15 +5,15 @@
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #if defined(__AES__)
-  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/fugue/fugue-aesni.h"
 #else
-  #include "algo/groestl/sph_groestl.h"
  #include "algo/fugue/sph_fugue.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
+  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #else
+  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/skein/sph_skein.h"
@@ -42,15 +42,15 @@ union _x25x_context_overlay
        blake512_context        blake;
        sph_bmw512_context      bmw;
 #if defined(__AES__)
-        hashState_groestl       groestl;
        hashState_fugue         fugue;
 #else
-        sph_groestl512_context  groestl;
        sph_fugue512_context    fugue;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
+        hashState_groestl       groestl;
        hashState_echo          echo;
 #else
+        sph_groestl512_context  groestl;
        sph_echo512_context     echo;
 #endif
        sph_jh512_context       jh;
@@ -86,7 +86,7 @@ int x25x_hash( void *output, const void *input, int thrid )
   sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64);
   sph_bmw512_close(&ctx.bmw, &hash[1]);

-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 );
 #else
   sph_groestl512_init( &ctx.groestl );
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.13.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.14.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='23.13'
-PACKAGE_STRING='cpuminer-opt 23.13'
+PACKAGE_VERSION='23.14'
+PACKAGE_STRING='cpuminer-opt 23.14'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 23.13 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1432,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 23.13:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 23.14:";;
   esac
  cat <<\_ACEOF

@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 23.13
+cpuminer-opt configure 23.14
 generated by GNU Autoconf 2.71

 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 23.13, which was
+It was created by cpuminer-opt $as_me 23.14, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='23.13'
+ VERSION='23.14'


 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 23.13, which was
+This file was extended by cpuminer-opt $as_me 23.14, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 23.13
+cpuminer-opt config.status 23.14
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [23.13])
+AC_INIT([cpuminer-opt], [23.14])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 23.13.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 23.14.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='23.13'
-PACKAGE_STRING='cpuminer-opt 23.13'
+PACKAGE_VERSION='23.14'
+PACKAGE_STRING='cpuminer-opt 23.14'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 23.13 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 23.13:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 23.14:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 23.13
+cpuminer-opt configure 23.14
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 23.13, which was
+It was created by cpuminer-opt $as_me 23.14, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='23.13'
+ VERSION='23.14'


 cat >>confdefs.h <<_ACEOF
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 23.13, which was
+This file was extended by cpuminer-opt $as_me 23.14, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6784,7 +6784,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 23.13
+cpuminer-opt config.status 23.14
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -3666,11 +3666,6 @@ static int thread_create(struct thr_info *thr, void* func)

 void get_defconfig_path(char *out, size_t bufsize, char *argv0);

-
-#include "simd-utils.h"
-#include "algo/echo/aes_ni/hash_api.h"
-#include "compat/aes_helper.c"
-
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -322,6 +322,7 @@ static inline __m128i v128_neg1_fn()
 #define mm128_xim_32( v1, v0, c ) \
   _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
                                    _mm_castsi128_ps( v0 ), c ) )
+#define v128_xim32 mm128_xim_32

 // Examples of simple operations using xim:
 /*
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -68,7 +68,7 @@
 #define v128_mul32                    vmulq_u32
 #define v128_mul16                    vmulq_u16

-// slow, tested with argon2d
+// Widening, shuffle high element to align with Intel
 static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 {
   return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
@@ -86,7 +86,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )

 // Not yet needed
 //#define v128_cmpeq1
-
+// Signed
 #define v128_cmpgt64( v1, v0 )        vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
 #define v128_cmpgt32( v1, v0 )        vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
 #define v128_cmpgt16( v1, v0 )        vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
@@ -406,34 +406,15 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 v1 = vorrq_u32( v1, t1 ); \
 }

-// Cross lane shuffles, no programmable shuffle in NEON
-
-// vector mask, use as last resort. prefer rev, alignr, etc
+// vector mask, use as last resort. prefer tbl, rev, alignr, etc
 #define v128_shufflev32( v, vmask ) \
  v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \

-// compatible with x86_64, but very slow, avoid
 #define v128_shuffle8( v, vmask ) \
-   v128_set8( ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[15] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[14] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[13] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[12] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[11] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[10] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 9] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 8] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 7] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 6] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 5] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 4] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 3] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 2] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
-              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )
-
+     vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask );

 // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
 // Bit rotation already promotes faster widths. Usage is context sensitive.