Compare commits

...

6 Commits

Author      SHA1         Message   Date
Jay D Dee   9d3a46c355   v23.15    2023-11-30 14:36:47 -05:00
Jay D Dee   4e3f1b926f   v23.14    2023-11-28 00:58:43 -05:00
Jay D Dee   045b42babf   v23.13    2023-11-21 14:18:15 -05:00
Jay D Dee   fc696dbbe5   v23.12    2023-11-20 11:51:57 -05:00
Jay D Dee   f3fde95f27   v23.10    2023-11-15 11:05:41 -05:00
Jay D Dee   0a78013cbe   v23.9     2023-11-12 18:48:50 -05:00
97 changed files with 3946 additions and 2790 deletions

View File

@@ -250,6 +250,7 @@ cpuminer_SOURCES = \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x20r.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \
algo/x16/minotaur.c \

View File

@@ -87,7 +87,6 @@ Supported Algorithms
groestl Groestl coin
hex x16r-hex
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256dt
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
sha512256d
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
x16rt-veil veil
x16s
x17
x20r
x21s
x22i
x25x

View File

@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
- An x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
- Arm CPU supporting AArch64 and NEON.
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
32 bit CPUs are not supported.
ARM requirements (Beta):
Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
CPU: Armv8 and NEON, SHA2 & AES are optional
OS: Linux distribution built for AArch64.
Packages: source code only.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand; it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.
Beware of apps claiming "mobile only mining". There is no such thing; they aren't miners.
If a mobile CPU can mine it, any CPU can.
See wiki for details.
@@ -73,6 +75,47 @@ If not what makes it happen or not happen?
Change Log
----------
v23.15
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
ARM: Fugue AES optimizations enabled.
ARM: quark, qubit, x11gost algos optimized with NEON & AES.
v23.14
ARM: Groestl AES optimizations enabled.
All: Small optimization to Shabal 4way.
x86_64: Extended Shabal 4way support from SSE4.1 down to SSE2.
All: deleted some unused files.
v23.13
Added x20r algo.
Eliminated redundant hash order calculations for x16r family.
v23.12
Several bug fixes and speed improvements for the x16r family on all CPU architectures.
v23.11
This is a release candidate for full AArch64 support, marking the end of the Beta phase.
Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
v23.10
x86_64: Fixed scrypt & scryptn2 algos (SSE2).
Fixed sha512256d algo (AVX2, SSE2, NEON).
Fixed a bug in Skein N-way that reduced performance.
ARM: Skein optimized for NEON, SHA2 & SSE2.
Skein2 algo 2-way optimized for NEON & SSE2.
v23.9
x86_64: Fixed minotaurx crash, broken in v23.7.
ARM: #407 Fixed a compile error due to incorrect type casting of the vrev instruction argument.
v23.8
Cpuminer-opt is no longer dependent on OpenSSL.
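
The bulk of the diffs below replace raw SSE2 intrinsics (_mm_*) and the older mm128_* helpers with portable v128_* wrappers, so the same hash code builds for both x86_64/SSE2 and AArch64/NEON. A minimal sketch of how such a wrapper layer could be defined is shown here; the mappings are illustrative assumptions covering a few of the operations used in these hunks, not the project's actual simd-utils definitions.

#include <stdint.h>
#if defined(__SSE2__)
  #include <emmintrin.h>
  typedef __m128i v128_t;                    // 128-bit vector, treated as 4 x uint32
  #define v128_xor( a, b )    _mm_xor_si128( a, b )
  #define v128_add32( a, b )  _mm_add_epi32( a, b )
  #define v128_sl32( a, c )   _mm_slli_epi32( a, c )
  #define v128_sr32( a, c )   _mm_srli_epi32( a, c )
  // rotate left: shift left, then OR back in the bits that fell off the top
  #define v128_rol32( a, c )  _mm_or_si128( _mm_slli_epi32( a, c ), _mm_srli_epi32( a, 32-(c) ) )
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_t;
  #define v128_xor( a, b )    veorq_u32( a, b )
  #define v128_add32( a, b )  vaddq_u32( a, b )
  #define v128_sl32( a, c )   vshlq_n_u32( a, c )
  #define v128_sr32( a, c )   vshrq_n_u32( a, c )
  // NEON rotate idiom: shift left, then shift-right-and-insert the low bits
  #define v128_rol32( a, c )  vsriq_n_u32( vshlq_n_u32( a, c ), a, 32-(c) )
#endif

With a layer like this, the per-algorithm code (Blake, BMW, Fugue, Echo, etc.) is written once against the v128_* names and compiles unchanged on either architecture, which is what the rename-heavy hunks in this compare are doing.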

View File

@@ -368,6 +368,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
case ALGO_X17: rc = register_x17_algo ( gate ); break;
case ALGO_X20R: rc = register_x20r_algo ( gate ); break;
case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;

View File

@@ -99,7 +99,7 @@ typedef uint32_t set_t;
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
// AVX10 does not have explicit algo features:
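
These defines are bit flags describing which instruction-set extensions an algorithm implementation can exploit; a gate ORs together the flags for the code paths it ships, as the bmw512 gate later in this compare does with SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT. A minimal sketch of the flag arithmetic, with assumed variable names and values used only for illustration:

uint32_t algo_features = SSE2_OPT | AVX2_OPT | SHA512_OPT;  // what the algo code can use
uint32_t cpu_features  = SSE2_OPT | SHA512_OPT;             // what the CPU reports (assumed)

// A code path is eligible only if both the algo and the CPU advertise the feature.
bool sha512_path = ( algo_features & cpu_features & SHA512_OPT ) != 0;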

View File

@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
blakehash_4way( hash, vdata );

View File

@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
@@ -931,14 +931,14 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
#else

View File

@@ -475,11 +475,12 @@ void blake512_update(blake512_context *sc, const void *data, size_t len)
void blake512_close( blake512_context *sc, void *dst )
{
unsigned char buf[128] __attribute__((aligned(32)));
size_t ptr;
size_t ptr, k;
unsigned bit_len;
uint64_t th, tl;
ptr = sc->ptr;
memcpy( buf, sc->buf, ptr );
bit_len = ((unsigned)ptr << 3);
buf[ptr] = 0x80;
tl = sc->T0 + bit_len;
@@ -519,7 +520,8 @@ void blake512_close( blake512_context *sc, void *dst )
blake512_update( sc, buf, 128 );
}
v128_block_bswap64_512( dst, sc->H );
for ( k = 0; k < 8; k ++ )
((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
}
void blake512_full( blake512_context *sc, void *dst, const void *data,

View File

@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blakecoin_4way_hash( hash, vdata );

View File

@@ -131,47 +131,7 @@
V[7] = v128_alignr64( V6, V7, 1 ); \
}
/*
#elif defined(__SSE2__)
// always true
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
v128_t *V = (v128_t*)v; \
v128_t V2, V3, V6, V7; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
*/
#else
// never used, SSE2 is always available
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))

View File

@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
*/
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 3) ), \
_mm_xor_si128( mm128_rol_32( (x), 4), \
mm128_rol_32( (x), 19) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 3) ), \
v128_xor( v128_rol32( (x), 4), \
v128_rol32( (x), 19) ) )
#define ss1(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 8), \
mm128_rol_32( (x), 23) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 8), \
v128_rol32( (x), 23) ) )
#define ss2(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 1) ), \
_mm_xor_si128( mm128_rol_32( (x), 12), \
mm128_rol_32( (x), 25) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 1) ), \
v128_xor( v128_rol32( (x), 12), \
v128_rol32( (x), 25) ) )
#define ss3(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 15), \
mm128_rol_32( (x), 29) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 15), \
v128_rol32( (x), 29) ) )
#define ss4(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
v128_xor( (x), v128_sr32( (x), 1 ) )
#define ss5(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
v128_xor( (x), v128_sr32( (x), 2 ) )
#define rs1(x) mm128_rol_32( x, 3 )
#define rs2(x) mm128_rol_32( x, 7 )
#define rs3(x) mm128_rol_32( x, 13 )
#define rs4(x) mm128_rol_32( x, 16 )
#define rs5(x) mm128_rol_32( x, 19 )
#define rs6(x) mm128_rol_32( x, 23 )
#define rs7(x) mm128_rol_32( x, 27 )
#define rs1(x) v128_rol32( x, 3 )
#define rs2(x) v128_rol32( x, 7 )
#define rs3(x) v128_rol32( x, 13 )
#define rs4(x) v128_rol32( x, 16 )
#define rs5(x) v128_rol32( x, 19 )
#define rs6(x) v128_rol32( x, 23 )
#define rs7(x) v128_rol32( x, 27 )
#define rol_off_32( M, j, off ) \
mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
v128_xor( \
v128_add32( \
v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
#define expand2s( qt, M, H, i) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
// resulting in some sign changes compared to the reference code.
#define Ws0 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_add32( v128_xor( M[13], H[13] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws1 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 6], H[ 6] ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_xor( M[11], H[11] ) ), \
v128_sub32( v128_xor( M[14], H[14] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws2 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws3 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_sub32( v128_xor( M[10], H[10] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws4 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws5 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws6 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 4], H[ 4] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_sub32( v128_xor( M[11], H[11] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws7 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_add32( v128_xor( M[12], H[12] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws8 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[13], H[13] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws9 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws10 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws11 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 9], H[ 9] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 9], H[ 9] ) ) )
#define Ws12 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[10], H[10] ) ) )
v128_sub32( \
v128_sub32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[10], H[10] ) ) )
#define Ws13 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[11], H[11] ) ) )
v128_add32( \
v128_add32( \
v128_add32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_add32( v128_xor( M[10], H[10] ), \
v128_xor( M[11], H[11] ) ) )
#define Ws14 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[12], H[12] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[12], H[12] ) ) )
#define Ws15 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[ 4], H[4] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[ 4], H[4] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[13], H[13] ) ) )
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
{
__m128i qt[32], xl, xh; \
v128u64_t qt[32], xl, xh; \
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
qt[10] = v128_add32( ss0( Ws10), H[11] );
qt[11] = v128_add32( ss1( Ws11), H[12] );
qt[12] = v128_add32( ss2( Ws12), H[13] );
qt[13] = v128_add32( ss3( Ws13), H[14] );
qt[14] = v128_add32( ss4( Ws14), H[15] );
qt[15] = v128_add32( ss0( Ws15), H[ 0] );
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );
xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm_xor_si128( xl, _mm_xor_si128(
mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = v128_xor( xl, v128_xor(
v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
_mm_srli_epi32( qt[16], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm_add_epi32(
_mm_xor_si128( M[1],
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
_mm_slli_epi32( qt[17], 8 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm_add_epi32(
_mm_xor_si128( M[2],
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
_mm_slli_epi32( qt[18], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm_add_epi32(
_mm_xor_si128( M[3],
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
_mm_slli_epi32( qt[19], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm_add_epi32(
_mm_xor_si128( M[4],
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
_mm_slli_epi32( qt[20], 0 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm_add_epi32(
_mm_xor_si128( M[5],
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
_mm_srli_epi32( qt[21], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm_add_epi32(
_mm_xor_si128( M[6],
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
_mm_slli_epi32( qt[22], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm_add_epi32(
_mm_xor_si128( M[7],
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
_mm_slli_epi32( qt[23], 2 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[4], 9 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
_mm_xor_si128( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[5], 10 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
_mm_xor_si128( qt[16], qt[ 9] ) ) );
dH[10] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[6], 11 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
_mm_xor_si128( qt[17], qt[10] ) ) );
dH[11] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[7], 12 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
_mm_xor_si128( qt[18], qt[11] ) ) );
dH[12] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[0], 13 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
_mm_xor_si128( qt[19], qt[12] ) ) );
dH[13] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[1], 14 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
_mm_xor_si128( qt[20], qt[13] ) ) );
dH[14] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[2], 15 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
_mm_xor_si128( qt[21], qt[14] ) ) );
dH[15] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[3], 16 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
_mm_xor_si128( qt[22], qt[15] ) ) );
dH[ 0] = v128_add32(
v128_xor( M[0],
v128_xor( v128_sl32( xh, 5 ),
v128_sr32( qt[16], 5 ) ) ),
v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
dH[ 1] = v128_add32(
v128_xor( M[1],
v128_xor( v128_sr32( xh, 7 ),
v128_sl32( qt[17], 8 ) ) ),
v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
dH[ 2] = v128_add32(
v128_xor( M[2],
v128_xor( v128_sr32( xh, 5 ),
v128_sl32( qt[18], 5 ) ) ),
v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
dH[ 3] = v128_add32(
v128_xor( M[3],
v128_xor( v128_sr32( xh, 1 ),
v128_sl32( qt[19], 5 ) ) ),
v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
dH[ 4] = v128_add32(
v128_xor( M[4],
v128_xor( v128_sr32( xh, 3 ),
v128_sl32( qt[20], 0 ) ) ),
v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
dH[ 5] = v128_add32(
v128_xor( M[5],
v128_xor( v128_sl32( xh, 6 ),
v128_sr32( qt[21], 6 ) ) ),
v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
dH[ 6] = v128_add32(
v128_xor( M[6],
v128_xor( v128_sr32( xh, 4 ),
v128_sl32( qt[22], 6 ) ) ),
v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
dH[ 7] = v128_add32(
v128_xor( M[7],
v128_xor( v128_sr32( xh, 11 ),
v128_sl32( qt[23], 2 ) ) ),
v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
dH[ 8] = v128_add32( v128_add32(
v128_rol32( dH[4], 9 ),
v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
v128_xor( v128_sl32( xl, 8 ),
v128_xor( qt[23], qt[ 8] ) ) );
dH[ 9] = v128_add32( v128_add32(
v128_rol32( dH[5], 10 ),
v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
v128_xor( v128_sr32( xl, 6 ),
v128_xor( qt[16], qt[ 9] ) ) );
dH[10] = v128_add32( v128_add32(
v128_rol32( dH[6], 11 ),
v128_xor( v128_xor( xh, qt[26] ), M[10] )),
v128_xor( v128_sl32( xl, 6 ),
v128_xor( qt[17], qt[10] ) ) );
dH[11] = v128_add32( v128_add32(
v128_rol32( dH[7], 12 ),
v128_xor( v128_xor( xh, qt[27] ), M[11] )),
v128_xor( v128_sl32( xl, 4 ),
v128_xor( qt[18], qt[11] ) ) );
dH[12] = v128_add32( v128_add32(
v128_rol32( dH[0], 13 ),
v128_xor( v128_xor( xh, qt[28] ), M[12] )),
v128_xor( v128_sr32( xl, 3 ),
v128_xor( qt[19], qt[12] ) ) );
dH[13] = v128_add32( v128_add32(
v128_rol32( dH[1], 14 ),
v128_xor( v128_xor( xh, qt[29] ), M[13] )),
v128_xor( v128_sr32( xl, 4 ),
v128_xor( qt[20], qt[13] ) ) );
dH[14] = v128_add32( v128_add32(
v128_rol32( dH[2], 15 ),
v128_xor( v128_xor( xh, qt[30] ), M[14] )),
v128_xor( v128_sr32( xl, 7 ),
v128_xor( qt[21], qt[14] ) ) );
dH[15] = v128_add32( v128_add32(
v128_rol32( dH[3], 16 ),
v128_xor( v128_xor( xh, qt[31] ), M[15] )),
v128_xor( v128_sr32( xl, 2 ),
v128_xor( qt[22], qt[15] ) ) );
}
static const uint32_t final_s[16][4] =
@@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
static const __m128i final_s[16] =
static const v128u64_t final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
@@ -451,26 +451,26 @@ static const __m128i final_s[16] =
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = v128_64( 0x4041424340414243 );
ctx->H[ 1] = v128_64( 0x4445464744454647 );
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = v128_64( 0x5051525350515253 );
ctx->H[ 5] = v128_64( 0x5455565754555657 );
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = v128_64( 0x6061626360616263 );
ctx->H[ 9] = v128_64( 0x6465666764656667 );
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = v128_64( 0x7071727370717273 );
ctx->H[13] = v128_64( 0x7475767774757677 );
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
// sc->H[i] = _mm_set1_epi32( iv[i] );
// sc->H[i] = v128_32( iv[i] );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
__m128i *vdata = (__m128i*)data;
__m128i *buf;
__m128i htmp[16];
__m128i *h1, *h2;
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
@@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
vdata += ( clen >> 2 );
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
v128u64_t *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
@@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
v128_memcpy( sc->H, h1, 16 );
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
__m128i *buf;
__m128i h1[16], h2[16], *h;
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
ptr += 4;
h = sc->H;
// assume bit_count fits in 32 bits
if ( ptr > buf_size - 4 )
{
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = m128_zero;
v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = v128_zero;
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (__m128i*)final_s, h1 );
compress_small( buf, (v128u64_t*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_m128i( dst, u ) = h1[v];
casti_v128( dst, u ) = h1[v];
}
/*

View File

@@ -2,12 +2,11 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
//#include "sph_keccak.h"
#include "bmw-hash-4way.h"
#if defined(BMW512_8WAY)
void bmw512hash_8way(void *state, const void *input)
void bmw512hash_8way( void *state, const void *input )
{
bmw512_8way_context ctx;
bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
const int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
#elif defined(BMW512_4WAY)
//#ifdef BMW512_4WAY
void bmw512hash_4way(void *state, const void *input)
void bmw512hash_4way( void *state, const void *input )
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input )
{
bmw512_2x64_context ctx;
bmw512_2x64_init( &ctx );
bmw512_2x64_update( &ctx, input, 80 );
bmw512_2x64_close( &ctx, state );
}
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
v128_bswap32_intrlv80_2x64( vdata, pdata );
do {
*noncev = v128_intrlv_blend_32( v128_bswap32(
v128_set32( n+1, 0, n, 0 ) ), *noncev );
bmw512hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -2,7 +2,7 @@
bool register_bmw512_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 256.0;
#if defined (BMW512_8WAY)
gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
#elif defined (BMW512_4WAY)
gate->scanhash = (void*)&scanhash_bmw512_4way;
gate->hash = (void*)&bmw512hash_4way;
#elif defined (BMW512_2WAY)
gate->scanhash = (void*)&scanhash_bmw512_2x64;
gate->hash = (void*)&bmw512hash_2x64;
#else
gate->scanhash = (void*)&scanhash_bmw512;
gate->hash = (void*)&bmw512hash;

View File

@@ -8,19 +8,27 @@
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define BMW512_2WAY 1
#endif
#if defined(BMW512_8WAY)
void bmw512hash_8way( void *state, const void *input );
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_4WAY)
void bmw512hash_4way( void *state, const void *input );
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input );
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else

View File

@@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
}
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
HashReturn init_echo( hashState_echo *ctx, int nHashSize )
{
int i, j;
@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
return SUCCESS;
}
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
HashReturn update_echo( hashState_echo *state, const void *data,
uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
return SUCCESS;
}
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
HashReturn final_echo( hashState_echo *state, void *hashval)
{
v128_t remainingbits;
@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
return SUCCESS;
}
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
return SUCCESS;
}
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength datalen )
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t datalen )
{
int i, j;
@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
if( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
memcpy(state->buffer, data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
#if 0
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
HashReturn hRet;
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
return SUCCESS;
}
#endif
#endif

View File

@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
HashReturn reinit_echo(hashState_echo *state);
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
HashReturn final_echo(hashState_echo *state, void *hashval);
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength databitlen );
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen );
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t databitlen );
#endif // HASH_API_H

View File

@@ -36,7 +36,6 @@
#include "sph_echo.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif
#endif // !AES

View File

@@ -36,8 +36,6 @@
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif

View File

@@ -15,237 +15,176 @@
*
*/
#if defined(__AES__)
#include <x86intrin.h>
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#include <memory.h>
#include "fugue-aesni.h"
static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
{ 0x0202010807020100, 0x0a05000f06010c0b };
MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
{ 0x0b0d080703060504, 0x0e0a090c050e0f0a };
static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
{ 0x0402060c070d0003, 0x090a060580808080 };
MYALIGN const unsigned int _IV512[] = {
0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
{ 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
{ 0x07020d0880808080, 0x0b06010c050e0f0a };
static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
{ 0x000f0a050c0b0601, 0x0302020404030e09 };
static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
{ 0x07020d08080e0d0d, 0x07070908050e0f0a };
static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
{ 0x0706050403020000, 0x0302000007060504 };
static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
{ 0x010c0b060d080702, 0x0904030e03000104 };
static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
{ 0x8080808080808080, 0x0504070605040f06 };
static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
{ 0x0b0e0104070a0d00, 0x0306090c0f020508 };
static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
{ 0x000000001b1b0000, 0x0000000000000000 };
static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
{ 0x000000002d361b00, 0x0000000000000000 };
static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
{ 0x0303030303030303, 0x0303030303030303 };
static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
{ 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
};
#if defined(__SSE4_1__)
#if defined(__ARM_NEON)
#define PACK_S0(s0, s1, t1)\
s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
#define mask_1000(v) v128_put32( v, 0, 3 )
#define UNPACK_S0(s0, s1, t1)\
s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
s0 = mm128_mask_32( s0, 8 )
static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = s1;\
t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1);
static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
#else // SSE2
static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
#define PACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
s0 = _mm_xor_si128(s0, t1);
static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
#define UNPACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
s0 = mm128_mask_32( s0, 8 )
#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = _mm_shuffle_epi32(s1, 0xf9);\
t2 = _mm_shuffle_epi32(s2, 0xcf);\
t1 = _mm_xor_si128(t1, t2);\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1)
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = vqtbl1q_u8( s1, MASK_3321 ); \
t2 = vqtbl1q_u8( s2, MASK_3033 ); \
t1 = v128_xor( t1, t2 ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#elif defined(__SSE4_1__)
#define mask_1000(v) v128_mask32( v, 8 )
#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = s1; \
t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#endif
#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s10 = _mm_xor_si128(s10, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1)
#define PACK_S0( s0, s1, t1 ) \
s0 = v128_movlane32( s0, 3, s1, 0 )
#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s16 = _mm_xor_si128(s16, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1)
#define UNPACK_S0( s0, s1, t1 ) \
s1 = v128_movlane32( s1, 0, s0, 3 ); \
s0 = mask_1000( s0 )
#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s22 = _mm_xor_si128(s22, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1)
t1 = shuffle_3303( s0 ); \
s22 = v128_xor(s22, t1);\
t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
s0 = v128_movlane32( s0, 0, t1, 0 ); \
t1 = v128_alignr64( t1, v128_zero, 1 ); \
s8 = v128_xor(s8, t1);\
t1 = shuffle_3303( s24 ); \
s0 = v128_xor(s0, t1);\
t1 = shuffle_3303( s27 ); \
s4 = v128_xor(s4, t1);\
t1 = shuffle_3303( s30 ); \
s7 = v128_xor(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\
s2 = _mm_add_epi8(x, x);\
t2 = _mm_add_epi8(s2, s2);\
t1 = _mm_srli_epi16(x, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
#define SUBSTITUTE( r0, _t2 ) \
_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
_t2 = v128_aesenclast_nokey( _t2 )
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = mm128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t4 = mm128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\
t4 = _mm_xor_si128(t4, t0);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3)
t3 = v128_add8( t0, t0 ); \
t4 = v128_add8( t3, t3 ); \
t1 = v128_sr16( t0, 6 ); \
t1 = v128_and( t1, _lsbmask2 ); \
t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
t4 = v128_shuffle8( t2, _supermix1b ); \
t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
t1 = v128_shuffle8( t4, _supermix1c ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t4, _supermix1d ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t2, _supermix1a ); \
t2 = v128_xor3( t2, t3, t0 ); \
t2 = v128_shuffle8( t2, _supermix7a ); \
t4 = v128_xor3( t4, t1, t2 ); \
t2 = v128_shuffle8( t2, _supermix7b ); \
t3 = v128_shuffle8( t3, _supermix2a ); \
t1 = v128_shuffle8( t0, _supermix4a ); \
t0 = v128_shuffle8( t0, _supermix4b ); \
t4 = v128_xor3( t4, t2, t1 ); \
t0 = v128_xor( t0, t3 ); \
t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
_t0 = shuffle_0321( r1c ); \
r2c = v128_xor(r2c, _t0);\
_t0 = mask_1000( _t0 ); \
r2d = v128_xor(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
_t0 = shuffle_0321( r2c ); \
r3c = v128_xor(r3c, _t0);\
_t0 = mask_1000( _t0 ); \
r3d = v128_xor(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\
_t0 = shuffle_0321( r3c ); \
r4c = v128_xor(r4c, _t0);\
_t0 = mask_1000( _t0 ); \
r4d = v128_xor(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
block[1] = col[(base + a + 1) % s];\
block[2] = col[(base + a + 2) % s];\
block[3] = col[(base + a + 3) % s];\
x = _mm_load_si128((__m128i*)block)
x = v128_load( (v128_t*)block )
#define STORECOLUMN(x, s)\
_mm_store_si128((__m128i*)block, x);\
v128_store((v128_t*)block, x );\
col[(base + 0) % s] = block[0];\
col[(base + 1) % s] = block[1];\
col[(base + 2) % s] = block[2];\
col[(base + 3) % s] = block[3]
void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
unsigned int uBlockCount )
{
__m128i _t0, _t1, _t2, _t3;
v128_t _t0, _t1, _t2, _t3;
switch(ctx->base)
{
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
pmsg += 4;
uBlockCount--;
}
}
void Final512(hashState_fugue *ctx, BitSequence *hashval)
void Final512( hashState_fugue *ctx, uint8_t *hashval )
{
unsigned int block[4] __attribute__ ((aligned (32)));
unsigned int col[36] __attribute__ ((aligned (16)));
unsigned int i, base;
__m128i r0, _t0, _t1, _t2, _t3;
v128_t r0, _t0, _t1, _t2, _t3;
for(i = 0; i < 12; i++)
for( i = 0; i < 12; i++ )
{
_mm_store_si128((__m128i*)block, ctx->state[i]);
v128_store( (v128_t*)block, ctx->state[i] );
col[3 * i + 0] = block[0];
col[3 * i + 1] = block[1];
col[3 * i + 2] = block[2];
}
base = (36 - (12 * ctx->base)) % 36;
base = ( 36 - (12 * ctx->base) ) % 36;
for(i = 0; i < 32; i++)
for( i = 0; i < 32; i++ )
{
// ROR3
base = (base + 33) % 36;
// CMIX
col[(base + 0) % 36] ^= col[(base + 4) % 36];
col[(base + 1) % 36] ^= col[(base + 5) % 36];
col[(base + 2) % 36] ^= col[(base + 6) % 36];
col[(base + 18) % 36] ^= col[(base + 4) % 36];
col[(base + 19) % 36] ^= col[(base + 5) % 36];
col[(base + 20) % 36] ^= col[(base + 6) % 36];
col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
for(i = 0; i < 13; i++)
for( i = 0; i < 13; i++ )
{
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 28) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR8
base = (base + 28) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
LOADCOLUMN(r0, 36, 1);
_mm_store_si128((__m128i*)hashval, r0);
LOADCOLUMN( r0, 36, 1 );
v128_store( (v128_t*)hashval, r0 );
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
LOADCOLUMN(r0, 36, 9);
_mm_store_si128((__m128i*)hashval + 1, r0);
LOADCOLUMN( r0, 36, 9 );
v128_store( (v128_t*)hashval + 1, r0 );
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
LOADCOLUMN(r0, 36, 18);
_mm_store_si128((__m128i*)hashval + 2, r0);
LOADCOLUMN( r0, 36, 18 );
v128_store( (v128_t*)hashval + 2, r0 );
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
LOADCOLUMN(r0, 36, 27);
_mm_store_si128((__m128i*)hashval + 3, r0);
LOADCOLUMN( r0, 36, 27 );
v128_store( (v128_t*)hashval + 3, r0 );
}
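The ROR3 / ROR9 / ROR8 steps in Final512 never move data: the 36-word column state is rotated by advancing the base index modulo 36, so a right rotation by r becomes base = ( base + 36 - r ) % 36, which is where the +33, +27 and +28 above come from. A one-line scalar model of that bookkeeping:
// Rotating a circular buffer of N words right by r is an index shift:
// after base = ( base + N - r ) % N, element i is col[ ( base + i ) % N ].
static inline unsigned ror_base( unsigned base, unsigned r, unsigned N )
{
   return ( base + N - r ) % N;   // N = 36, r = 3  ->  base + 33
}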
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
{
int i;
ctx->processed_bits = 0;
@@ -485,20 +424,20 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
ctx->uBlockLength = 4;
for(i = 0; i < 6; i++)
ctx->state[i] = m128_zero;
ctx->state[i] = v128_zero;
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
ctx->state[6] = casti_v128( _IV512, 0 );
ctx->state[7] = casti_v128( _IV512, 1 );
ctx->state[8] = casti_v128( _IV512, 2 );
ctx->state[9] = casti_v128( _IV512, 3 );
ctx->state[10] = casti_v128( _IV512, 4 );
ctx->state[11] = casti_v128( _IV512, 5 );
return SUCCESS;
return 0;
}
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
if(state->uBufferBytes != 0)
{
// Fill the buffer
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
memcpy( state->buffer + state->uBufferBytes, (void*)data,
state->uBlockLength - state->uBufferBytes );
// Process the buffer
Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
state->uBufferBytes += uByteLength;
}
return SUCCESS;
return 0;
}
HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
int fugue512_Final( hashState_fugue *state, void *hashval )
{
unsigned int i;
BitSequence lengthbuf[8] __attribute__((aligned(64)));
uint8_t lengthbuf[8] __attribute__((aligned(64)));
// Update message bit count
state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
// Finalization
Final512(state, hashval);
return SUCCESS;
return 0;
}
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen )
{
fugue512_Init(hs, 512);
fugue512_Update(hs, data, databitlen*8);
fugue512_Final(hs, hashval);
return SUCCESS;
fugue512_Init( hs, 512 );
fugue512_Update( hs, data, databitlen*8 );
fugue512_Final( hs, hashval );
return 0;
}
#endif // AES

View File

@@ -14,37 +14,31 @@
#ifndef FUGUE_HASH_API_H
#define FUGUE_HASH_API_H
#if defined(__AES__)
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#if !defined(__SSE4_1__)
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "compat/sha3_common.h"
#include "simd-utils.h"
typedef struct
{
__m128i state[12];
v128_t state[12];
unsigned int base;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
DataLength processed_bits;
BitSequence buffer[4];
uint64_t processed_bits;
uint8_t buffer[4];
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated; use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
int fugue512_Init( hashState_fugue *state, int hashbitlen );
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen );
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
int fugue512_Final( hashState_fugue *state, void *hashval );
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen);
#endif // AES
#endif // HASH_API_H

View File

@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
casti_m256i( b, 0 ) );
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
casti_m128i( b, 0 ) );
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
casti_m128i( b, 1 ) );
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
casti_m128i( b, 2 ) );
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
casti_m128i( b, 3 ) );
#elif defined(__SSE2__) || defined(__ARM_NEON)
casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
casti_v128( b, 0 ) );
casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
casti_v128( b, 1 ) );
casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
casti_v128( b, 2 ) );
casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
casti_v128( b, 3 ) );
#else
const unsigned long long *A=a, *B=b;
unsigned long long *C=c;

View File

@@ -60,21 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
//#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
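Both definitions of gr_shuffle32 perform the same dword permutation: the immediate 0xd8 = 0b11'01'10'00 reads source dwords 0, 2, 1, 3, and the NEON byte table 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c makes vqtbl1q_u8 gather bytes 0-3, 8-11, 4-7, 12-15, which is the same order. A scalar reference of the permutation, using nothing beyond the masks shown:
#include <stdint.h>
// Reference model of gr_shuffle32: dst dwords = src dwords { 0, 2, 1, 3 }.
static inline void gr_shuffle32_ref( uint32_t d[4], const uint32_t s[4] )
{
   d[0] = s[0];  d[1] = s[2];  d[2] = s[1];  d[3] = s[3];
}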
#define tos(a) #a
#define tostr(a) tos(a)
@@ -301,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = v128_xor(b0, b0);\
a0 = v128_aesenclast(a0, b0);\
a1 = v128_aesenclast(a1, b0);\
a2 = v128_aesenclast(a2, b0);\
a3 = v128_aesenclast(a3, b0);\
a4 = v128_aesenclast(a4, b0);\
a5 = v128_aesenclast(a5, b0);\
a6 = v128_aesenclast(a6, b0);\
a7 = v128_aesenclast(a7, b0);\
a0 = v128_aesenclast_nokey( a0 ); \
a1 = v128_aesenclast_nokey( a1 ); \
a2 = v128_aesenclast_nokey( a2 ); \
a3 = v128_aesenclast_nokey( a3 ); \
a4 = v128_aesenclast_nokey( a4 ); \
a5 = v128_aesenclast_nokey( a5 ); \
a6 = v128_aesenclast_nokey( a6 ); \
a7 = v128_aesenclast_nokey( a7 ); \
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
}
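SubBytes only needs the AES byte substitution and row shift, so the explicit zeroed round key ( b0 = b0 ^ b0 ) is folded into a no-key helper. A plausible sketch of such a helper, assuming these are not the project's exact definitions:
// Hypothetical v128_aesenclast_nokey: one AES final round with an all-zero
// round key (AddRoundKey with zero is a no-op).
#if defined(__AES__)
  #define v128_aesenclast_nokey( x ) \
          _mm_aesenclast_si128( x, _mm_setzero_si128() )
#elif defined(__ARM_FEATURE_AES)
  // AESE = AddRoundKey, SubBytes, ShiftRows; with a zero key this equals the
  // x86 final round because SubBytes and ShiftRows commute.
  #define v128_aesenclast_nokey( x ) \
          vaeseq_u8( (uint8x16_t)(x), vdupq_n_u8( 0 ) )
#endif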
#define ROUNDS_P(){\
@@ -329,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
/* SubBytes + MixBytes */\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
\
/* AddRoundConstant P1024 */\
xmm0 = v128_xor( xmm0, \
casti_v128( round_const_p, round_counter+1 ) ); \
@@ -434,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
t1 = v128_unpackhi16(t1, i3);\
i2 = v128_unpacklo16(i2, i3);\
i0 = v128_unpacklo16(i0, i1);\
\
/* shuffle with immediate */\
t0 = gr_shuffle32( t0 ); \
t1 = gr_shuffle32( t1 ); \
@@ -444,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
i2 = gr_shuffle32( i2 ); \
i4 = gr_shuffle32( i4 ); \
i6 = gr_shuffle32( i6 ); \
\
/* continue with unpack */\
t4 = i0;\
i0 = v128_unpacklo32(i0, i2);\
@@ -551,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
/* transpose done */\
}/**/
#if 0
// not used
void INIT( v128_t* chaining )
{
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -580,6 +573,7 @@ void INIT( v128_t* chaining )
chaining[6] = xmm14;
chaining[7] = xmm15;
}
#endif
void TF1024( v128_t* chaining, const v128_t* message )
{

View File

@@ -1,3 +1,6 @@
#if !defined GROESTL256_INTR_AES_H__
#define GROESTL256_INTR_AES_H__
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
#define tos(a) #a
#define tostr(a) tos(a)
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
chaining[3] = xmm11;
}
#endif

View File

@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
const int hash_offset = SIZE512 - hashlen_m128i;
uint64_t blocks = len / SIZE512;
v128_t* in = (v128_t*)input;
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output

View File

@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
#define groestl512_full groestl512
#define groestl512_ctx groestl512
#endif /* __hash_h */

View File

@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
a1 = _mm256_xor_si256( a1, b1 );\
a2 = _mm256_xor_si256( a2, b1 );\

View File

@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
myriad_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -35,8 +35,6 @@
#include "sph_groestl.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif // !AES
#endif

View File

@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "compat/sph_types.h"
#if !defined(__AES__)
/**
* Output size (in bits) for Groestl-224.
*/
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif
#endif // !AES
#endif

View File

@@ -38,7 +38,7 @@
#include <stddef.h>
#include "simd-utils.h"
// SSE2 or NEON Hamsi-512 2x64
#if defined(__SSE4_2__) || defined(__ARM_NEON)
typedef struct
{
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
size_t len );
void hamsi512_2x64( void *dst, const void *data, size_t len );
#endif
#if defined (__AVX2__)
// Hamsi-512 4x64

View File

@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_2WAY)
void keccakhash_2x64(void *state, const void *input)
{
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 32-bit word 7 of lane 0: high half of 64-bit word 3
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
keccakhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ))
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
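In the 2x64 kernels the two 80-byte headers are interleaved 64 bits at a time, so noncev (128-bit element 9 of vdata) holds 64-bit word 9 of both lanes, and the nonce, header word 19, sits in the upper 32 bits of each. Assuming v128_set32 lists elements high to low, the blend seeds the lanes with n and n+1, and the per-32-bit-lane add of the broadcast constant 0x0000000200000000 bumps both nonces by 2 per pass. A scalar model of just that bookkeeping:
// Scalar model of the 2-lane nonce update (illustrative only).
uint64_t lane0 = (uint64_t) n        << 32;   // nonce in the high 32 bits
uint64_t lane1 = (uint64_t)( n + 1 ) << 32;
while ( n < max_nonce - 2 )
{
   // hash both lane headers here ...
   lane0 += 0x0000000200000000ULL;            // n     -> n + 2
   lane1 += 0x0000000200000000ULL;            // n + 1 -> n + 3
   n += 2;
}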

View File

@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
#if defined (SHA3D_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
#elif defined (SHA3D_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#elif defined (SHA3D_2WAY)
gate->scanhash = (void*)&scanhash_sha3d_2x64;
gate->hash = (void*)&sha3d_hash_2x64;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;

View File

@@ -8,6 +8,16 @@
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA3D_2WAY 1
#endif
extern int hard_coded_eb;
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_2WAY)
void keccakhash_2x64( void *state, const void *input );
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA3D_8WAY)
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64( void *state, const void *input );
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
#if defined(SHA3D_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_4WAY)
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, buffer );
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, buffer, 32 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 32-bit word 7 of lane 0: high half of 64-bit word 3
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -75,16 +75,16 @@
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
v128_t t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a1 = v128_xnor( a1, a0 ); \
a0 = t; \
}
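The AVX-512 form of SUBCRUMB folds a1 = a1 XNOR ( a3 & t ) into a single ternary-logic instruction; the 8-bit immediate is just the truth table of that expression over the eight (a1, a3, t) combinations, which evaluates to 0x87. A small stand-alone check of that derivation:
// Derive a VPTERNLOG immediate: bit ( a*4 + b*2 + c ) of the immediate is
// f( a, b, c ). For f = NOT( a ^ ( b & c ) ) the result is 0x87.
static inline unsigned ternlog_imm( void )
{
   unsigned imm = 0;
   for ( unsigned a = 0; a < 2; a++ )
   for ( unsigned b = 0; b < 2; b++ )
   for ( unsigned c = 0; c < 2; c++ )
      imm |= ( ~( a ^ ( b & c ) ) & 1u ) << ( a*4 + b*2 + c );
   return imm;   // 0x87
}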

View File

@@ -465,12 +465,8 @@ typedef union
{
keccak256_2x64_context keccak;
cubehashParam cube;
//#if defined(__x86_64__)
skein256_2x64_context skein;
//#else
// sph_skein512_context skein;
//#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
//#if defined(__x86_64__)
intrlv_2x64( vhashA, hash0, hash1, 256 );
skein256_2x64_init( &ctx.skein );
skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
skein256_2x64_update( &ctx.skein, vhashA, 32 );
skein256_2x64_close( &ctx.skein, vhashA );
dintrlv_2x64( hash2, hash3, vhashA, 256 );
/*
#else
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash0, 32 );
sph_skein256_close( &ctx.skein, hash0 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash1, 32 );
sph_skein256_close( &ctx.skein, hash1 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash2, 32 );
sph_skein256_close( &ctx.skein, hash2 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash3, 32 );
sph_skein256_close( &ctx.skein, hash3 );
#endif
*/
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
groestl256_full( &ctx.groestl, hash2, hash2, 256 );

View File

@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
lyra2h_4way_midstate( vdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )

View File

@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );

View File

@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();
#endif

View File

@@ -4,367 +4,273 @@
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake1, blake2;
sph_bmw512_context bmw1, bmw2, bmw3;
sph_skein512_context skein1, skein2;
sph_jh512_context jh1, jh2;
sph_keccak512_context keccak1, keccak2;
hashState_luffa luffa1, luffa2;
cubehashParam cube;
sph_shavite512_context shavite1, shavite2;
#if defined(__aarch64__)
sph_simd512_context simd1, simd2;
#else
hashState_sd simd1, simd2;
#endif
sph_hamsi512_context hamsi1;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
sph_sha512_context sha1, sha2;
sph_haval256_5_context haval1, haval2;
#if defined(__AES__)
hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2;
hashState_fugue fugue1, fugue2;
#else
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
sph_fugue512_context fugue1, fugue2;
#endif
} hmq1725_ctx_holder;
static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
void init_hmq1725_ctx()
union _hmq1725_ctx_holder
{
sph_blake512_init(&hmq1725_ctx.blake1);
sph_blake512_init(&hmq1725_ctx.blake2);
sph_bmw512_init(&hmq1725_ctx.bmw1);
sph_bmw512_init(&hmq1725_ctx.bmw2);
sph_bmw512_init(&hmq1725_ctx.bmw3);
sph_skein512_init(&hmq1725_ctx.skein1);
sph_skein512_init(&hmq1725_ctx.skein2);
sph_jh512_init(&hmq1725_ctx.jh1);
sph_jh512_init(&hmq1725_ctx.jh2);
sph_keccak512_init(&hmq1725_ctx.keccak1);
sph_keccak512_init(&hmq1725_ctx.keccak2);
init_luffa( &hmq1725_ctx.luffa1, 512 );
init_luffa( &hmq1725_ctx.luffa2, 512 );
cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
sph_shavite512_init(&hmq1725_ctx.shavite1);
sph_shavite512_init(&hmq1725_ctx.shavite2);
#if defined(__aarch64__)
sph_simd512_init(&hmq1725_ctx.simd1);
sph_simd512_init(&hmq1725_ctx.simd2);
#else
init_sd( &hmq1725_ctx.simd1, 512 );
init_sd( &hmq1725_ctx.simd2, 512 );
#endif
sph_hamsi512_init(&hmq1725_ctx.hamsi1);
#if defined(__AES__)
fugue512_Init( &hmq1725_ctx.fugue1, 512 );
fugue512_Init( &hmq1725_ctx.fugue2, 512 );
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_init(&hmq1725_ctx.fugue1);
sph_fugue512_init(&hmq1725_ctx.fugue2);
sph_fugue512_context fugue;
#endif
sph_shabal512_init(&hmq1725_ctx.shabal1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
sph_sha512_init( &hmq1725_ctx.sha1 );
sph_sha512_init( &hmq1725_ctx.sha2 );
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
#if defined(__AES__)
init_echo( &hmq1725_ctx.echo1, 512 );
init_echo( &hmq1725_ctx.echo2, 512 );
init_groestl( &hmq1725_ctx.groestl1, 64 );
init_groestl( &hmq1725_ctx.groestl2, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
}
void hmq_bmw512_midstate( const void* input )
{
memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
sph_bmw512( &hmq_bmw_mid, input, 64 );
}
__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha;
sph_haval256_5_context haval;
};
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
uint32_t hashA[32] __attribute__((aligned(32)));
uint32_t hashB[32] __attribute__((aligned(32)));
hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, input, 80 );
sph_bmw512_close( &ctx.bmw, hashA ); //1
memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
sph_bmw512_close(&h_ctx.bmw1, hashA); //1
sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0
sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //0
sph_whirlpool_close( &ctx.whirlpool, hashB ); //1
if ( hashB[0] & mask ) //1
{
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
#else
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashB, 64 ); //1
sph_groestl512_close( &ctx.groestl, hashA ); //2
#endif
}
else
{
sph_skein512 (&h_ctx.skein1, hashB, 64); //1
sph_skein512_close(&h_ctx.skein1, hashA); //2
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, hashB, 64 ); //1
sph_skein512_close( &ctx.skein, hashA ); //2
}
sph_jh512 (&h_ctx.jh1, hashA, 64); //3
sph_jh512_close(&h_ctx.jh1, hashB); //4
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashA, 64 ); //3
sph_jh512_close( &ctx.jh, hashB ); //4
sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
sph_keccak512_close(&h_ctx.keccak1, hashA); //3
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //2
sph_keccak512_close( &ctx.keccak, hashA ); //3
if ( hashA[0] & mask ) //4
{
sph_blake512 (&h_ctx.blake1, hashA, 64); //
sph_blake512_close(&h_ctx.blake1, hashB); //5
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
}
else
{
sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
sph_bmw512_close(&h_ctx.bmw2, hashB); //5
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashA, 64 ); //4
sph_bmw512_close( &ctx.bmw, hashB ); //5
}
update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
if ( hashB[0] & mask ) //7
{
sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
sph_keccak512_close(&h_ctx.keccak2, hashA); //8
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //
sph_keccak512_close( &ctx.keccak, hashA ); //8
}
else
{
sph_jh512 (&h_ctx.jh2, hashB, 64); //7
sph_jh512_close(&h_ctx.jh2, hashA); //8
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashB, 64 ); //7
sph_jh512_close( &ctx.jh, hashA ); //8
}
sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
sph_shavite512_close(&h_ctx.shavite1, hashB); //4
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashA, 64 ); //3
sph_shavite512_close( &ctx.shavite, hashB ); //4
#if defined(__aarch64__)
sph_simd512 (&h_ctx.simd1, hashB, 64); //3
sph_simd512_close(&h_ctx.simd1, hashA); //4
#else
update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#endif
simd512_ctx( &ctx.simd, hashA, hashB, 64 );
if ( hashA[0] & mask ) //4
{
sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
else
{
sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
sph_haval256_5_close(&h_ctx.haval1, hashB); //5
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //4
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset(&hashB[8], 0, 32);
}
#if defined(__AES__)
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashA, 512, hashB, 64 );
#else
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashB, 64 ); //5
sph_echo512_close( &ctx.echo, hashA ); //6
#endif
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
sph_blake512_close(&h_ctx.blake2, hashB); //7
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
if ( hashB[0] & mask ) //7
{
sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
sph_shavite512_close(&h_ctx.shavite2, hashA); //8
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashB, 64 ); //
sph_shavite512_close( &ctx.shavite, hashA ); //8
}
else
{
update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
}
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2 ////
fugue512_Final( &h_ctx.fugue1, hashA ); //3
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 ////
sph_fugue512_close(&h_ctx.fugue1, hashA); //3
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //2 ////
sph_fugue512_close( &ctx.fugue, hashA ); //3
#endif
if ( hashA[0] & mask ) //4
{
#if defined(__AES__)
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashB, 512, hashA, 64 );
#else
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashA, 64 ); //
sph_echo512_close( &ctx.echo, hashB ); //5
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&h_ctx.simd2, hashA, 64); //6
sph_simd512_close(&h_ctx.simd2, hashB); //7
#else
update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, 64 );
sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
sph_shabal512_close(&h_ctx.shabal1, hashA); //6
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hashB, 64 ); //5
sph_shabal512_close( &ctx.shabal, hashA ); //6
sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
sph_whirlpool_close( &ctx.whirlpool, hashB ); //7
if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
fugue512_Final( &h_ctx.fugue2, hashA ); //8
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
sph_fugue512_close(&h_ctx.fugue2, hashA); //8
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //
sph_fugue512_close( &ctx.fugue, hashA ); //8
#endif
}
else
{
sph_sha512( &h_ctx.sha1, hashB, 64 );
sph_sha512_close( &h_ctx.sha1, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
}
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
(const char*)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
#else
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashA, 64 ); //3
sph_groestl512_close( &ctx.groestl, hashB ); //4
#endif
sph_sha512( &h_ctx.sha2, hashB, 64 );
sph_sha512_close( &h_ctx.sha2, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
if ( hashA[0] & mask ) //4
{
sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
sph_haval256_5_close(&h_ctx.haval2, hashB); //5
memset(&hashB[8], 0, 32);
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset( &hashB[8], 0, 32 );
}
else
{
sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
sph_bmw512_close(&h_ctx.bmw3, hashA); //6
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashB, 64 ); //5
sph_bmw512_close( &ctx.bmw, hashA ); //6
memcpy(state, hashA, 32);
memcpy( state, hashA, 32 );
}
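The rewritten hmq1725hash replaces the pre-initialized global context set (and the BMW midstate shortcut) with one on-stack union: only one primitive runs at a time, so overlapping the contexts keeps the working set to the size of the largest member rather than the sum of all of them, and each stage is initialized immediately before use. The footprint difference in miniature, with made-up sizes:
#include <stdio.h>
// Only one member is live at a time, so a union costs max(sizeof members)
// while a struct of the same members costs their sum.
union  ctx_union  { char blake[256]; char groestl[512]; char simd[1024]; };
struct ctx_struct { char blake[256]; char groestl[512]; char simd[1024]; };
int main( void )
{
   printf( "union %zu vs struct %zu bytes\n",
           sizeof( union ctx_union ), sizeof( struct ctx_struct ) );
   return 0;
}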
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
// we need big-endian data...
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
hmq_bmw512_midstate( endiandata );
// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }
/* Too lazy to put the loop in an inline function, so it's a dirty copy-and-paste. */
/* A variable could be set instead, but it's unclear how the compiler would optimize that, and the CPU shouldn't have to reload the value into a register every time. */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;

View File

@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -7,12 +7,12 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
void quark_hash(void *state, const void *input)
{
uint32_t hash[16] __attribute__((aligned(64)));
sph_blake512_context ctx_blake;
blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl ctx_groestl;
#else
sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
sph_keccak512_context ctx_keccak;
uint32_t mask = 8;
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, input, 80 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, input, 80 );
sph_bmw512_init( &ctx_bmw );
sph_bmw512( &ctx_bmw, hash, 64 );
sph_bmw512_close( &ctx_bmw, hash );
if ( hash[0] & mask )
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
sph_skein512_close( &ctx_skein, hash );
}
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
if ( hash[0] & mask )
{
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, hash, 64 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, hash, 64 );
}
else
{

View File

@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -8,13 +8,9 @@
#include <stdio.h>
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
#ifdef __AES__
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
init_luffa(&qubit_ctx.luffa,512);
cubehashInit(&qubit_ctx.cubehash,512,16,32);
sph_shavite512_init(&qubit_ctx.shavite);
#if defined(__aarch64__)
sph_simd512_init( &qubit_ctx.simd );
#else
init_sd( &qubit_ctx.simd, 512 );
#endif
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_echo(&qubit_ctx.echo, 512);
#else
sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
#ifdef __AES__
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, 512 );
#else

View File

@@ -35,20 +35,20 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z )
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) )
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
a = _mm_add_epi32( v128_rol32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi64x( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
c = v128_rol32( c, 10 );\
} while (0)
#define ROUND1(a, b, c, d, e, f, s, r, k) \

View File

@@ -319,7 +319,7 @@ int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
v128_t W[16]; memcpy_128( W, data, 16 );
v128_t W[16]; v128_memcpy( W, data, 16 );
A = v128_load( state_in );
B = v128_load( state_in+1 );

View File

@@ -587,8 +587,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );

View File

@@ -5,11 +5,11 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#define SHA512256D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA512256D_2WAY 1
#define SHA512256D_2WAY 1
#endif
#if defined(SHA512256D_8WAY)
@@ -110,14 +110,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = v256_64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
casti_m256i( vdata,9 ) = mm256_intrlv_blend_32( _mm256_set_epi32(
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
@@ -138,7 +137,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
casti_m256i( vdata,9 ) = _mm256_add_epi32( casti_m256i( vdata,9 ), four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -180,11 +179,10 @@ int scanhash_sha512256d_2x64( struct work *work, uint32_t max_nonce,
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
const v128_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_add32( v128_set32( 1, 0, 0, 0 ), *noncev );
// *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
@@ -279,7 +277,7 @@ int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)

View File

@@ -34,8 +34,6 @@
#include <string.h>
#include "shabal-hash-4way.h"
//#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define DECL_STATE16 \
@@ -47,8 +45,6 @@
C8, C9, CA, CB, CC, CD, CE, CF; \
__m512i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m512i FIVE = v512_32( 5 ); \
const __m512i THREE = v512_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE16(state) do \
@@ -292,11 +288,21 @@ do { \
mm512_swap1024_512( BF, CF ); \
} while (0)
static inline __m512i v512_mult_x3( const __m512i x )
{
return _mm512_add_epi32( x, _mm512_slli_epi32( x, 1 ) );
}
static inline __m512i v512_mult_x5( const __m512i x )
{
return _mm512_add_epi32( x, _mm512_slli_epi32( x, 2 ) );
}
#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
_mm512_mullo_epi32( mm512_xor3( xa0, xc, \
_mm512_mullo_epi32( mm512_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
v512_mult_x3( mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
} while (0)
@@ -644,8 +650,6 @@ shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = v256_32( 5 ); \
const __m256i THREE = v256_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE8(state) do \
@@ -889,11 +893,21 @@ do { \
mm256_swap512_256( BF, CF ); \
} while (0)
static inline __m256i v256_mult_x3( const __m256i x )
{
return _mm256_add_epi32( x, _mm256_slli_epi32( x, 1 ) );
}
static inline __m256i v256_mult_x5( const __m256i x )
{
return _mm256_add_epi32( x, _mm256_slli_epi32( x, 2 ) );
}
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
v256_mult_x3( mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
@@ -1226,15 +1240,13 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // AVX2
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON)
#define DECL_STATE \
v128u32_t A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB; \
v128u32_t B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \
v128u32_t C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \
v128u32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \
const v128u32_t FIVE = v128_32( 5 ); \
const v128u32_t THREE = v128_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE( state ) \
@@ -1479,12 +1491,22 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
v128_swap256_128( BF, CF ); \
}
static inline v128_t v128_mult_x3( const v128_t x )
{
return v128_add32( x, v128_sl32( x, 1 ) );
}
static inline v128_t v128_mult_x5( const v128_t x )
{
return v128_add32( x, v128_sl32( x, 2 ) );
}
#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
{ \
xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
v128_mul32( v128_xor3( xa0, xc, \
v128_mul32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
v128_mult_x3( v128_xor3( xa0, xc, \
v128_mult_x5( v128_rol32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
}
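PERM_ELT's multiplications by the constants 3 and 5 are strength-reduced to shift-and-add ( 3x = x + 2x, 5x = x + 4x ). Besides being cheap everywhere, this removes the 32-bit vector multiply (SSE4.1's _mm_mullo_epi32 on x86), which is what lets this path drop its requirement from SSE4.1 to plain SSE2 as in the #if change above. The identity in scalar form:
#include <stdint.h>
// Strength reduction used by v128_mult_x3 / v128_mult_x5 (mod 2^32):
//   3*x = x + ( x << 1 ),   5*x = x + ( x << 2 )
static inline uint32_t mult_x3( uint32_t x ) { return x + ( x << 1 ); }
static inline uint32_t mult_x5( uint32_t x ) { return x + ( x << 2 ); }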

View File

@@ -62,7 +62,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
#endif
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct {
v128_t buf[16] __attribute__ ((aligned (64)));

View File

@@ -71,7 +71,7 @@ static const uint32_t IV512[] =
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
const __m128i zero = _mm_setzero_si128();
const v128_t zero = v128_zero;
__m256i p0, p1, p2, p3, x;
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
@@ -278,7 +278,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
void shavite512_2way_init( shavite512_2way_context *ctx )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -358,7 +358,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -434,7 +434,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -451,7 +451,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -524,7 +524,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -303,7 +303,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -379,7 +379,7 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -470,7 +470,7 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -159,4 +159,69 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
static __thread skein512_2x64_context skein512_2x64_ctx
__attribute__ ((aligned (64)));
void skeinhash_2x64( void *state, const void *input )
{
uint64_t vhash64[8*2] __attribute__ ((aligned (32)));
uint32_t hash0[16] __attribute__ ((aligned (32)));
uint32_t hash1[16] __attribute__ ((aligned (32)));
skein512_2x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_2x64_ctx, sizeof( ctx_skein ) );
skein512_2x64_final16( &ctx_skein, vhash64, input + (64*2) );
dintrlv_2x64( hash0, hash1, vhash64, 512 );
sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );
intrlv_2x32( state, hash0, hash1, 256 );
}
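skeinhash_2x64 works on two lanes interleaved at 64-bit granularity: the vectorized Skein stage produces vhash64 with lane 0 and lane 1 words alternating, dintrlv_2x64 splits them apart for the scalar SHA-256 stage, and intrlv_2x32 re-interleaves the 32-bit results for the scanner. A hedged reference sketch of what a 2x64 interleave does; the real intrlv_2x64()/dintrlv_2x64() helpers live elsewhere in the tree and may differ in signature:

#include <stddef.h>
#include <stdint.h>

// Reference 2-way 64-bit interleave: the destination alternates one word from
// each lane, so SIMD code can process both hashes with one 128-bit vector.
static void intrlv_2x64_ref( uint64_t *dst, const uint64_t *lane0,
                             const uint64_t *lane1, size_t bits )
{
   for ( size_t i = 0; i < bits / 64; i++ )
   {
      dst[ 2*i     ] = lane0[i];
      dst[ 2*i + 1 ] = lane1[i];
   }
}

// Reference de-interleave: recover the two independent hashes.
static void dintrlv_2x64_ref( uint64_t *lane0, uint64_t *lane1,
                              const uint64_t *src, size_t bits )
{
   for ( size_t i = 0; i < bits / 64; i++ )
   {
      lane0[i] = src[ 2*i     ];
      lane1[i] = src[ 2*i + 1 ];
   }
}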
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*2] __attribute__ ((aligned (32)));
uint32_t hash[8*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<1]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128u32_t *noncev = (v128u32_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &skein512_2x64_ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skeinhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( ( hash_d7[ lane ] <= targ_d7 ) && !bench ) )
{
extr_lane_2x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
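In scanhash_skein_2x64 above, the v128_set32/v128_intrlv_blend_32 lines place each lane's nonce in the upper 32-bit half of 64-bit header word 9 of the interleaved data, so adding the constant 0x0000000200000000 to each lane advances both nonces by 2 without touching the adjacent header word. A small scalar illustration of that arithmetic; the values are made up:

#include <assert.h>
#include <stdint.h>

static void demo_nonce_step( void )
{
   uint32_t n = 0x00001000u;                              // current nonce
   uint64_t word9 = ( (uint64_t)n << 32 ) | 0x5f5e100fu;  // nonce | other header data
   word9 += 0x0000000200000000ULL;                        // same constant as the scanner
   assert( (uint32_t)( word9 >> 32 ) == n + 2 );          // nonce advanced by 2
   assert( (uint32_t) word9 == 0x5f5e100fu );             // low half untouched
}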

View File

@@ -3,16 +3,20 @@
bool register_skein_algo( algo_gate_t* gate )
{
#if defined (SKEIN_8WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined (SKEIN_4WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#elif defined(SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#elif defined(SKEIN_2WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_2x64;
gate->hash = (void*)&skeinhash_2x64;
#else
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif
@@ -21,16 +25,15 @@ bool register_skein_algo( algo_gate_t* gate )
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined (SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein2_8way;
gate->hash = (void*)&skein2hash_8way;
#elif defined (SKEIN_4WAY)
#elif defined(SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#elif defined(SKEIN_2WAY)
gate->scanhash = (void*)&scanhash_skein2_2x64;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
return true;
};

View File

@@ -7,6 +7,8 @@
#define SKEIN_8WAY 1
#elif defined(__AVX2__)
#define SKEIN_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SKEIN_2WAY 1
#endif
#if defined(SKEIN_8WAY)
@@ -29,6 +31,16 @@ void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#elif defined(SKEIN_2WAY)
void skeinhash_2x64( void *output, const void *input );
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void skein2hash_2x64( void *output, const void *input );
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#else
void skeinhash( void *output, const void *input );

View File

@@ -675,11 +675,13 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
}
memset_zero_512( buf, buf_size >> 3 );
bcount = 0;
@@ -970,11 +972,13 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
}
memset_zero_256( buf, buf_size >> 3 );
bcount = 0;
@@ -1364,11 +1368,13 @@ skein512_2x64_full( skein512_2x64_context *sc, void *out, const void *data,
// Close
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
if ( ptr )
{
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
}
v128_memset_zero( buf, buf_size >> 3 );
bcount = 0;

View File

@@ -5,19 +5,6 @@
#if defined(SKEIN_8WAY)
static __thread skein512_8way_context skein512_8way_ctx
__attribute__ ((aligned (64)));
void skein2hash_8way( void *output, const void *input )
{
uint64_t hash[16*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx;
memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) );
skein512_8way_final16( &ctx, hash, input + (64*8) );
skein512_8way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -68,19 +55,6 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
#elif defined(SKEIN_4WAY)
static __thread skein512_4way_context skein512_4way_ctx
__attribute__ ((aligned (64)));
void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) );
uint64_t hash[16*4] __attribute__ ((aligned (64)));
skein512_4way_final16( &ctx, hash, input + (64*4) );
skein512_4way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -128,4 +102,53 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
skein512_2x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skein512_2x64_final16( &ctx, hash, vdata + (16*2) );
skein512_2x64_full( &ctx, hash, hash, 64 );
for ( int lane = 0; lane < 2; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, two );
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -1,369 +0,0 @@
#include "Swifftx_sha3.h"
extern "C" {
#include "SWIFFTX.h"
}
#include <math.h>
#include <stdlib.h>
#include <string.h>
// The default salt value.
// This is the expansion of e (Euler's number) - the 19 digits after 2.71:
// 8281828459045235360.
// The above in base 256, from MSB to LSB:
BitSequence SWIF_saltValueChar[SWIF_HAIFA_SALT_SIZE] = {114, 238, 247, 26, 192, 28, 170, 160};
// All the IVs here below were produced from the decimal digits of e's expansion.
// The code can be found in 'ProduceRandomIV.c'.
// The initial value for 224 digest size.
const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{37, 242, 132, 2, 167, 81, 158, 237, 113, 77, 162, 60, 65, 236, 108, 246,
101, 72, 190, 109, 58, 205, 99, 6, 114, 169, 104, 114, 38, 146, 121, 142,
59, 98, 233, 84, 72, 227, 22, 199, 17, 102, 198, 145, 24, 178, 37, 1,
215, 245, 66, 120, 230, 193, 113, 253, 165, 218, 66, 134, 49, 231, 124, 204,
0};
// The initial value for 256 digest size.
const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{250, 50, 42, 40, 14, 233, 53, 48, 227, 42, 237, 187, 211, 120, 209, 234,
27, 144, 4, 61, 243, 244, 29, 247, 37, 162, 70, 11, 231, 196, 53, 6,
193, 240, 94, 126, 204, 132, 104, 46, 114, 29, 3, 104, 118, 184, 201, 3,
57, 77, 91, 101, 31, 155, 84, 199, 228, 39, 198, 42, 248, 198, 201, 178,
8};
// The initial value for 384 digest size.
const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{40, 145, 193, 100, 205, 171, 47, 76, 254, 10, 196, 41, 165, 207, 200, 79,
109, 13, 75, 201, 17, 172, 64, 162, 217, 22, 88, 39, 51, 30, 220, 151,
133, 73, 216, 233, 184, 203, 77, 0, 248, 13, 28, 199, 30, 147, 232, 242,
227, 124, 169, 174, 14, 45, 27, 87, 254, 73, 68, 136, 135, 159, 83, 152,
0};
// The initial value for 512 digest size.
const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{195, 126, 197, 167, 157, 114, 99, 126, 208, 105, 200, 90, 71, 195, 144, 138,
142, 122, 123, 116, 24, 214, 168, 173, 203, 183, 194, 210, 102, 117, 138, 42,
114, 118, 132, 33, 35, 149, 143, 163, 163, 183, 243, 175, 72, 22, 201, 255,
102, 243, 22, 187, 211, 167, 239, 76, 164, 70, 80, 182, 181, 212, 9, 185,
0};
///////////////////////////////////////////////////////////////////////////////////////////////
// NIST API implementation portion.
///////////////////////////////////////////////////////////////////////////////////////////////
int Swifftx::Init(int hashbitlen)
{
switch(hashbitlen)
{
case 224:
swifftxState.hashbitlen = hashbitlen;
// Initializes h_0 in HAIFA:
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_224, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 256:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_256, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 384:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_384, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 512:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_512, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
default:
return BAD_HASHBITLEN;
}
swifftxState.wasUpdated = false;
swifftxState.remainingSize = 0;
memset(swifftxState.remaining, 0, SWIF_HAIFA_INPUT_BLOCK_SIZE);
memset(swifftxState.numOfBitsChar, 0, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// Initialize the salt with the default value.
memcpy(swifftxState.salt, SWIF_saltValueChar, SWIF_HAIFA_SALT_SIZE);
InitializeSWIFFTX();
return SUCCESS;
}
int Swifftx::Update(const BitSequence *data, DataLength databitlen)
{
// The size of input in bytes after putting the remaining data from previous invocation.
int sizeOfInputAfterRemaining = 0;
// The input block to compression function of SWIFFTX:
BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
// Whether we handled a single block.
bool wasSingleBlockHandled = false;
swifftxState.wasUpdated = true;
// Handle an empty message as required by NIST. Since 'Final()' is oblivious to the input
// (but of course uses the output of the compression function from the previous round,
// which is called h_{i-1} in the HAIFA article), we have to do nothing here.
if (databitlen == 0)
return SUCCESS;
// If a previous input had an unaligned length, return an error
if (swifftxState.remainingSize % 8)
{
return INPUT_DATA_NOT_ALIGNED;
}
// Convert remaining size to bytes.
swifftxState.remainingSize /= 8;
// As long as we have enough data combined from (remaining + data) to fill an input block
// ROUND SETTING
while (((databitlen / 8) + swifftxState.remainingSize) >= SWIF_HAIFA_INPUT_BLOCK_SIZE)
{
// Fill the input block with data:
// 1. The output of the previous block:
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2. The input part of the block:
// 2a. The remaining data from the previous 'Update()' call:
if (swifftxState.remainingSize)
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining,
swifftxState.remainingSize);
// 2b. The input data that we have place for after the 'remaining':
sizeOfInputAfterRemaining = SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE
- ((int) swifftxState.remainingSize) - SWIF_HAIFA_NUM_OF_BITS_SIZE
- SWIF_HAIFA_SALT_SIZE;
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize,
data, sizeOfInputAfterRemaining);
// 3. The #bits part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize
+ sizeOfInputAfterRemaining,
swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize
+ sizeOfInputAfterRemaining + SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt, SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, false);
// Update the #bits field with SWIF_HAIFA_INPUT_BLOCK_SIZE.
AddToCurrInBase256(swifftxState.numOfBitsChar, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
wasSingleBlockHandled = true;
data += sizeOfInputAfterRemaining;
databitlen -= (sizeOfInputAfterRemaining * 8);
swifftxState.remainingSize = 0;
}
// Update the swifftxState.remaining and swifftxState.remainingSize.
// remainingSize will be in bits after exiting 'Update()'.
if (wasSingleBlockHandled)
{
swifftxState.remainingSize = (unsigned int) databitlen; // now remaining size is in bits.
if (swifftxState.remainingSize)
memcpy(swifftxState.remaining, data, (swifftxState.remainingSize + 7) / 8);
}
else
{
memcpy(swifftxState.remaining + swifftxState.remainingSize, data,
(size_t) (databitlen + 7) / 8);
swifftxState.remainingSize = (swifftxState.remainingSize * 8) + (unsigned short) databitlen;
}
return SUCCESS;
}
int Swifftx::Final(BitSequence *hashval)
{
int i;
// Whether to add one last block. True if the padding appended to the last block overflows
// the block size.
bool toAddFinalBlock = false;
bool toPutOneInFinalBlock = false;
unsigned short oneShift = 0;
// The size of the last input block before the zero padding. We add 1 here because we
// include the final '1' bit in the calculation, and 7 because we round the length up to bytes.
unsigned short sizeOfLastInputBlock = (swifftxState.remainingSize + 1 + 7) / 8;
// The number of bytes of zero in the padding part.
// The padding contains:
// 1. A single 1 bit.
// 2. As many zeroes as needed.
// 3. The message length in bits. Occupies SWIF_HAIFA_NUM_OF_BITS_SIZE bytes.
// 4. The digest size. Maximum is 512, so we need 2 bytes.
// If the total number achieved is negative, add an additional block, as HAIFA specifies.
short numOfZeroBytesInPadding = (short) SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE
- sizeOfLastInputBlock - (2 * SWIF_HAIFA_NUM_OF_BITS_SIZE) - 2
- SWIF_HAIFA_SALT_SIZE;
// The input block to compression function of SWIFFTX:
BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
// The message length in base 256.
BitSequence messageLengthChar[SWIF_HAIFA_NUM_OF_BITS_SIZE] = {0};
// The digest size used for padding:
unsigned char digestSizeLSB = swifftxState.hashbitlen % 256;
unsigned char digestSizeMSB = (swifftxState.hashbitlen - digestSizeLSB) / 256;
if (numOfZeroBytesInPadding < 1)
toAddFinalBlock = true;
// Fill the input block with data:
// 1. The output of the previous block:
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2a. The input part of the block, which is the remaining data from the previous 'Update()'
// call, if it exists, and an extra '1' bit (maybe all we have is this extra 1):
// Add the last 1 in big-endian convention ...
if (swifftxState.remainingSize % 8 == 0)
{
swifftxState.remaining[sizeOfLastInputBlock - 1] = 0x80;
}
else
{
swifftxState.remaining[sizeOfLastInputBlock - 1] |= (1 << (7 - (swifftxState.remainingSize % 8)));
}
if (sizeOfLastInputBlock)
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining,
sizeOfLastInputBlock);
// Compute the message length in base 256:
for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
messageLengthChar[i] = swifftxState.numOfBitsChar[i];
if (sizeOfLastInputBlock)
AddToCurrInBase256(messageLengthChar, sizeOfLastInputBlock * 8);
if (!toAddFinalBlock)
{
// 2b. Put the zeroes:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
0, numOfZeroBytesInPadding);
// 2c. Pad the message length:
for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + i] = messageLengthChar[i];
// 2d. Pad the digest size:
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE] = digestSizeMSB;
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE + 1] = digestSizeLSB;
}
else
{
// 2b. Put the zeroes, if at all:
if ((SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock) > 0)
{
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
0, SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock);
}
}
// 3. The #bits part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
+ SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt,
SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, !toAddFinalBlock);
// If we have to add one more block, it is now:
if (toAddFinalBlock)
{
// 1. The previous output block, as usual.
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2a. Instead of the input, zeroes:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE , 0,
SWIF_HAIFA_INPUT_BLOCK_SIZE - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2);
// 2b. Instead of the input, the message length:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
- SWIF_HAIFA_NUM_OF_BITS_SIZE - 2,
messageLengthChar,
SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 2c. Instead of the input, the digest size:
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 2] = digestSizeMSB;
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 1] = digestSizeLSB;
// 3. The #bits part of the block, which is zero in case of additional block:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
0,
SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
+ SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt,
SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, true);
}
// Finally, copy the result into 'hashval'. In case the digest size is not 512 bits, copy only the
// first hashbitlen bits:
for (i = 0; i < (swifftxState.hashbitlen / 8); ++i)
hashval[i] = swifftxState.currOutputBlock[i];
return SUCCESS;
}
int Swifftx::Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
BitSequence *hashval)
{
int result;
//hashState state;
// The pointer to the current place in the input we take into the compression function.
DataLength currInputIndex = 0;
result = Swifftx::Init(hashbitlen);
if (result != SUCCESS)
return result;
for ( ; (databitlen / 8) > SWIF_HAIFA_INPUT_BLOCK_SIZE;
currInputIndex += SWIF_HAIFA_INPUT_BLOCK_SIZE, databitlen -= (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8))
{
result = Swifftx::Update(data + currInputIndex, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
if (result != SUCCESS)
return result;
}
// The length of the last block may be shorter than (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8)
result = Swifftx::Update(data + currInputIndex, databitlen);
if (result != SUCCESS)
{
return result;
}
return Swifftx::Final(hashval);
}
///////////////////////////////////////////////////////////////////////////////////////////////
// Helper function implementation portion.
///////////////////////////////////////////////////////////////////////////////////////////////
void Swifftx::AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE],
unsigned short toAdd)
{
unsigned char remainder = 0;
short i;
BitSequence currValueInBase256[8] = {0};
unsigned short currIndex = 7;
unsigned short temp = 0;
do
{
remainder = toAdd % 256;
currValueInBase256[currIndex--] = remainder;
toAdd -= remainder;
toAdd /= 256;
}
while(toAdd != 0);
for (i = 7; i >= 0; --i)
{
temp = value[i] + currValueInBase256[i];
if (temp > 255)
{
value[i] = temp % 256;
currValueInBase256[i - 1]++;
}
else
value[i] = (unsigned char) temp;
}
}

View File

@@ -1,79 +0,0 @@
#ifndef SWIFFTX_SHA3_H
#define SWIFFTX_SHA3_H
#include "sha3_interface.h"
#include "stdbool.h"
#include "stdint.h"
class Swifftx : public SHA3 {
#define SWIFFTX_INPUT_BLOCK_SIZE 256
#define SWIFFTX_OUTPUT_BLOCK_SIZE 65
#define SWIF_HAIFA_SALT_SIZE 8
#define SWIF_HAIFA_NUM_OF_BITS_SIZE 8
#define SWIF_HAIFA_INPUT_BLOCK_SIZE (SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE \
- SWIF_HAIFA_NUM_OF_BITS_SIZE - SWIF_HAIFA_SALT_SIZE)
typedef unsigned char BitSequence;
//const DataLength SWIF_SALT_VALUE;
#define SWIF_HAIFA_IV 0
/*const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE];*/
typedef enum
{
SUCCESS = 0,
FAIL = 1,
BAD_HASHBITLEN = 2,
BAD_SALT_SIZE = 3,
SET_SALT_VALUE_FAILED = 4,
INPUT_DATA_NOT_ALIGNED = 5
} HashReturn;
typedef struct hashState {
unsigned short hashbitlen;
// The data remaining after the most recent call to 'Update()'.
BitSequence remaining[SWIF_HAIFA_INPUT_BLOCK_SIZE + 1];
// The size of the remaining data in bits.
// Is 0 in case there is no remaining data at all.
unsigned int remainingSize;
// The current output of the compression function. At the end it will contain the final digest
// (which may need to be truncated, depending on hashbitlen).
BitSequence currOutputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE];
// The value of '#bits hashed so far' field in HAIFA, in base 256.
BitSequence numOfBitsChar[SWIF_HAIFA_NUM_OF_BITS_SIZE];
// The salt value currently in use:
BitSequence salt[SWIF_HAIFA_SALT_SIZE];
// Indicates whether a single 'Update()' occurred.
// After a call to 'Update()' the key and the salt values cannot be changed.
bool wasUpdated;
} hashState;
private:
int swifftxNumRounds;
hashState swifftxState;
public:
int Init(int hashbitlen);
int Update(const BitSequence *data, DataLength databitlen);
int Final(BitSequence *hashval);
int Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
BitSequence *hashval);
private:
static void AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], unsigned short toAdd);
};
#endif

View File

@@ -1,21 +0,0 @@
#pragma once
#include <cstdint>
namespace hash {
using BitSequence = unsigned char;
using DataLength = unsigned long long;
struct hash_interface {
virtual ~hash_interface() = default;
virtual int Init(int hash_bitsize) = 0;
virtual int Update(const BitSequence *data, DataLength data_bitsize) = 0;
virtual int Final(BitSequence *hash) = 0;
virtual int
Hash(int hash_bitsize, const BitSequence *data, DataLength data_bitsize, BitSequence *hash) = 0;
};
} // namespace hash

View File

@@ -1,14 +0,0 @@
#pragma once
#include <cstdint>
//#include <streams/hash/hash_interface.h>
#include "hash_interface.h"
namespace sha3 {
using BitSequence = hash::BitSequence;
using DataLength = hash::DataLength;
struct sha3_interface : hash::hash_interface {};
} // namespace sha3

View File

@@ -191,7 +191,7 @@ static void rotate_indexes( uint32_t *p )
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#define MULXOR \
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -251,7 +251,7 @@ void verthash_hash( const void *blob_bytes, const size_t blob_size,
/ VH_BYTE_ALIGNMENT ) + 1;
#if defined (__AVX2__)
const __m256i k = _mm256_set1_epi32( 0x1000193 );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
const v128u32_t k = v128_32( 0x1000193 );
#endif

View File

@@ -129,7 +129,7 @@ bool register_verthash_algo( algo_gate_t* gate )
{
opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT | NEON_OPT;
const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;

View File

@@ -506,4 +506,156 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X11GOST_2WAY)
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
union _x11gost_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
jh512_2x64_context jh;
keccak512_2x64_context keccak;
skein512_2x64_context skein;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_gost512_context gost;
};
typedef union _x11gost_context_overlay x11gost_context_overlay;
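The overlay union above works because the x11gost stages run strictly one after another, so all the per-algorithm contexts can share the same storage; the union occupies only as much space as its largest member rather than the sum of them all. A small standalone illustration of that property; the struct names and sizes are invented:

#include <stdio.h>

struct ctx_small { unsigned char state[200]; };
struct ctx_large { unsigned char state[416]; };

union stage_overlay
{
   struct ctx_small small;
   struct ctx_large large;
};

int main( void )
{
   // Prints 200 416 416: the union is only as big as its largest member.
   printf( "%zu %zu %zu\n", sizeof( struct ctx_small ),
           sizeof( struct ctx_large ), sizeof( union stage_overlay ) );
   return 0;
}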
int x11gost_2x64_hash( void *state, const void *input, int thr_id )
{
uint8_t vhash[80*2] __attribute__((aligned(64)));
uint8_t hash0[64] __attribute__((aligned(64)));
uint8_t hash1[64] __attribute__((aligned(64)));
x11gost_context_overlay ctx;
intrlv_2x64( vhash, input, input+80, 640 );
blake512_2x64_full( &ctx.blake, vhash, vhash, 80 );
bmw512_2x64_init( &ctx.bmw );
bmw512_2x64_update( &ctx.bmw, vhash, 64 );
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, hash0, 512 );
groestl512_full( &ctx.groestl, hash1, hash1, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash0, 64 );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash1, 64 );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
intrlv_2x64( vhash, hash0, hash1, 512 );
skein512_2x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_2x64_ctx( &ctx.jh, vhash, vhash, 64 );
keccak512_2x64_ctx( &ctx.keccak, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
cubehash_full( &ctx.cube, hash0, 512, hash0, 64 );
cubehash_full( &ctx.cube, hash1, 512, hash1, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
echo_full( &ctx.echo, hash1, 512, hash1, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash0, 64 );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash1, 64 );
sph_echo512_close( &ctx.echo, hash1 );
#endif
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
return 1;
}
int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*2] __attribute__((aligned(64)));
uint32_t edata[20*2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_80( edata, pdata );
memcpy( edata+20, edata, 80 );
do
{
edata[19] = n;
edata[39] = n+1;
if ( likely( x11gost_2x64_hash( hash, edata, thr_id ) ) )
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}
}
n += 2;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}
#endif

View File

@@ -2,20 +2,24 @@
bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_8WAY)
#if defined(X11GOST_8WAY)
init_x11gost_8way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_8way;
gate->hash = (void*)&x11gost_8way_hash;
#elif defined (X11GOST_4WAY)
#elif defined(X11GOST_4WAY)
init_x11gost_4way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_4way;
gate->hash = (void*)&x11gost_4way_hash;
#elif defined(X11GOST_2WAY)
gate->scanhash = (void*)&scanhash_x11gost_2x64;
gate->hash = (void*)&x11gost_2x64_hash;
#else
init_x11gost_ctx();
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -8,6 +8,8 @@
#define X11GOST_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X11GOST_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X11GOST_2WAY 1
#endif
bool register_x11gost_algo( algo_gate_t* gate );
@@ -26,6 +28,12 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x11gost_4way_ctx();
#elif defined(X11GOST_2WAY)
int x11gost_2x64_hash( void *state, const void *input, int thr_id );
int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x11gost_hash( void *state, const void *input );

View File

@@ -1,6 +1,8 @@
#include "x11gost-gate.h"
#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY)
// No longer used; it was not working when last used.
#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY) && !defined(X11GOST_2WAY)
#include <stdlib.h>
#include <stdint.h>

View File

@@ -155,13 +155,13 @@ void skunk_4way_hash( void *output, const void *input )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
cubehashUpdateDigest( &ctx.cube, hash0, hash0, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
cubehashUpdateDigest( &ctx.cube, hash1, hash1, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
cubehashUpdateDigest( &ctx.cube, hash2, hash2, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
cubehashUpdateDigest( &ctx.cube, hash3, hash3, 64 );
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
fugue512_full( &ctx.fugue, hash1, hash1, 64 );

View File

@@ -23,13 +23,12 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
*sptr = '\0';
}
static __thread x16r_context_overlay hex_ctx;
int hex_hash( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &hex_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
@@ -52,7 +51,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
@@ -87,7 +86,7 @@ int hex_hash( void* output, const void* input, int thrid )
case LUFFA:
if ( i == 0 )
{
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
}
else
{
@@ -97,7 +96,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
{
cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -108,26 +107,15 @@ int hex_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
simd512_ctx( &ctx.simd, hash, in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,
(const BitSequence *)in, size );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
@@ -216,32 +204,32 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
switch ( algo )
{
case JH:
sph_jh512_init( &hex_ctx.jh );
sph_jh512( &hex_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &hex_ctx.skein );
sph_skein512( &hex_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case LUFFA:
init_luffa( &hex_ctx.luffa, 512 );
update_luffa( &hex_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &hex_ctx.cube, 512, 16, 32 );
cubehashUpdate( &hex_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &hex_ctx.hamsi );
sph_hamsi512( &hex_ctx.hamsi, edata, 64 );
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 64 );
break;
case SHABAL:
sph_shabal512_init( &hex_ctx.shabal );
sph_shabal512( &hex_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &hex_ctx.whirlpool );
sph_whirlpool( &hex_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}

View File

@@ -11,29 +11,29 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
//#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
//#else
#else
#include "algo/echo/sph_echo.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
//#endif
#if defined(__AES__)
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/simd/nist.h"
// Config
#define MINOTAUR_ALGO_COUNT 16
@@ -47,14 +47,17 @@ typedef struct TortureGarden TortureGarden;
// Graph of hash algos plus SPH contexts
struct TortureGarden
{
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_echo512_context echo;
sph_groestl512_context groestl;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -67,11 +70,7 @@ struct TortureGarden
cubehashParam cube;
shavite512_context shavite;
hashState_luffa luffa;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -93,9 +92,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
switch ( algo )
{
case 0:
blake512_init( &garden->blake );
blake512_update( &garden->blake, input, 64 );
blake512_close( &garden->blake, hash );
blake512_full( &garden->blake, hash, input, 64 );
break;
case 1:
sph_bmw512_init( &garden->bmw );
@@ -107,7 +104,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
cubehashUpdateDigest( &garden->cube, hash, input, 64 );
break;
case 3:
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &garden->echo, hash, 512, input, 64 );
#else
sph_echo512_init( &garden->echo );
@@ -116,14 +113,14 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
#endif
break;
case 4:
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &garden->fugue, hash, input, 64 );
#else
sph_fugue512_full( &garden->fugue, hash, input, 64 );
#endif
break;
case 5:
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &garden->groestl, hash, input, 512 );
#else
sph_groestl512_init( &garden->groestl) ;
@@ -165,13 +162,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
sph_shavite512_close( &garden->shavite, hash );
break;
case 13:
#if defined(__aarch64__)
sph_simd512_init( &garden->simd );
sph_simd512( &garden->simd, input, 64);
sph_simd512_close( &garden->simd, hash );
#else
simd_full( &garden->simd, (BitSequence *)hash, input, 512 );
#endif
simd512_ctx( &garden->simd, hash, input, 64 );
break;
case 14:
sph_skein512_init( &garden->skein );

View File

@@ -19,12 +19,12 @@
// Perform a midstate prehash for hash functions with a block size <= 72 bytes,
// or 76 bytes for hash functions that operate on 32-bit data.
void x16r_8way_prehash( void *vdata, void *pdata )
void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
{
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = x16r_hash_order[0];
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -110,7 +110,8 @@ void x16r_8way_prehash( void *vdata, void *pdata )
// Called by wrapper hash function to optionally continue hashing and
// convert to final hash.
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
int x16r_8way_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (16)));
@@ -136,9 +137,9 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
for ( int i = 0; i < 16; i++ )
for ( int i = 0; i < func_count; i++ )
{
const char elem = x16r_hash_order[i];
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -474,7 +475,8 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
int x16r_8way_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*8] __attribute__ ((aligned (128)));
if ( !x16r_8way_hash_generic( hash, input, thrid ) )
if ( !x16r_8way_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
@@ -495,7 +497,6 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -508,21 +509,18 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_8way_prehash( vdata, pdata );
x16r_8way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -546,12 +544,12 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
#elif defined (X16R_4WAY)
void x16r_4way_prehash( void *vdata, void *pdata )
void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
{
uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = x16r_hash_order[0];
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -627,7 +625,8 @@ void x16r_4way_prehash( void *vdata, void *pdata )
}
}
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
int x16r_4way_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -644,9 +643,9 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
for ( int i = 0; i < 16; i++ )
for ( int i = 0; i < func_count; i++ )
{
const char elem = x16r_hash_order[i];
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -908,7 +907,8 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
int x16r_4way_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*4] __attribute__ ((aligned (64)));
if ( !x16r_4way_hash_generic( hash, input, thrid ) )
if ( !x16r_4way_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
@@ -924,7 +924,6 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -937,20 +936,18 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_4way_prehash( vdata, pdata );
x16r_4way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -971,4 +968,404 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_2WAY)
void x16r_2x64_prehash( void *vdata, void *pdata, const char *hash_order )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16r_ctx.jh );
jh512_2x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
v128_bswap32_intrlv80_2x64( vdata, pdata );
keccak512_2x64_init( &x16r_ctx.keccak );
keccak512_2x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
v128_bswap32_80( edata, pdata );
init_luffa( &x16r_ctx.luffa, 512 );
update_luffa( &x16r_ctx.luffa, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case CUBEHASH:
{
v128_bswap32_80( edata, pdata );
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16r_ctx.hamsi );
hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16r_ctx.hamsi );
sph_hamsi512( &x16r_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16r_ctx.fugue );
sph_fugue512( &x16r_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16r_ctx.shabal );
sph_shabal512( &x16r_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
case WHIRLPOOL:
v128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16r_ctx.whirlpool );
sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
}
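x16r_2x64_prehash above exploits the fact that only the last few bytes of the 80-byte block header change while scanning nonces: for functions whose block size allows it, the constant prefix is absorbed once into x16r_ctx and each nonce attempt only processes the short tail. A hedged sketch of the midstate idea using a toy hash; toy_absorb and its state are invented stand-ins, not a real primitive:

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t acc; } toy_ctx;

// Invented stand-in for an incremental hash update.
static void toy_absorb( toy_ctx *c, const uint8_t *p, size_t len )
{
   for ( size_t i = 0; i < len; i++ ) c->acc = c->acc * 31 + p[i];
}

// Done once per block template: absorb the 64 constant header bytes.
static toy_ctx make_midstate( const uint8_t header[80] )
{
   toy_ctx c = { 0 };
   toy_absorb( &c, header, 64 );
   return c;
}

// Done per nonce attempt: copy the midstate and hash only the 16-byte tail.
static uint64_t hash_attempt( const toy_ctx *midstate, const uint8_t header[80] )
{
   toy_ctx c = *midstate;
   toy_absorb( &c, header + 64, 16 );
   return c.acc;
}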
int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16r_2x64_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < func_count; i++ )
{
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
if ( i == 0 )
jh512_2x64_update( &ctx.jh, input + (64*2), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
if ( i == 0 )
keccak512_2x64_update( &ctx.keccak, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, size );
}
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
if ( i == 0 )
{
update_and_final_luffa( &ctx.luffa, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
update_and_final_luffa( &ctx.luffa, hash1, in1 + 64, 16 );
}
else
{
luffa_full( &ctx.luffa, hash0, 512, hash0, size );
luffa_full( &ctx.luffa, hash1, 512, hash1, size );
}
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
if ( i == 0 )
{
sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
}
else
{
sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, size );
}
break;
case SHA_512:
sha512_2x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_2x64_update( &ctx.sha512, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, size );
}
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
if ( work_restart[thrid].restart ) return 0;
size = 64;
}
memcpy( output, hash0, 64 );
memcpy( output+64, hash1, 64 );
return 1;
}
int x16r_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*2] __attribute__ ((aligned (64)));
if ( !x16r_2x64_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
return 1;
}
int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128_t *noncev = (v128_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -5,18 +5,21 @@ __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;
#if defined (X16R_8WAY)
#if defined(X16R_8WAY)
__thread x16r_8way_context_overlay x16r_ctx;
#elif defined (X16R_4WAY)
#elif defined(X16R_4WAY)
__thread x16r_4way_context_overlay x16r_ctx;
#elif defined(X16R_2WAY)
__thread x16r_2x64_context_overlay x16r_ctx;
#endif
__thread x16r_context_overlay x16_ctx;
__thread x16r_context_overlay x16r_ref_ctx;
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
{
@@ -52,17 +55,21 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -70,17 +77,21 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16RV2_8WAY)
#if defined(X16RV2_8WAY)
gate->scanhash = (void*)&scanhash_x16rv2_8way;
gate->hash = (void*)&x16rv2_8way_hash;
#elif defined (X16RV2_4WAY)
#elif defined(X16RV2_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#elif defined(X16RV2_2WAY)
gate->scanhash = (void*)&scanhash_x16rv2_2x64;
gate->hash = (void*)&x16rv2_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -88,17 +99,21 @@ bool register_x16rv2_algo( algo_gate_t* gate )
bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -108,7 +123,6 @@ bool register_x16s_algo( algo_gate_t* gate )
//
// X16RT
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
{
int32_t maskedTime = timeStamp & 0xffffff80;
@@ -221,34 +235,42 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 256.0;
return true;
};
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0;
return true;
@@ -262,7 +284,7 @@ bool register_hex_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&x16r_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
return true;
@@ -274,20 +296,25 @@ bool register_hex_algo( algo_gate_t* gate )
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X21S_8WAY)
gate->scanhash = (void*)&scanhash_x21s_8way;
gate->hash = (void*)&x21s_8way_hash;
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
#elif defined (X16R_4WAY)
#elif defined(X21S_4WAY)
gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
#elif defined(X21S_2WAY)
gate->scanhash = (void*)&scanhash_x21s_2x64;
gate->hash = (void*)&x21s_2x64_hash;
gate->miner_thread_init = (void*)&x21s_2x64_thread_init;
#else
gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;

View File

@@ -7,13 +7,15 @@
#include <unistd.h>
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -21,13 +23,13 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha512-hash.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#endif
#if defined (__AVX2__)
//#if defined (__AVX2__)
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
@@ -39,7 +41,7 @@
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#endif
//#endif
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
@@ -48,28 +50,41 @@
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
// X16R, X16S
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16R_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#define X16RV2_8WAY 1
#define X16RT_8WAY 1
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RV2_4WAY 1
#define X16RT_4WAY 1
#define X21S_4WAY 1
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RV2_2WAY 1
#endif
// X16RT, VEIL
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16RT_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RT_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RT_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X21S_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X21S_2WAY 1
#endif
enum x16r_Algo {
BLAKE = 0,
BMW,
@@ -134,18 +149,23 @@ union _x16r_8way_context_overlay
hashState_echo echo;
#endif
} __attribute__ ((aligned (64)));
#define _x16r_8x64_context_overlay _x16r_8way_context_overlay
typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
#define x16r_8x64_context_overlay x16r_8way_context_overlay
extern __thread x16r_8way_context_overlay x16r_ctx;
void x16r_8way_prehash( void *, void * );
int x16r_8way_hash_generic( void *, const void *, int );
void x16r_8way_prehash( void *, void *, const char * );
int x16r_8way_hash_generic( void *, const void *, int, const char*, const int );
int x16r_8way_hash( void *, const void *, int );
int scanhash_x16r_8way( struct work *, uint32_t ,
uint64_t *, struct thr_info * );
extern __thread x16r_8way_context_overlay x16r_ctx;
#define x16r_8x64_prehash x16r_8way_prehash
#define x16r_8x64_hash_generic x16r_8way_hash_generic
#define x16r_8x64_hash x16r_8way_hash
#define scanhash_x16r_8x64 scanhash_x16r_8way
#elif defined(X16R_4WAY)
@@ -167,7 +187,6 @@ union _x16r_4way_context_overlay
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
hashState_luffa luffa1;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -175,46 +194,102 @@ union _x16r_4way_context_overlay
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
} __attribute__ ((aligned (64)));
#define _x16r_4x64_context_overlay _x16r_4way_context_overlay
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
#define x16r_4x64_context_overlay x16r_4way_context_overlay
extern __thread x16r_4way_context_overlay x16r_ctx;
void x16r_4way_prehash( void *, void * );
int x16r_4way_hash_generic( void *, const void *, int );
void x16r_4way_prehash( void *, void *, const char * );
int x16r_4way_hash_generic( void *, const void *, int, const char*, const int );
int x16r_4way_hash( void *, const void *, int );
int scanhash_x16r_4way( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_4way_context_overlay x16r_ctx;
#define x16r_4x64_prehash x16r_4way_prehash
#define x16r_4x64_hash_generic x16r_4way_hash_generic
#define x16r_4x64_hash x16r_4way_hash
#define scanhash_x16r_4x64 scanhash_x16r_4way
#elif defined(X16R_2WAY)
union _x16r_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
} __attribute__ ((aligned (64)));
typedef union _x16r_2x64_context_overlay x16r_2x64_context_overlay;
void x16r_2x64_prehash( void *, void *, const char * );
int x16r_2x64_hash_generic( void *, const void *, int, const char*, const int );
int x16r_2x64_hash( void *, const void *, int );
int scanhash_x16r_2x64( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_2x64_context_overlay x16r_ctx;
#endif
// need a reference, add hooks for SSE2.
// needed for hex
union _x16r_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
hashState_sd simd;
sph_echo512_context echo;
#endif
sph_hamsi512_context hamsi;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
@@ -222,10 +297,10 @@ union _x16r_context_overlay
typedef union _x16r_context_overlay x16r_context_overlay;
extern __thread x16r_context_overlay x16_ctx;
extern __thread x16r_context_overlay x16r_ref_ctx;
void x16r_prehash( void *, void * );
int x16r_hash_generic( void *, const void *, int );
void x16r_prehash( void *, void *, const char * );
int x16r_hash_generic( void *, const void *, int, const char*, const int );
int x16r_hash( void *, const void *, int );
int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );
@@ -242,6 +317,12 @@ int x16rv2_4way_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RV2_2WAY)
int x16rv2_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
int x16rv2_hash( void *state, const void *input, int thr_id );
@@ -251,18 +332,24 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
#endif
// x16rt, veil
#if defined(X16R_8WAY)
#if defined(X16RT_8WAY)
//void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
#elif defined(X16RT_4WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RT_2WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
//void x16rt_hash( void *state, const void *input );
@@ -272,20 +359,27 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
#endif
// x21s
#if defined(X16R_8WAY)
#if defined(X21S_8WAY)
int x21s_8way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_8way_thread_init();
#elif defined(X16R_4WAY)
#elif defined(X21S_4WAY)
int x21s_4way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#elif defined(X21S_2WAY)
int x21s_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_2x64_thread_init();
#else
int x21s_hash( void *state, const void *input, int thr_id );

View File

@@ -10,55 +10,60 @@
#include <stdlib.h>
#include <string.h>
void x16r_prehash( void *edata, void *pdata )
void x16r_prehash( void *edata, void *pdata, const char *hash_order )
{
const char elem = x16r_hash_order[0];
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
sph_jh512_init( &x16_ctx.jh );
sph_jh512( &x16_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &x16_ctx.skein );
sph_skein512( &x16_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case KECCAK:
sph_keccak512_init( &x16r_ref_ctx.keccak );
sph_keccak512( &x16r_ref_ctx.keccak, edata, 72 );
break;
case LUFFA:
init_luffa( &x16_ctx.luffa, 512 );
update_luffa( &x16_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &x16_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &x16_ctx.hamsi );
sph_hamsi512( &x16_ctx.hamsi, edata, 64 );
break;
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 72 );
break;
case SHABAL:
sph_shabal512_init( &x16_ctx.shabal );
sph_shabal512( &x16_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &x16_ctx.whirlpool );
sph_whirlpool( &x16_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}
}
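// Note (added for clarity, not in the original source): within one work unit
// only the nonce (bytes 76..79 of the 80-byte header) changes, so any function
// whose block boundary falls inside the nonce-free prefix can absorb that
// prefix once here. When it is the first function in the order, the per-nonce
// hash loop then only processes the short tail (16, 8 or 4 bytes).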
int x16r_hash_generic( void* output, const void* input, int thrid )
int x16r_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t _ALIGN(128) hash[16];
uint32_t _ALIGN(32) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &x16_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
for ( int i = 0; i < 16; i++ )
for ( int i = 0; i < func_count; i++ )
{
const char elem = x16r_hash_order[i];
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -70,36 +75,41 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
sph_bmw512( &ctx.bmw, in, size );
sph_bmw512_close( &ctx.bmw, hash );
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512_close( &ctx.groestl, hash );
#endif
break;
case JH:
if ( i == 0 )
sph_jh512(&ctx.jh, in+64, 16 );
sph_jh512( &ctx.jh, in+64, 16 );
else
{
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512( &ctx.jh, in, size );
}
sph_jh512_close(&ctx.jh, hash );
sph_jh512_close( &ctx.jh, hash );
break;
case KECCAK:
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
if ( i == 0 )
sph_keccak512( &ctx.keccak, in+72, 8 );
else
{
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
}
sph_keccak512_close( &ctx.keccak, hash );
break;
case SKEIN:
if ( i == 0 )
sph_skein512(&ctx.skein, in+64, 16 );
sph_skein512( &ctx.skein, in+64, 16 );
else
{
sph_skein512_init( &ctx.skein );
@@ -109,13 +119,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case LUFFA:
if ( i == 0 )
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
else
luffa_full( &ctx.luffa, hash, 512, in, size );
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
cubehash_full( &ctx.cube, hash, 512, in, size );
break;
@@ -123,19 +133,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
sph_simd512( &ctx.simd, hash, size );
sph_simd512_close( &ctx.simd, hash );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence*)hash, 512,
(const BitSequence*)in, size );
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
@@ -144,7 +148,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
sph_hamsi512( &ctx.hamsi, in+64, 16 );
sph_hamsi512( &ctx.hamsi, in+72, 8 );
else
{
sph_hamsi512_init( &ctx.hamsi );
@@ -153,12 +157,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
sph_fugue512_full( &ctx.fugue, hash, in, size );
break;
case SHABAL:
if ( i == 0 )
sph_shabal512( &ctx.shabal, in+64, 16 );
@@ -197,7 +197,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
int x16r_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64] __attribute__ ((aligned (64)));
if ( !x16r_hash_generic( hash, input, thrid ) )
if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
@@ -207,8 +208,8 @@ int x16r_hash( void* output, const void* input, int thrid )
int scanhash_x16r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) edata[20];
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -230,7 +231,7 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_prehash( edata, pdata );
x16r_prehash( edata, pdata, x16r_hash_order );
do
{

View File

@@ -3,7 +3,7 @@
#include <stdlib.h>
#include <string.h>
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -30,12 +30,12 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x",
x16r_hash_order, bswap_32( pdata[17] ) );
}
x16r_8way_prehash( vdata, pdata );
x16r_8way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -57,7 +57,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -84,12 +84,12 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x",
x16r_hash_order, bswap_32( pdata[17] ) );
}
x16r_4way_prehash( vdata, pdata );
x16r_4way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -110,4 +110,55 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RT_2WAY)
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) timeHash[4*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x",
x16r_hash_order, bswap_32( pdata[17] ) );
}
x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( ( n < last_nonce ) && !(*restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -1,6 +1,6 @@
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RT_8WAY) && !defined(X16RT_4WAY)
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -31,7 +31,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
x16r_hash_order, swab32( pdata[17] ), timeHash );
}
x16r_prehash( edata, pdata );
x16r_prehash( edata, pdata, x16r_hash_order );
do
{

View File

@@ -395,7 +395,7 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -409,14 +409,43 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
hash7, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
fugue512_full( &ctx.fugue, hash4, in4, size );
fugue512_full( &ctx.fugue, hash5, in5, size );
fugue512_full( &ctx.fugue, hash6, in6, size );
fugue512_full( &ctx.fugue, hash7, in7, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in4 + 76, 4 );
fugue512_final( &ctx.fugue, hash4 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in5 + 76, 4 );
fugue512_final( &ctx.fugue, hash5 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in6 + 76, 4 );
fugue512_final( &ctx.fugue, hash6 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in7 + 76, 4 );
fugue512_final( &ctx.fugue, hash7 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
fugue512_full( &ctx.fugue, hash4, hash4, size );
fugue512_full( &ctx.fugue, hash5, hash5, size );
fugue512_full( &ctx.fugue, hash6, hash6, size );
fugue512_full( &ctx.fugue, hash7, hash7, size );
}
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -564,7 +593,6 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -577,19 +605,15 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -626,7 +650,14 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
@@ -824,8 +855,8 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, size );
skein512_4way_close( &ctx.skein, vhash );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case LUFFA:
@@ -945,7 +976,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -956,10 +987,27 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
}
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
@@ -1055,7 +1103,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20];
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -1068,17 +1115,15 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0fff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -1101,7 +1146,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -1112,7 +1157,13 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata32, pdata );
@@ -1151,4 +1202,450 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RV2_2WAY)
union _x16rv2_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
sph_tiger_context tiger;
} __attribute__ ((aligned (64)));
typedef union _x16rv2_2x64_context_overlay x16rv2_2x64_context_overlay;
static __thread x16rv2_2x64_context_overlay x16rv2_ctx;
// Pad the 24-byte Tiger hash to 64 bytes
static inline void padtiger512( uint32_t* hash )
{
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}
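// Note (added for clarity, not in the original source): x16rv2 differs from
// x16r by running Tiger immediately before Keccak, Luffa and SHA-512; the
// 24-byte Tiger digest is zero-padded to 64 bytes, either with the helper
// above or inline in the 2-way cases below, before being fed to those
// three functions.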
int x16rv2_2x64_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16rv2_2x64_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = x16r_hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
if ( i == 0 )
jh512_2x64_update( &ctx.jh, input + (64<<1), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, 64 );
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
break;
case SHA_512:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, 64 );
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
if ( work_restart[thrid].restart ) return 0;
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
return 1;
}
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0fff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16rv2_ctx.jh );
jh512_2x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
case SHA_512:
v128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16rv2_ctx.hamsi );
hamsi512_2x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16rv2_ctx.hamsi );
sph_hamsi512( &x16rv2_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16rv2_ctx.fugue );
sph_fugue512( &x16rv2_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16rv2_ctx.shabal );
sph_shabal512( &x16rv2_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16rv2_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -6,21 +6,15 @@
*/
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RV2_8WAY) && !defined(X16RV2_4WAY) && !defined(X16RV2_2WAY)
#include "algo/tiger/sph_tiger.h"
union _x16rv2_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -29,11 +23,7 @@ union _x16rv2_context_overlay
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -72,15 +62,9 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
@@ -117,25 +101,14 @@ int x16rv2_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512(&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
sph_hamsi512_init( &ctx.hamsi );
@@ -143,11 +116,7 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
case SHABAL:
sph_shabal512_init( &ctx.shabal );

362
algo/x16/x20r.c Normal file
View File

@@ -0,0 +1,362 @@
#include "miner.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "x16r-gate.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X20R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X20R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X20R_2WAY 1
#endif
// X20R is not what it seems. It does not permute 20 functions over 20 rounds;
// it only permutes 16 of them. The last 4 functions are victims of trying to
// fit 20 elements into the space for only 16: arithmetic overflow recycles
// the first 4 functions. Otherwise it's identical to X16R.
// Welcome to the real X20R.
#define X20R_HASH_FUNC_COUNT 20
/*
enum x20r_algo
{
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA512,
HAVAL, // Last 4 names are meaningless and not used
GOST,
RADIOGATUN,
PANAMA,
X20R_HASH_FUNC_COUNT
};
*/
static __thread char x20r_hash_order[ X20R_HASH_FUNC_COUNT + 1 ] = {0};
static void x20r_getAlgoString(const uint8_t* prevblock, char *output)
{
char *sptr = output;
for (int j = 0; j < X20R_HASH_FUNC_COUNT; j++) {
uint8_t b = (19 - j) >> 1; // 20 ascii hex chars from 10 bytes, reversed
uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
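// Illustrative sketch, not part of the original source: each algo digit above
// is a single hex nibble (0..15), so only the first 16 entries of the
// (commented-out) enum can ever be selected; the last 4 names never appear in
// an order string. The hypothetical, unbuilt example below shows the mapping
// for a prevblock filled with 0x9F bytes, which alternates SIMD ('9') and
// SHA512 ('F').
#if 0   // hypothetical example only, never compiled
static void x20r_order_example( void )
{
   uint8_t prevblock[10];
   char order[ X20R_HASH_FUNC_COUNT + 1 ];
   memset( prevblock, 0x9F, sizeof(prevblock) );
   x20r_getAlgoString( prevblock, order );
   // order now holds "9F9F9F9F9F9F9F9F9F9F": 20 rounds drawn only from the
   // first 16 hash functions.
}
#endif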
#if defined(X20R_8WAY)
int x20r_8x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*8] __attribute__ ((aligned (128)));
if ( !x16r_8x64_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
memcpy( output+64, hash+128, 32 );
memcpy( output+96, hash+192, 32 );
memcpy( output+128, hash+256, 32 );
memcpy( output+160, hash+320, 32 );
memcpy( output+192, hash+384, 32 );
memcpy( output+224, hash+448, 32 );
return 1;
}
int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
vdata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_8x64_prehash( vdata, pdata, x20r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_8x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X20R_4WAY)
int x20r_4x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*4] __attribute__ ((aligned (64)));
if ( !x16r_4x64_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
memcpy( output+64, hash+128, 32 );
memcpy( output+96, hash+192, 32 );
return 1;
}
int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
vdata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_4x64_prehash( vdata, pdata, x20r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_4x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X20R_2WAY)
int x20r_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*2] __attribute__ ((aligned (64)));
if ( !x16r_2x64_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
return 1;
}
int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128_t *noncev = (v128_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
vdata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_2x64_prehash( vdata, pdata, x20r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#else
int x20r_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64] __attribute__ ((aligned (64)));
if ( !x16r_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
return 1;
}
int scanhash_x20r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int thr_id = mythr->id;
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &( work_restart[thr_id].restart );
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
edata[1] = bswap_32( pdata[1] );
edata[2] = bswap_32( pdata[2] );
edata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&edata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_prehash( edata, pdata, x20r_hash_order );
do
{
edata[19] = nonce;
if ( x20r_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
#endif
bool register_x20r_algo( algo_gate_t* gate )
{
#if defined (X20R_8WAY)
gate->scanhash = (void*)&scanhash_x20r_8x64;
#elif defined (X20R_4WAY)
gate->scanhash = (void*)&scanhash_x20r_4x64;
#elif defined (X20R_2WAY)
gate->scanhash = (void*)&scanhash_x20r_2x64;
#else
gate->scanhash = (void*)&scanhash_x20r;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 256.0;
return true;
};
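// Usage note (illustrative): once built, the algo is selected like any other,
// e.g.  ./cpuminer -a x20r -o stratum+tcp://pool.example.com:3032 -u <wallet>
// (pool address is a placeholder). The gate above registers the widest
// scanhash variant available at compile time: 8-way with AVX512, 4-way with
// AVX2+AES, 2-way with SSE2 or NEON, otherwise the scalar reference path.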

View File

@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include "algo/haval/haval-hash-4way.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
@@ -42,7 +43,8 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
uint32_t *hash7 = (uint32_t*)( shash+448 );
x21s_8way_context_overlay ctx;
if ( !x16r_8way_hash_generic( shash, input, thrid ) )
if ( !x16r_8way_hash_generic( shash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
@@ -134,7 +136,6 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<3];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -148,20 +149,18 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_8way_prehash( vdata, pdata );
x16r_8way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -223,7 +222,8 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
uint32_t *hash2 = (uint32_t*)( shash+128 );
uint32_t *hash3 = (uint32_t*)( shash+192 );
if ( !x16r_4way_hash_generic( shash, input, thrid ) )
if ( !x16r_4way_hash_generic( shash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
@@ -294,7 +294,6 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -307,20 +306,18 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_4way_prehash( vdata, pdata );
x16r_4way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -351,4 +348,117 @@ bool x21s_4way_thread_init()
return x21s_4way_matrix;
}
#elif defined (X21S_2WAY)
static __thread uint64_t* x21s_2x64_matrix;
union _x21s_2x64_context_overlay
{
sph_haval256_5_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
} __attribute__ ((aligned (64)));
typedef union _x21s_2x64_context_overlay x21s_2x64_context_overlay;
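// After the shared x16r chain, x21s finishes each of the two lanes with five
// fixed stages: Haval-256/5, Tiger, Lyra2REv2, GOST-512 and finally SHA-256.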
int x21s_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t shash[64*2] __attribute__ ((aligned (64)));
x21s_2x64_context_overlay ctx;
uint32_t *hash0 = (uint32_t*) shash;
uint32_t *hash1 = (uint32_t*)( shash+64 );
if ( !x16r_2x64_hash_generic( shash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash0, 64 );
sph_haval256_5_close( &ctx.haval, hash0 );
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash1, 64 );
sph_haval256_5_close( &ctx.haval, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash1 );
LYRA2REV2( x21s_2x64_matrix, (void*) hash0, 32, (const void*) hash0, 32,
(const void*) hash0, 32, 1, 4, 4 );
LYRA2REV2( x21s_2x64_matrix, (void*) hash1, 32, (const void*) hash1, 32,
(const void*) hash1, 32, 1, 4, 4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
sha256_full( output, hash0, 64 );
sha256_full( output+32, hash1, 64 );
return 1;
}
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x21s_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
bool x21s_2x64_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
x21s_2x64_matrix = mm_malloc( size, 64 );
return x21s_2x64_matrix;
}
#endif

View File

@@ -15,7 +15,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X21S_8WAY) && !defined(X21S_4WAY)
static __thread uint64_t* x21s_matrix;
@@ -33,7 +33,8 @@ int x21s_hash( void* output, const void* input, int thrid )
uint32_t _ALIGN(128) hash[16];
x21s_context_overlay ctx;
if ( !x16r_hash_generic( hash, input, thrid ) )
if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
sph_haval256_5_init( &ctx.haval );
@@ -84,7 +85,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_prehash( edata, pdata );
x16r_prehash( edata, pdata, x16r_hash_order );
do
{

View File

@@ -928,25 +928,24 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
#elif defined(X17_2X64)
// Need sph in some cases
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/hamsi/sph_hamsi.h"
#if !( defined(__SSE4_2__) || defined(__ARM_NEON) )
#include "algo/hamsi/sph_hamsi.h"
#endif
#include "algo/shabal/sph_shabal.h"
#include "algo/haval/sph-haval.h"
//#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
//#endif
#include "algo/fugue/sph_fugue.h"
#include "algo/fugue/sph_fugue.h"
#endif
union _x17_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
@@ -956,7 +955,7 @@ union _x17_context_overlay
#else
sph_echo512_context echo;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -967,12 +966,8 @@ union _x17_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__x86_64__)
simd512_context simd;
#else
sph_simd512_context simd;
#endif
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
@@ -1000,7 +995,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
dintrlv_2x64( hash0, hash1, vhash, 512 );
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, hash0, 512 );
groestl512_full( &ctx.groestl, hash1, hash1, 512 );
#else
@@ -1033,17 +1028,8 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
#if defined(__x86_64__)
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
#else
sph_simd512_init( &ctx.simd );
sph_simd512( &ctx.simd, hash0, 64 );
sph_simd512_close( &ctx.simd, hash0 );
sph_simd512_init( &ctx.simd );
sph_simd512( &ctx.simd, hash1, 64 );
sph_simd512_close( &ctx.simd, hash1 );
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
@@ -1057,7 +1043,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_echo512_close( &ctx.echo, hash1 );
#endif
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
#if defined(__SSE4_2__) || defined(__ARM_NEON)
intrlv_2x64( vhash, hash0, hash1, 512 );
hamsi512_2x64_ctx( &ctx.hamsi, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
@@ -1070,7 +1056,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_hamsi512_close( &ctx.hamsi, hash1 );
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
fugue512_full( &ctx.fugue, hash1, hash1, 64 );
#else
@@ -1142,14 +1128,12 @@ int scanhash_x17_2x64( struct work *work, uint32_t max_nonce,
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,0);
pdata[19] = bswap_32( n );
// pdata[19] = n;
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,1);
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}

View File

@@ -4,25 +4,24 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -39,14 +38,17 @@ union _x22i_context_overlay
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -54,11 +56,7 @@ union _x22i_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -83,10 +81,8 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_bmw512(&ctx.bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash, hash, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash, 64 );
@@ -109,26 +105,16 @@ int x22i_hash( void *output, const void *input, int thrid )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init(&ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, hash, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash, 64 );
@@ -141,7 +127,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_hamsi512(&ctx.hamsi, (const void*) hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hash, hash, 64 );
#else
sph_fugue512_init(&ctx.fugue);
@@ -161,7 +147,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_sha512( &ctx.sha512, &hash[128], 64 );
sph_sha512_close( &ctx.sha512, &hash[192] );
ComputeSingleSWIFFTX((unsigned char*)hash, (unsigned char*)hash2);
ComputeSingleSWIFFTX( (unsigned char*)hash, (unsigned char*)hash2 );
if ( work_restart[thrid].restart ) return 0;
@@ -176,7 +162,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_tiger_close(&ctx.tiger, (void*) hash2);
memset(hash, 0, 64);
LYRA2RE((void*) hash, 32, (const void*) hash2, 32, (const void*) hash2, 32, 1, 4, 4);
LYRA2RE( (void*)hash, 32, (const void*)hash2, 32, (const void*)hash2, 32, 1, 4, 4 );
sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) hash, 64);
@@ -192,8 +178,8 @@ int x22i_hash( void *output, const void *input, int thrid )
int scanhash_x22i( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];

View File

@@ -4,25 +4,24 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -42,14 +41,17 @@ union _x25x_context_overlay
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -57,11 +59,7 @@ union _x25x_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -88,10 +86,8 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64);
sph_bmw512_close(&ctx.bmw, &hash[1]);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)&hash[2],
(const char*)&hash[1], 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, &hash[1], 64 );
@@ -112,28 +108,18 @@ int x25x_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
init_luffa( &ctx.luffa, 512 );
luffa_full( &ctx.luffa, &hash[6], 512, &hash[5], 64 );
luffa_full( &ctx.luffa, (void*)&hash[6], 512, (const void*)&hash[5], 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, &hash[7], &hash[6], 64 );
cubehash_full( &ctx.cube, (void*)&hash[7], 512, (const void*)&hash[6], 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64);
sph_shavite512_close(&ctx.shavite, &hash[8]);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) &hash[8], 64);
sph_simd512_close(&ctx.simd, &hash[9] );
#else
update_final_sd( &ctx.simd, (BitSequence *)&hash[9],
(const BitSequence *)&hash[8], 512 );
#endif
simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)&hash[10],
(const BitSequence*)&hash[9], 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, &hash[9], 64 );
@@ -146,7 +132,7 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_hamsi512(&ctx.hamsi, (const void*) &hash[10], 64);
sph_hamsi512_close(&ctx.hamsi, &hash[11]);
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, &hash[12], &hash[11], 64 );
#else
sph_fugue512_init(&ctx.fugue);
@@ -227,8 +213,8 @@ int x25x_hash( void *output, const void *input, int thrid )
int scanhash_x25x( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -245,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
do
{
edata[19] = n;
if ( x25x_hash( hash64, edata, thr_id ) )
if ( x25x_hash( hash64, edata, thr_id ) );
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.8.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.15.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.8'
PACKAGE_STRING='cpuminer-opt 23.8'
PACKAGE_VERSION='23.15'
PACKAGE_STRING='cpuminer-opt 23.15'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.8 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.15 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.8:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.15:";;
esac
cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.8
cpuminer-opt configure 23.15
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.8, which was
It was created by cpuminer-opt $as_me 23.15, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.8'
VERSION='23.15'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.8, which was
This file was extended by cpuminer-opt $as_me 23.15, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 23.8
cpuminer-opt config.status 23.15
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [23.8])
AC_INIT([cpuminer-opt], [23.15])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.8.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.14.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.8'
PACKAGE_STRING='cpuminer-opt 23.8'
PACKAGE_VERSION='23.14'
PACKAGE_STRING='cpuminer-opt 23.14'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -657,8 +657,6 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
HAVE_MACOS_FALSE
HAVE_MACOS_TRUE
MINGW_FALSE
MINGW_TRUE
ARCH_ARM_FALSE
@@ -1362,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.8 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1434,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.8:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.14:";;
esac
cat <<\_ACEOF
@@ -1540,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.8
cpuminer-opt configure 23.14
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1987,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.8, which was
It was created by cpuminer-opt $as_me 23.14, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3595,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.8'
VERSION='23.14'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6881,14 +6879,6 @@ else
MINGW_FALSE=
fi
if test "x$OS" = "xAPPLE"; then
HAVE_MACOS_TRUE=
HAVE_MACOS_FALSE='#'
else
HAVE_MACOS_TRUE='#'
HAVE_MACOS_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
@@ -7128,10 +7118,6 @@ if test -z "${MINGW_TRUE}" && test -z "${MINGW_FALSE}"; then
as_fn_error $? "conditional \"MINGW\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_MACOS_TRUE}" && test -z "${HAVE_MACOS_FALSE}"; then
as_fn_error $? "conditional \"HAVE_MACOS\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7522,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.8, which was
This file was extended by cpuminer-opt $as_me 23.14, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7590,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 23.8
cpuminer-opt config.status 23.14
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -2837,15 +2837,6 @@ static void show_credits()
#define check_cpu_capability() cpu_capability( false )
#define display_cpu_capability() cpu_capability( true )
#if defined(__aarch64__)
#define XSTR(x) STR(x)
#define STR(x) #x
//#pragma message "Building for armv" XSTR(__ARM_ARCH)
#endif
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
@@ -2968,8 +2959,12 @@ static bool cpu_capability( bool display_only )
printf(" Linux\n");
#elif defined(WIN32)
printf(" Windows\n");
#elif defined(__APPLE__)
printf(" MacOS\n");
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" Unix\n");
#else
printf("\n");
printf("\n");
#endif
printf("CPU features: ");
@@ -3671,11 +3666,6 @@ static int thread_create(struct thr_info *thr, void* func)
void get_defconfig_path(char *out, size_t bufsize, char *argv0);
#include "simd-utils.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "compat/aes_helper.c"
int main(int argc, char *argv[])
{
struct thr_info *thr;

View File

@@ -3,12 +3,16 @@
#include <cpuminer-config.h>
#if !( defined(__SSE2__) || ( defined(__aarch64__) && defined(__ARM_NEON) ) )
#warning "Unknown or unsupported CPU, requires x86_64 with SSE2 or AArch64 with NEON."
#endif
#if defined(__x86_64__)
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__)
#define USER_AGENT_ARCH "arm" // AArch64
//#elif
// #define USER_AGENT_ARCH "R5" // RISC-V
// #define USER_AGENT_ARCH "r5" // RISC-V
#else
#define USER_AGENT_ARCH
#endif
@@ -668,6 +672,7 @@ enum algos {
ALGO_X16RT_VEIL,
ALGO_X16S,
ALGO_X17,
ALGO_X20R,
ALGO_X21S,
ALGO_X22I,
ALGO_X25X,
@@ -763,6 +768,7 @@ static const char* const algo_names[] = {
"x16rt-veil",
"x16s",
"x17",
"x20r",
"x21s",
"x22i",
"x25x",
@@ -926,6 +932,7 @@ Options:\n\
x16rt-veil Veil (VEIL)\n\
x16s\n\
x17\n\
x20r\n\
x21s\n\
x22i\n\
x25x\n\

View File

@@ -381,7 +381,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
d0[15] = s[ 60]; d1[15] = s[ 61]; d2[15] = s[ 62]; d3[15] = s[ 63];
}
#endif // SSE4_1 else SSE2 or NEON
#endif // SSE4_1 or NEON else SSE2
static inline void extr_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
@@ -411,11 +411,11 @@ static inline void v128_bswap32_80( void *d, void *s )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), bswap_shuf );
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), bswap_shuf );
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), bswap_shuf );
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf );
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf );
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf );
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf );
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf );
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
@@ -461,11 +461,11 @@ static inline void v128_bswap32_80( void *d, void *s )
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
#if defined(__SSSE3__)
@@ -480,38 +480,38 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
#else
s0 = mm128_bswap_32( s0 );
s1 = mm128_bswap_32( s1 );
s2 = mm128_bswap_32( s2 );
s3 = mm128_bswap_32( s3 );
s4 = mm128_bswap_32( s4 );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
#endif
casti_m128i( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_m128i( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_m128i( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_m128i( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_m128i( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_m128i( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_m128i( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_m128i( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_m128i( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_m128i( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_m128i( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_m128i( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_m128i( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
@@ -797,11 +797,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
const __m256i c1 = v256_32( 0x04050607 );
const __m256i c2 = v256_32( 0x08090a0b );
const __m256i c3 = v256_32( 0x0c0d0e0f );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
@@ -855,11 +855,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
const __m256i c2 = _mm256_add_epi32( c1, c1 );
const __m256i c3 = _mm256_add_epi32( c2, c1 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -1303,11 +1303,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
const __m512i c1 = v512_32( 0x04050607 );
const __m512i c2 = v512_32( 0x08090a0b );
const __m512i c3 = v512_32( 0x0c0d0e0f );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d, 0 ) = _mm512_permutexvar_epi8( c0,
_mm512_castsi128_si512( s0 ) );
@@ -1360,11 +1360,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
const __m512i c1 = v512_32( 1 );
const __m512i c2 = _mm512_add_epi32( c1, c1 );
const __m512i c3 = _mm512_add_epi32( c2, c1 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -1492,20 +1492,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
#if defined(__SSE2__)
casti_m128i( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_m128i( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_m128i( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_m128i( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_m128i( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_m128i( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_m128i( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_m128i( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
casti_m128i( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_m128i( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
#elif defined(__ARM_NEON)
@@ -1719,7 +1719,7 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
{
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
v128_t s4 = casti_m128i( src,4 );
v128_t s4 = casti_v128( src,4 );
casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
@@ -1747,11 +1747,11 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i c0 = v256_64( 0x0405060700010203 );
const __m256i c1 = v256_64( 0x0c0d0e0f08090a0b );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
@@ -1783,7 +1783,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
v128_t s4 = casti_m128i( src,4 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm256_shuffle_epi8( s0, bswap_shuf );
s1 = _mm256_shuffle_epi8( s1, bswap_shuf );
@@ -2162,11 +2162,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const __m512i c0 = v512_64( 0x0405060700010203 );
const __m512i c1 = v512_64( 0x0c0d0e0f08090a0b );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( c0,
_mm512_castsi128_si512( s0 ) );
@@ -2197,11 +2197,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -2391,11 +2391,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m512i bswap_shuf = mm512_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ),
bswap_shuf );
@@ -2415,11 +2415,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -2489,44 +2489,44 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src,
const v128_t *s = (const v128_t*)src;
v128_t *d = (v128_t*)dst;
d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 2] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 6] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd );
d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 2] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 6] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd );
if ( bit_len <= 256 ) return;
d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[10] = mm128_shuffle2_32( s[10], s[11], 0x88 );
d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = mm128_shuffle2_32( s[12], s[13], 0xdd );
d[14] = mm128_shuffle2_32( s[14], s[15], 0x88 );
d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd );
d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[10] = v128_shuffle2_32( s[10], s[11], 0x88 );
d[11] = v128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = v128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = v128_shuffle2_32( s[12], s[13], 0xdd );
d[14] = v128_shuffle2_32( s[14], s[15], 0x88 );
d[15] = v128_shuffle2_32( s[14], s[15], 0xdd );
if ( bit_len <= 512 ) return;
d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = mm128_shuffle2_32( s[16], s[17], 0xdd );
d[18] = mm128_shuffle2_32( s[18], s[19], 0x88 );
d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = mm128_shuffle2_32( s[20], s[21], 0xdd );
d[22] = mm128_shuffle2_32( s[22], s[23], 0x88 );
d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = mm128_shuffle2_32( s[24], s[25], 0xdd );
d[26] = mm128_shuffle2_32( s[26], s[27], 0x88 );
d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = mm128_shuffle2_32( s[28], s[29], 0xdd );
d[30] = mm128_shuffle2_32( s[30], s[31], 0x88 );
d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd );
d[16] = v128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = v128_shuffle2_32( s[16], s[17], 0xdd );
d[18] = v128_shuffle2_32( s[18], s[19], 0x88 );
d[19] = v128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = v128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = v128_shuffle2_32( s[20], s[21], 0xdd );
d[22] = v128_shuffle2_32( s[22], s[23], 0x88 );
d[23] = v128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = v128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = v128_shuffle2_32( s[24], s[25], 0xdd );
d[26] = v128_shuffle2_32( s[26], s[27], 0x88 );
d[27] = v128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = v128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = v128_shuffle2_32( s[28], s[29], 0xdd );
d[30] = v128_shuffle2_32( s[30], s[31], 0x88 );
d[31] = v128_shuffle2_32( s[30], s[31], 0xdd );
// if ( bit_len <= 1024 ) return;
}
@@ -2537,77 +2537,77 @@ static inline void rintrlv_8x64_8x32( void *dst, const void *src,
const v128_t *s = (const v128_t*)src;
v128_t *d = (v128_t*)dst;
d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 2] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 6] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd );
d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = mm128_shuffle2_32( s[10], s[11], 0x88 );
d[10] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = mm128_shuffle2_32( s[14], s[15], 0x88 );
d[14] = mm128_shuffle2_32( s[12], s[13], 0xdd );
d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd );
d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 2] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 6] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd );
d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = v128_shuffle2_32( s[10], s[11], 0x88 );
d[10] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[11] = v128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = v128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = v128_shuffle2_32( s[14], s[15], 0x88 );
d[14] = v128_shuffle2_32( s[12], s[13], 0xdd );
d[15] = v128_shuffle2_32( s[14], s[15], 0xdd );
if ( bit_len <= 256 ) return;
d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = mm128_shuffle2_32( s[18], s[19], 0x88 );
d[18] = mm128_shuffle2_32( s[16], s[17], 0xdd );
d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = mm128_shuffle2_32( s[22], s[23], 0x88 );
d[22] = mm128_shuffle2_32( s[20], s[21], 0xdd );
d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = mm128_shuffle2_32( s[26], s[27], 0x88 );
d[26] = mm128_shuffle2_32( s[24], s[25], 0xdd );
d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = mm128_shuffle2_32( s[30], s[31], 0x88 );
d[30] = mm128_shuffle2_32( s[28], s[29], 0xdd );
d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd );
d[16] = v128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = v128_shuffle2_32( s[18], s[19], 0x88 );
d[18] = v128_shuffle2_32( s[16], s[17], 0xdd );
d[19] = v128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = v128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = v128_shuffle2_32( s[22], s[23], 0x88 );
d[22] = v128_shuffle2_32( s[20], s[21], 0xdd );
d[23] = v128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = v128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = v128_shuffle2_32( s[26], s[27], 0x88 );
d[26] = v128_shuffle2_32( s[24], s[25], 0xdd );
d[27] = v128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = v128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = v128_shuffle2_32( s[30], s[31], 0x88 );
d[30] = v128_shuffle2_32( s[28], s[29], 0xdd );
d[31] = v128_shuffle2_32( s[30], s[31], 0xdd );
if ( bit_len <= 512 ) return;
d[32] = mm128_shuffle2_32( s[32], s[33], 0x88 );
d[33] = mm128_shuffle2_32( s[34], s[35], 0x88 );
d[34] = mm128_shuffle2_32( s[32], s[33], 0xdd );
d[35] = mm128_shuffle2_32( s[34], s[35], 0xdd );
d[36] = mm128_shuffle2_32( s[36], s[37], 0x88 );
d[37] = mm128_shuffle2_32( s[38], s[39], 0x88 );
d[38] = mm128_shuffle2_32( s[36], s[37], 0xdd );
d[39] = mm128_shuffle2_32( s[38], s[39], 0xdd );
d[40] = mm128_shuffle2_32( s[40], s[41], 0x88 );
d[41] = mm128_shuffle2_32( s[42], s[43], 0x88 );
d[42] = mm128_shuffle2_32( s[40], s[41], 0xdd );
d[43] = mm128_shuffle2_32( s[42], s[43], 0xdd );
d[44] = mm128_shuffle2_32( s[44], s[45], 0x88 );
d[45] = mm128_shuffle2_32( s[46], s[47], 0x88 );
d[46] = mm128_shuffle2_32( s[44], s[45], 0xdd );
d[47] = mm128_shuffle2_32( s[46], s[47], 0xdd );
d[32] = v128_shuffle2_32( s[32], s[33], 0x88 );
d[33] = v128_shuffle2_32( s[34], s[35], 0x88 );
d[34] = v128_shuffle2_32( s[32], s[33], 0xdd );
d[35] = v128_shuffle2_32( s[34], s[35], 0xdd );
d[36] = v128_shuffle2_32( s[36], s[37], 0x88 );
d[37] = v128_shuffle2_32( s[38], s[39], 0x88 );
d[38] = v128_shuffle2_32( s[36], s[37], 0xdd );
d[39] = v128_shuffle2_32( s[38], s[39], 0xdd );
d[40] = v128_shuffle2_32( s[40], s[41], 0x88 );
d[41] = v128_shuffle2_32( s[42], s[43], 0x88 );
d[42] = v128_shuffle2_32( s[40], s[41], 0xdd );
d[43] = v128_shuffle2_32( s[42], s[43], 0xdd );
d[44] = v128_shuffle2_32( s[44], s[45], 0x88 );
d[45] = v128_shuffle2_32( s[46], s[47], 0x88 );
d[46] = v128_shuffle2_32( s[44], s[45], 0xdd );
d[47] = v128_shuffle2_32( s[46], s[47], 0xdd );
d[48] = mm128_shuffle2_32( s[48], s[49], 0x88 );
d[49] = mm128_shuffle2_32( s[50], s[51], 0x88 );
d[50] = mm128_shuffle2_32( s[48], s[49], 0xdd );
d[51] = mm128_shuffle2_32( s[50], s[51], 0xdd );
d[52] = mm128_shuffle2_32( s[52], s[53], 0x88 );
d[53] = mm128_shuffle2_32( s[54], s[55], 0x88 );
d[54] = mm128_shuffle2_32( s[52], s[53], 0xdd );
d[55] = mm128_shuffle2_32( s[54], s[55], 0xdd );
d[56] = mm128_shuffle2_32( s[56], s[57], 0x88 );
d[57] = mm128_shuffle2_32( s[58], s[59], 0x88 );
d[58] = mm128_shuffle2_32( s[56], s[57], 0xdd );
d[59] = mm128_shuffle2_32( s[58], s[59], 0xdd );
d[60] = mm128_shuffle2_32( s[60], s[61], 0x88 );
d[61] = mm128_shuffle2_32( s[62], s[63], 0x88 );
d[62] = mm128_shuffle2_32( s[60], s[61], 0xdd );
d[63] = mm128_shuffle2_32( s[62], s[63], 0xdd );
d[48] = v128_shuffle2_32( s[48], s[49], 0x88 );
d[49] = v128_shuffle2_32( s[50], s[51], 0x88 );
d[50] = v128_shuffle2_32( s[48], s[49], 0xdd );
d[51] = v128_shuffle2_32( s[50], s[51], 0xdd );
d[52] = v128_shuffle2_32( s[52], s[53], 0x88 );
d[53] = v128_shuffle2_32( s[54], s[55], 0x88 );
d[54] = v128_shuffle2_32( s[52], s[53], 0xdd );
d[55] = v128_shuffle2_32( s[54], s[55], 0xdd );
d[56] = v128_shuffle2_32( s[56], s[57], 0x88 );
d[57] = v128_shuffle2_32( s[58], s[59], 0x88 );
d[58] = v128_shuffle2_32( s[56], s[57], 0xdd );
d[59] = v128_shuffle2_32( s[58], s[59], 0xdd );
d[60] = v128_shuffle2_32( s[60], s[61], 0x88 );
d[61] = v128_shuffle2_32( s[62], s[63], 0x88 );
d[62] = v128_shuffle2_32( s[60], s[61], 0xdd );
d[63] = v128_shuffle2_32( s[62], s[63], 0xdd );
// if ( bit_len <= 1024 ) return;
}
@@ -3248,12 +3248,21 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
// blend 2 vectors while interleaving: { hi[n], lo[n-1], ... hi[1], lo[0] }
#if defined(__SSE4_1__)
// No SSE2 implementation.
//#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
//#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
#define v128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
#define v128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
#endif // SSE4_1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define v128_intrlv_blend_64( hi, lo ) \
v128_blendv( hi, lo, v128_set64( 0ull, 0xffffffffffffffffull ) )
#define v128_intrlv_blend_32( hi, lo ) \
v128_blendv( hi, lo, v128_set64( 0xffffffffull, 0xffffffffull ) )
#else
// unknown, unsupported architecture
#endif
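For reference, a minimal sketch of what the interleaving blend produces on x86_64, assuming SSE4.1 and the standard intrinsic headers; lane values are illustrative only.
#include <stdint.h>
#include <smmintrin.h>
int main(void)
{
   __m128i lo = _mm_set_epi64x( 11, 10 );   // 64 bit lanes { lo[1], lo[0] }
   __m128i hi = _mm_set_epi64x( 21, 20 );   // 64 bit lanes { hi[1], hi[0] }
   // Control 0x0f selects the low four 16 bit elements from the second arg,
   // i.e. the low 64 bit lane comes from lo and the high lane from hi.
   __m128i r  = _mm_blend_epi16( hi, lo, 0x0f );
   uint64_t out[2];
   _mm_storeu_si128( (__m128i*)out, r );
   return !( out[0] == 10 && out[1] == 21 );   // result is { hi[1], lo[0] }
}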
#if defined(__AVX2__)

View File

@@ -35,17 +35,17 @@
///////////////////////////////////////////////////////////////////////////////
// New architecturally agnostic syntax:
// All users of 128 bit SIMD should use new syntax or protect SSE2 only
// code segments.
// Other vector sizes continue with old syntax for now.
// Definitions here will gradually be converted to new syntax.
// For consistency the larger vector utilities should do the same.
//
// __m128i -> v128_t
// _mm_ -> v128_
// mm128_ -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element sizes. It has no effect on x86_64.
// direct translation of native intrinsics
#define v128_t __m128i
// Needed for ARM
#define v128u64_t v128_t
#define v128u32_t v128_t
#define v128u16_t v128_t
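As an illustration of the intent, a short sketch of application code written only with the architecture-agnostic names; it assumes the project's simd-utils header is included and that v128_load, v128_store, v128_add32 and v128_xor are available on both x86_64 and AArch64. The function name is illustrative.
static inline void mix_block( void *dst, const void *a, const void *b )
{
   v128_t va = v128_load( a );   // _mm_load_si128 on x86_64, vld1q on NEON
   v128_t vb = v128_load( b );
   v128_t r  = v128_xor( v128_add32( va, vb ), vb );
   v128_store( dst, r );
}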
@@ -56,17 +56,15 @@
// Needed for ARM; doesn't do anything special on x86_64
#define v128_load1_64(p) _mm_set1_epi64x(*(uint64_t*)(p) )
#define v128_load1_32(p) _mm_set_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p) _mm_set_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p) _mm_set_epi8( *(uint8_t*) (p) )
#define v128_load1_32(p) _mm_set1_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p) _mm_set1_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p) _mm_set1_epi8( *(uint8_t*) (p) )
// arithmetic
#define v128_add64 _mm_add_epi64
#define v128_add32 _mm_add_epi32
#define v128_add16 _mm_add_epi16
#define v128_add8 _mm_add_epi8
#define v128_add4_64 mm128_add4_64
#define v128_add4_32 mm128_add4_32
#define v128_sub64 _mm_sub_epi64
#define v128_sub32 _mm_sub_epi32
@@ -82,7 +80,7 @@
#define v128_mulw32 _mm_mul_epu32
#define v128_mulw16 _mm_mul_epu16
// compare
// signed compare
#define v128_cmpeq64 _mm_cmpeq_epi64
#define v128_cmpeq32 _mm_cmpeq_epi32
#define v128_cmpeq16 _mm_cmpeq_epi16
@@ -120,27 +118,6 @@
#define v128_xor _mm_xor_si128
#define v128_xorq _mm_xor_si128
#define v128_andnot _mm_andnot_si128
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_ornot( a, b ) mm128_or( a, mm128_not( b ) )
// ternary
#define v128_xorandnot( v2, v1, v0 ) \
_mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) \
_mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
// shift 2 concatenated vectors right
#define v128_alignr64 mm128_alignr_64
#define v128_alignr32 mm128_alignr_32
#if defined(__SSSE3__)
#define v128_alignr8 _mm_alignr_epi8
#endif
// unpack
#define v128_unpacklo64 _mm_unpacklo_epi64
@@ -230,7 +207,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#endif
// broadcast lane l to all lanes
// broadcast (replicate) lane l to all lanes
#define v128_replane64( v, l ) \
( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
: _mm_shuffle_epi32( v, 0xee )
@@ -243,24 +220,22 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
// Pseudo constants
#define v128_zero _mm_setzero_si128()
#define m128_zero _mm_setzero_si128()
#if defined(__SSE4_1__)
// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0 _mm_testz_si128
#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0)
// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }
#endif
// Bitwise compare, return 1 if all bits are set.
#define v128_cmpeq1 _mm_test_all_ones
#define v128_cmpeq1(v) _mm_test_all_ones(v)
#define v128_one mm128_mov64_128( 1 )
#define m128_one_128 v128_one
#define v128_one mm128_mov64_128(1)
// ASM avoids the need to initialize return variable to avoid compiler warning.
// Macro abstracts function parentheses to look like an identifier.
@@ -274,17 +249,14 @@ static inline __m128i v128_neg1_fn()
#endif
return a;
}
#define m128_neg1_fn v128_neg1_fn
#define v128_neg1 v128_neg1_fn()
#define m128_neg1 v128_neg1
//
// Vector pointer cast
// p = any aligned pointer
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))
#define castp_v128 castp_m128i
#define castp_v128(p) ((__m128i*)(p))
#define castp_v128u64 castp_v128
#define castp_v128u32 castp_v128
#define castp_v128u16 castp_v128
@@ -292,8 +264,7 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))
#define cast_v128 cast_m128i
#define cast_v128(p) (*((__m128i*)(p)))
#define cast_v128u64 cast_v128
#define cast_v128u32 cast_v128
#define cast_v128u16 cast_v128
@@ -301,8 +272,8 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casti_v128 casti_m128i
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
#define casti_m128i casti_v128 // deprecated
#define casti_v128u64 casti_v128
#define casti_v128u32 casti_v128
#define casti_v128u16 casti_v128
@@ -310,7 +281,7 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
#define casto_v128(p,o) (((__m128i*)(p))+(o))
#if defined(__SSE4_1__)
#define v128_get64( v, l ) _mm_extract_epi64( v, l )
@@ -325,7 +296,7 @@ static inline __m128i v128_neg1_fn()
/////////////////////////////////////////////////////////////
//
// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c )
// _mm_insert_ps( __m128i v1, __m128i v2, imm8 c )
//
// Fast and powerful but very limited in its application.
// It requires SSE4.1 but only works with 128 bit vectors with 32 bit
@@ -348,7 +319,7 @@ static inline __m128i v128_neg1_fn()
// c[7:6] source element selector
// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define mm128_xim_32( v1, v0, c ) \
#define v128_xim32( v1, v0, c ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v0 ), c ) )
@@ -356,20 +327,19 @@ static inline __m128i v128_neg1_fn()
/*
// Copy i32 to element c of dest and copy remaining elements from v.
#define v128_put32( v, i32, c ) \
mm128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
v128_xim32( v, mm128_mov32_128( i32 ), (c)<<4 )
*/
#define mm128_mask_32( v, m ) mm128_xim_32( v, v, m )
#define v128_mask32( v, m ) v128_xim32( v, v, m & 0xf )
// Zero 32 bit elements when corresponding bit in 4 bit mask is set.
//static inline __m128i mm128_mask_32( const __m128i v, const int m )
//{ return mm128_xim_32( v, v, m ); }
#define v128_mask32 mm128_mask_32
//static inline __m128i v128_mask32( const __m128i v, const int m )
//{ return v128_xim32( v, v, m ); }
// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
// Copy element l0 of v0 to element l1 of dest and copy remaining elements from v1.
#define v128_movlane32( v1, l1, v0, l0 ) \
mm128_xim_32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
v128_xim32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
#endif // SSE4_1
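A small usage sketch of the insert-based lane move, assuming SSE4.1; the expansion is the same _mm_insert_ps pattern used by v128_xim32 above, with illustrative values.
#include <stdint.h>
#include <smmintrin.h>
int main(void)
{
   __m128i v1 = _mm_set_epi32( 13, 12, 11, 10 );   // lanes 3..0
   __m128i v0 = _mm_set_epi32( 23, 22, 21, 20 );
   // v128_movlane32( v1, 2, v0, 0 ): copy lane 0 of v0 into lane 2 of v1.
   __m128i r = _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ),
                                 _mm_castsi128_ps( v0 ), (2<<4) | (0<<6) ) );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   return !( out[0]==10 && out[1]==11 && out[2]==20 && out[3]==13 );
}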
@@ -380,115 +350,112 @@ static inline __m128i v128_neg1_fn()
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
static inline __m128i mm128_not( const __m128i v )
static inline __m128i v128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
#else
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#define v128_not( v ) _mm_xor_si128( v, v128_neg1 )
#endif
#define v128_not mm128_not
static inline __m128i mm128_negate_64( __m128i v )
static inline v128u64_t v128_negate_64( v128u64_t v )
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
#define v128_negate64 mm128_negate_64
static inline __m128i mm128_negate_32( __m128i v )
static inline v128u32_t v128_negate_32( v128u32_t v )
{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }
#define v128_negate32 mm128_negate_32
static inline __m128i mm128_negate_16( __m128i v )
static inline v128u16_t v128_negate_16( v128u16_t v )
{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }
#define v128_negate16 mm128_negate_16
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
#define v128_add4_64( a, b, c, d ) \
_mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
#define mm128_add4_32( a, b, c, d ) \
#define v128_add4_32( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
#define v128_add4_32 mm128_add4_32
#define mm128_add4_16( a, b, c, d ) \
#define v128_add4_16( a, b, c, d ) \
_mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
#define mm128_add4_8( a, b, c, d ) \
#define v128_add4_8( a, b, c, d ) \
_mm_add_epi8( _mm_add_epi8( a, b ), _mm_add_epi8( c, d ) )
#define mm128_xor4( a, b, c, d ) \
#define v128_xor4( a, b, c, d ) \
_mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
// Memory functions
// Mostly for convenience, avoids calculating bytes.
// Assumes data is aligned and integral.
// n = number of __m128i, bytes/16
static inline void memset_zero_128( __m128i *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }
#define v128_memset_zero memset_zero_128
static inline void v128_memset_zero( v128_t *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = v128_zero; }
#define memset_zero_128 v128_memset_zero
static inline void memset_128( __m128i *dst, const __m128i a, const int n )
static inline void v128_memset( v128_t *dst, const v128_t a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
#define v128_memset memset_128
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v128_memcpy memcpy_128
#define memcpy_128 v128_memcpy
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
// a ^ b ^ c
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
#define v128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
// a & b & c
#define mm128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
#define v128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
#define v128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
#define v128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
#define v128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
#define v128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
#define v128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define mm128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define mm128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define v128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define mm128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define mm128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#endif
#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) )
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
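On plain SSE2 such masks are commonly built from the floating point move-mask instructions; a sketch with hypothetical helper names, not necessarily the project's exact definitions.
#include <emmintrin.h>
static inline int v128_movmask64_sketch( __m128i v )
{  // 2 bit mask from the sign bit of each 64 bit element
   return _mm_movemask_pd( _mm_castsi128_pd( v ) );
}
static inline int v128_movmask32_sketch( __m128i v )
{  // 4 bit mask from the sign bit of each 32 bit element
   return _mm_movemask_ps( _mm_castsi128_ps( v ) );
}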
@@ -514,7 +481,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
// These should never be called from application code, use rol/ror.
// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -529,14 +496,14 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#if defined(__AVX512VL__)
// AVX512 fastest all rotations.
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
// AVX512 fastest for all rotations.
#define v128_ror64 _mm_ror_epi64
#define v128_rol64 _mm_rol_epi64
#define v128_ror32 _mm_ror_epi32
#define v128_rol32 _mm_rol_epi32
// ror/rol will alway find the fastest but these names may fit better with
// application code performing shuffles rather than bit rotations.
// ror/rol will always find the fastest but these names may fit better with
// application code performing byte operations rather than bit rotations.
#define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 )
#define v128_shufll64_8( v) _mm_rol_epi64( v, 8 )
#define v128_shuflr64_16(v) _mm_ror_epi64( v, 16 )
@@ -549,7 +516,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shufll32_16(v) _mm_rol_epi32( v, 16 )
#elif defined(__SSSE3__)
// SSE2: fastest 32 bit, very fast 16, fast 8
// SSSE3: fastest 32 bit, very fast 16, fast 8
#define v128_shuflr64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -575,7 +542,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
#define mm128_ror_64( v, c ) \
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
@@ -585,7 +552,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: v128_ror64_sse2( v, c )
#define mm128_rol_64( v, c ) \
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
@@ -595,57 +562,54 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: v128_rol64_sse2( v, c )
#define mm128_ror_32( v, c ) \
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? v128_lrev16( v ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: v128_ror32_sse2( v, c )
#define mm128_rol_32( v, c ) \
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? v128_lrev16( v ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: v128_rol32_sse2( v, c )
#elif defined(__SSE2__)
// SSE2: fastest 32 bit, very fast 16
// SSE2: fastest 32 bit, very fast 16, all else slow
#define mm128_ror_64( v, c ) \
#define v128_ror64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
: v128_ror64_sse2( v, c )
#define mm128_rol_64( v, c ) \
#define v128_rol64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
: v128_rol64_sse2( v, c )
#define mm128_ror_32( v, c ) \
#define v128_ror32( v, c ) \
( (c) == 16 ) ? v128_lrev16( v ) \
: v128_ror32_sse2( v, c )
#define mm128_rol_32( v, c ) \
#define v128_rol32( v, c ) \
( (c) == 16 ) ? v128_lrev16( v ) \
: v128_rol32_sse2( v, c )
#else
#define mm128_ror_64 v128_ror64_sse2
#define mm128_rol_64 v128_rol64_sse2
#define mm128_ror_32 v128_ror32_sse2
#define mm128_rol_32 v128_rol32_sse2
#define v128_ror64 v128_ror64_sse2
#define v128_rol64 v128_rol64_sse2
#define v128_ror32 v128_ror32_sse2
#define v128_rol32 v128_rol32_sse2
#endif
// Generic names for portable code
#define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64
#define v128_ror32 mm128_ror_32
#define v128_rol32 mm128_rol_32
// deprecated
#define mm128_rol_32 v128_rol32
/* not used
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
@@ -653,25 +617,25 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm128_rorx2_64( v1, v0, c ) \
#define v128_2ror64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
_mm_ror_epi64( v1, c )
#define mm128_rolx2_64( v1, v0, c ) \
#define v128_2rol64( v1, v0, c ) \
_mm_rol_epi64( v0, c ); \
_mm_rol_epi64( v1, c )
#define mm128_rorx2_32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
_mm_ror_epi32( v0, c ); \
_mm_ror_epi32( v1, c )
#define mm128_rolx2_32( v1, v0, c ) \
#define v128_2rol32( v1, v0, c ) \
_mm_rol_epi32( v0, c ); \
_mm_rol_epi32( v1, c )
#else // SSE2
#define mm128_rorx2_64( v1, v0, c ) \
#define v128_2ror64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
__m128i t1 = _mm_srli_epi64( v1, c ); \
@@ -681,7 +645,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
v1 = _mm_or_si256( v1, t1 ); \
}
#define mm128_rolx2_64( v1, v0, c ) \
#define v128_2rol64( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi64( v0, c ); \
__m128i t1 = _mm_slli_epi64( v1, c ); \
@@ -691,7 +655,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
v1 = _mm_or_si256( v1, t1 ); \
}
#define mm128_rorx2_32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi32( v0, c ); \
__m128i t1 = _mm_srli_epi32( v1, c ); \
@@ -701,7 +665,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
v1 = _mm_or_si256( v1, t1 ); \
}
#define mm128_rolx2_32( v1, v0, c ) \
#define v128_2rol32( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi32( v0, c ); \
__m128i t1 = _mm_slli_epi32( v1, c ); \
@@ -712,20 +676,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
}
#endif // AVX512 else SSE2
#define v128_2ror64 mm128_rorx2_64
#define v128_2rol64 mm128_rolx2_64
#define v128_2ror32 mm128_rorx2_32
#define v128_2rol32 mm128_rolx2_32
*/
// Cross lane shuffles
// No NEON version
#define v128_shuffle32 _mm_shuffle_epi32
// shuffle using vector mask, for compatibility with NEON
/* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \
v128_shuffle32( v, mm128_movmask_32( vmask ) )
*/
#define v128_shuffle8 _mm_shuffle_epi8
@@ -734,12 +695,10 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shuffle2_64( v1, v2, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
_mm_castsi128_pd( v2 ), c ) );
#define mm128_shuffle2_64 v128_shuffle2_64
#define v128_shuffle2_32( v1, v2, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
#define mm128_shuffle2_32 v128_shuffle2_32
// Rotate vector elements across all lanes
@@ -756,95 +715,77 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )
//TODO fix this
// alias bswap
//#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
//#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
//#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) )
// reverse bits, can it be done?
//#define v128_bitrev8( v ) vrbitq_u8
/* Not used
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
*/
// Endian byte swap.
#if defined(__SSSE3__)
#define mm128_bswap_128( v ) \
#define v128_bswap128( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define mm128_bswap_64( v ) \
#define v128_bswap64( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
#define mm128_bswap_32( v ) \
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
// deprecated
#define mm128_bswap_32 v128_bswap32
#define mm128_bswap_16( v ) \
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_1024( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
@@ -852,129 +793,127 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_block_bswap32_128( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
}
#define v128_block_bswap32_512( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}
#else // SSE2
static inline __m128i mm128_bswap_64( __m128i v )
static inline v128_t v128_bswap64( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}
static inline __m128i mm128_bswap_32( __m128i v )
static inline v128_t v128_bswap32( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}
#define mm128_bswap_32 v128_bswap32
static inline __m128i mm128_bswap_16( __m128i v )
static inline v128_t v128_bswap16( __m128i v )
{
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define mm128_bswap_128( v ) v128_qrev32( v128_bswap64( v ) )
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
{
d[0] = mm128_bswap_64( s[0] );
d[1] = mm128_bswap_64( s[1] );
d[2] = mm128_bswap_64( s[2] );
d[3] = mm128_bswap_64( s[3] );
d[4] = mm128_bswap_64( s[4] );
d[5] = mm128_bswap_64( s[5] );
d[6] = mm128_bswap_64( s[6] );
d[7] = mm128_bswap_64( s[7] );
d[0] = v128_bswap64( s[0] );
d[1] = v128_bswap64( s[1] );
d[2] = v128_bswap64( s[2] );
d[3] = v128_bswap64( s[3] );
d[4] = v128_bswap64( s[4] );
d[5] = v128_bswap64( s[5] );
d[6] = v128_bswap64( s[6] );
d[7] = v128_bswap64( s[7] );
}
#define v128_block_bswap64_512 mm128_block_bswap_64
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = mm128_bswap_64( s[ 0] );
d[ 1] = mm128_bswap_64( s[ 1] );
d[ 2] = mm128_bswap_64( s[ 2] );
d[ 3] = mm128_bswap_64( s[ 3] );
d[ 4] = mm128_bswap_64( s[ 4] );
d[ 5] = mm128_bswap_64( s[ 5] );
d[ 6] = mm128_bswap_64( s[ 6] );
d[ 7] = mm128_bswap_64( s[ 7] );
d[ 8] = mm128_bswap_64( s[ 8] );
d[ 9] = mm128_bswap_64( s[ 9] );
d[10] = mm128_bswap_64( s[10] );
d[11] = mm128_bswap_64( s[11] );
d[14] = mm128_bswap_64( s[12] );
d[13] = mm128_bswap_64( s[13] );
d[14] = mm128_bswap_64( s[14] );
d[15] = mm128_bswap_64( s[15] );
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
d[ 2] = v128_bswap64( s[ 2] );
d[ 3] = v128_bswap64( s[ 3] );
d[ 4] = v128_bswap64( s[ 4] );
d[ 5] = v128_bswap64( s[ 5] );
d[ 6] = v128_bswap64( s[ 6] );
d[ 7] = v128_bswap64( s[ 7] );
d[ 8] = v128_bswap64( s[ 8] );
d[ 9] = v128_bswap64( s[ 9] );
d[10] = v128_bswap64( s[10] );
d[11] = v128_bswap64( s[11] );
d[12] = v128_bswap64( s[12] );
d[13] = v128_bswap64( s[13] );
d[14] = v128_bswap64( s[14] );
d[15] = v128_bswap64( s[15] );
}
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
{
d[0] = mm128_bswap_32( s[0] );
d[1] = mm128_bswap_32( s[1] );
d[2] = mm128_bswap_32( s[2] );
d[3] = mm128_bswap_32( s[3] );
d[4] = mm128_bswap_32( s[4] );
d[5] = mm128_bswap_32( s[5] );
d[6] = mm128_bswap_32( s[6] );
d[7] = mm128_bswap_32( s[7] );
d[0] = v128_bswap32( s[0] );
d[1] = v128_bswap32( s[1] );
d[2] = v128_bswap32( s[2] );
d[3] = v128_bswap32( s[3] );
d[4] = v128_bswap32( s[4] );
d[5] = v128_bswap32( s[5] );
d[6] = v128_bswap32( s[6] );
d[7] = v128_bswap32( s[7] );
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = mm128_bswap_32( s[ 0] );
d[ 1] = mm128_bswap_32( s[ 1] );
d[ 2] = mm128_bswap_32( s[ 2] );
d[ 3] = mm128_bswap_32( s[ 3] );
d[ 4] = mm128_bswap_32( s[ 4] );
d[ 5] = mm128_bswap_32( s[ 5] );
d[ 6] = mm128_bswap_32( s[ 6] );
d[ 7] = mm128_bswap_32( s[ 7] );
d[ 8] = mm128_bswap_32( s[ 8] );
d[ 9] = mm128_bswap_32( s[ 9] );
d[10] = mm128_bswap_32( s[10] );
d[11] = mm128_bswap_32( s[11] );
d[12] = mm128_bswap_32( s[12] );
d[13] = mm128_bswap_32( s[13] );
d[14] = mm128_bswap_32( s[14] );
d[15] = mm128_bswap_32( s[15] );
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
d[ 2] = v128_bswap32( s[ 2] );
d[ 3] = v128_bswap32( s[ 3] );
d[ 4] = v128_bswap32( s[ 4] );
d[ 5] = v128_bswap32( s[ 5] );
d[ 6] = v128_bswap32( s[ 6] );
d[ 7] = v128_bswap32( s[ 7] );
d[ 8] = v128_bswap32( s[ 8] );
d[ 9] = v128_bswap32( s[ 9] );
d[10] = v128_bswap32( s[10] );
d[11] = v128_bswap32( s[11] );
d[12] = v128_bswap32( s[12] );
d[13] = v128_bswap32( s[13] );
d[14] = v128_bswap32( s[14] );
d[15] = v128_bswap32( s[15] );
}
#endif // SSSE3 else SSE2
#define v128_bswap32 mm128_bswap_32
#define v128_bswap64 mm128_bswap_64
#define v128_bswap128 mm128_bswap_128
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
@@ -984,24 +923,20 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#if defined(__SSSE3__)
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#define v128_alignr8 _mm_alignr_epi8
#define v128_alignr64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define v128_alignr32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#else
#define mm128_alignr_64( hi, lo, c ) \
#define v128_alignr64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_alignr_32( hi, lo, c ) \
#define v128_alignr32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
#endif
// NEON uses only a vector mask. x86 blend selects the second arg when the
// control bit is set; blendv selects the second arg when the sign bit is
// set. AND masking is the opposite: elements are selected from the first
// arg when the mask bits are set. Arm blend is a bit-by-bit blend while
// x86 is an element blend. The logic is reversed here so mask usage is
// consistent with both formats.
#if defined(__SSE4_1__)
#define v128_blendv _mm_blendv_epi8
@@ -1009,7 +944,7 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#else
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
#endif
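A quick sanity-check sketch of the fallback convention, assuming only SSE2: set mask bits select from v0, clear bits select from v1; values are illustrative.
#include <stdint.h>
#include <emmintrin.h>
int main(void)
{
   __m128i v1   = _mm_set1_epi32( 0x11111111 );
   __m128i v0   = _mm_set1_epi32( 0x22222222 );
   __m128i mask = _mm_set_epi32( -1, 0, -1, 0 );   // select v0 in lanes 1 & 3
   __m128i r = _mm_or_si128( _mm_andnot_si128( mask, v1 ),
                             _mm_and_si128( mask, v0 ) );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   return !( out[0]==0x11111111 && out[1]==0x22222222 &&
             out[2]==0x11111111 && out[3]==0x22222222 );
}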

View File

@@ -90,7 +90,7 @@ typedef union
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_128 mm256_bcast_m128( m128_one_128 )
#define m256_one_128 mm256_bcast_m128( v128_one )
static inline __m256i mm256_neg1_fn()
{

View File

@@ -21,36 +21,36 @@
//
// vornq( v1, v0 ) or( v1, not( v0 ) )
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
#define v128u32_t uint32x4_t
#define v128u16_t uint16x8_t
#define v128u8_t uint8x16_t
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
#define v128u32_t uint32x4_t
#define v128u16_t uint16x8_t
#define v128u8_t uint8x16_t
// load & store
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u64_load( p ) vld1q_u64( (uint64_t*)(p) )
#define v128u64_store( p, v ) vst1q_u64( (uint64_t*)(p), v )
#define v128u32_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128u32_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u16_load( p ) vld1q_u16( (uint16_t*)(p) )
#define v128u16_store( p, v ) vst1q_u16( (uint16_t*)(p), v )
#define v128u8_load( p ) vld1q_u16( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u16( (uint8_t*)(p), v )
#define v128u64_load( p ) vld1q_u64( (uint64_t*)(p) )
#define v128u64_store( p, v ) vst1q_u64( (uint64_t*)(p), v )
#define v128u32_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128u32_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u16_load( p ) vld1q_u16( (uint16_t*)(p) )
#define v128u16_store( p, v ) vst1q_u16( (uint16_t*)(p), v )
#define v128u8_load( p ) vld1q_u16( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u16( (uint8_t*)(p), v )
// load & set1 combined
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
#define v128_load1_8( p) vld1q_dup_u8( (uint8_t*) (p) )
// load & set1 combined, doesn't work
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
#define v128_load1_8( p) vld1q_dup_u8( (uint8_t*) (p) )
// arithmetic
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8
#define v128_add4_64( v3, v2, v1, v0 ) \
vaddq_u64( vaddq_u64( v3, v2 ), vaddq_u64( v1, v0 ) )
@@ -58,17 +58,17 @@
#define v128_add4_32( v3, v2, v1, v0 ) \
vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
// returns low half, u64 undocumented, may not exist.
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
// slow, tested with argon2d
// Widening multiply, align source elements with Intel
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
{
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
@@ -76,101 +76,102 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
}
// compare
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16
#define v128_cmpeq8 vceqq_u8
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16
#define v128_cmpeq8 vceqq_u8
#define v128_iszero vceqzq_u64
// v128_cmp0, v128_cmpz, v128_testz
#define v128_iszero vceqzq_u64
// Not yet needed
//#define v128_cmpeq1
// Signed
#define v128_cmpgt64( v1, v0 ) vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
#define v128_cmpgt32( v1, v0 ) vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
#define v128_cmpgt16( v1, v0 ) vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
#define v128_cmpgt8( v1, v0 ) vcgtq_s8( (int8x16_t)v1, (int8x16_t)v0 )
#define v128_cmpgt64 vcgtq_u64
#define v128_cmpgt32 vcgtq_u32
#define v128_cmpgt16 vcgtq_u16
#define v128_cmpgt8 vcgtq_u8
#define v128_cmplt64( v1, v0 ) vcltq_s64( (int64x2_t)v1, (int64x2_t)v0 )
#define v128_cmplt32( v1, v0 ) vcltq_s32( (int32x4_t)v1, (int32x4_t)v0 )
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 )
#define v128_cmplt64 vcltq_u64
#define v128_cmplt32 vcltq_u32
#define v128_cmplt16 vcltq_u16
#define v128_cmplt8 vcltq_u8
// Logical bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16
#define v128_sl8 vshlq_n_u8
// bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16
#define v128_sl8 vshlq_n_u8
#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16
#define v128_sr8 vshrq_n_u8
#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16
#define v128_sr8 vshrq_n_u8
// Unit tested, working.
#define v128_sra64 vshrq_n_s64
#define v128_sra32 vshrq_n_s32
#define v128_sra16 vshrq_n_s16
// Arithmetic shift.
#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)v, c )
#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)v, c )
#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)v, c )
// unary logic
#define v128_not vmvnq_u32
#define v128_not vmvnq_u32
// binary logic
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_xor veorq_u32
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_xor veorq_u32
// ~v1 & v0
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 )
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 )
// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, x86_64 convention, first arg is not'ed
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
// ternary logic
// v2 ^ v1 ^ v0
// veorq_u32 not defined
//#define v128_xor3 veor3q_u32
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
// v2 & v1 & v0
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
// v2 | v1 | v0
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
// a ^ ( ~b & c )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
// a ^ ( b & c )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
// a & ( b ^ c )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
// a ^ ( b | c )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
// v2 | ( v1 & v0 )
#define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) )
#define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) )
// shift 2 concatenated vectors right.
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
#define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c )
#define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c )
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
#define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c )
#define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c )
// Interleave the high or low halves of 2 vectors.
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 )
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 )
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 )
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v1, v0 )
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v1, v0 )
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v1, v0 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 )
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 )
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 )
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v1, v0 )
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v1, v0 )
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v1, v0 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
// AES
@@ -184,19 +185,19 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_aesenclast( v, k ) \
v128_xor( k, vaeseq_u8( v, v128_zero ) )
#define v128_aesenclast_nokey( v, k ) \
#define v128_aesenclast_nokey( v ) \
vaeseq_u8( v, v128_zero )
#define v128_aesdec( v, k ) \
v128_xor( k, vaesimcq_u8( vaesdq_u8( v, v128_zero ) ) )
#define v128_aesdec_nokey( v, k ) \
#define v128_aesdec_nokey( v ) \
vaesimcq_u8( vaesdq_u8( v, v128_zero ) )
#define v128_aesdeclast( v, k ) \
v128_xor( k, vaesdq_u8( v, v128_zero ) )
#define v128_aesdeclast_nokey( v, k ) \
#define v128_aesdeclast_nokey( v ) \
vaesdq_u8( v, v128_zero )
@@ -254,24 +255,24 @@ typedef union
#define v128_8 vmovq_n_u8
#define v64_set32( u32_1, u32_0 ) \
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
#define v64_set16( u16_3, u16_2, u16_1, u16_0 ) \
vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16 ) \
| (uint32_t)(u16_2) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(u16_1) << 16 ) \
| (uint32_t)(u16_0) ) ) )
vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16) \
| (uint32_t)(u16_2) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(u16_1) << 16) \
| (uint32_t)(u16_0) ) ) )
#define v64_set8( u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ) \
vcreate_u8( \
( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_7) << 8 ) \
| (uint16_t)(u8_6) ) << 16 ) \
| ( (uint32_t)(((uint16_t)(u8_5) << 8 ) \
| (uint16_t)(u8_4) ) )) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_3) << 8 ) \
| (uint16_t)(u8_2) ) << 16 ) \
| ( (uint32_t)(((uint16_t)(u8_1) << 8 ) \
| (uint16_t)(u8_0) ) )) ))
vcreate_u8( \
( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_7) << 8) \
| (uint16_t)(u8_6) ) << 16 ) \
| ( (uint32_t)( ((uint16_t)(u8_5) << 8) \
| (uint16_t)(u8_4) ) ) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_3) << 8) \
| (uint16_t)(u8_2) ) << 16 ) \
| ( (uint32_t)( ((uint16_t)(u8_1) << 8) \
| (uint16_t)(u8_0) ) ) ) ) )
#define v128_set64( u64_1, u64_0 ) \
vcombine_u64( vcreate_u64( u64_0 ), vcreate_u64( u64_1 ) )
@@ -336,27 +337,27 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
// Bit rotation
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
#define v128_rol64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
#define v128_ror32( v, c ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
#define v128_rol32( v, c ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
#define v128_ror16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
: vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
#define v128_rol16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
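Each case above expands to a shift-insert pair (vsri/vsli) when the count is not a half-element reverse; as a scalar reference, the 64 bit lane rotation being emulated is simply the usual form, assuming 0 < c < 64.
#include <stdint.h>
// scalar reference for the 64 bit lane rotation above
static inline uint64_t ror64_ref( uint64_t x, unsigned c )
{
   return ( x >> c ) | ( x << ( 64 - c ) );
}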
#define v128_ror8( v, c ) \
@@ -405,33 +406,17 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
v1 = vorrq_u32( v1, t1 ); \
}
// Cross lane shuffles, no programmable shuffle in NEON
// vector mask, use as last resort. prefer rev, alignr, etc
/* not used anywhere and hopefully never will
// vector mask, use as last resort. prefer tbl, rev, alignr, etc
#define v128_shufflev32( v, vmask ) \
v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \
*/
// compatible with x86_64, but very slow, avoid
#define v128_shuffle8( v, vmask ) \
v128_set8( ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[15] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[14] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[13] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[12] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[11] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[10] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 9] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 8] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 7] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 6] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 5] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 4] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 3] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 2] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )
vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask )
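For example, a byte-swap-within-32-bit-lanes control vector works here exactly as it would with _mm_shuffle_epi8 on x86_64; a minimal NEON sketch (in practice vrev32q_u8 is the faster native route for this particular pattern).
#include <arm_neon.h>
static inline uint32x4_t bswap32_lanes( uint32x4_t v )
{
   const uint8x16_t ctl = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
   return (uint32x4_t)vqtbl1q_u8( (uint8x16_t)v, ctl );
}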
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
@@ -549,20 +534,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Programmable shuffles
// No shuffles compatible with x86_64; these will require targeted user code.
#define v128_extractmask8( df, de, dd, dc, db, da, d9, d8, \
d7, d6, d5, d4, d3, d2, d1, d0, vmask ) \
d0 = ((uint8_t*)(&vmask))[0]; d1 = ((uint8_t*)(&vmask))[1]; \
d2 = ((uint8_t*)(&vmask))[2]; d3 = ((uint8_t*)(&vmask))[3]; \
d4 = ((uint8_t*)(&vmask))[0]; d5 = ((uint8_t*)(&vmask))[1]; \
d6 = ((uint8_t*)(&vmask))[2]; d7 = ((uint8_t*)(&vmask))[3]; \
d8 = ((uint8_t*)(&vmask))[0]; d9 = ((uint8_t*)(&vmask))[1]; \
da = ((uint8_t*)(&vmask))[2]; db = ((uint8_t*)(&vmask))[3]; \
dc = ((uint8_t*)(&vmask))[0]; dd = ((uint8_t*)(&vmask))[1]; \
de = ((uint8_t*)(&vmask))[2]; df = ((uint8_t*)(&vmask))[3];
// Blendv
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )

View File

@@ -930,7 +930,9 @@ static inline void cpu_brand_string( char* s )
#elif defined(__arm__) || defined(__aarch64__)
sprintf( s, "ARM 64 bit CPU" );
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
sprintf( s, "ARM 64 bit CPU, HWCAP %08x", cpu_info[0] );
#else