v23.15

v23.14
v23.13
2025-09-17 23:44:27 +00:00 · 2023-11-30 14:36:47 -05:00 · 2023-11-28 00:58:43 -05:00 · 2023-11-21 14:18:15 -05:00 · 2023-11-20 11:51:57 -05:00
79 changed files with 2913 additions and 1796 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -250,6 +250,7 @@ cpuminer_SOURCES = \
  algo/x16/x16rt.c \
  algo/x16/x16rt-4way.c \
  algo/x16/hex.c \
  algo/x16/x20r.c \
  algo/x16/x21s-4way.c \
  algo/x16/x21s.c \
  algo/x16/minotaur.c \
--- a/README.md
+++ b/README.md
@@ -87,7 +87,6 @@ Supported Algorithms
                          groestl       Groestl coin
                          hex           x16r-hex
                          hmq1725       
                          hodl          Hodlcoin
                          jha           Jackpotcoin
                          keccak        Maxcoin
                          keccakc       Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
                          scrypt:N      scrypt(N, 1, 1)
                          scryptn2      scrypt(1048576, 1, 1)
                          sha256d       Double SHA-256
                          sha256dt
                          sha256q       Quad SHA-256
                          sha256t       Triple SHA-256
                          sha3d         Double keccak256 (BSHA3)
                          sha512256d
                          skein         Skein+Sha (Skeincoin)
                          skein2        Double Skein (Woodcoin)
                          skunk         Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
                          x16rt-veil    veil
                          x16s          
                          x17
                          x20r
                          x21s
                          x22i
                          x25x
--- a/50
+++ b/50
@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
 Requirements
 ------------
-Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
+- A x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
-supported.
+- Arm CPU supporting AArch64 and NEON.
-64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
+32 bit CPUs are not supported.
 are not supported. FreeBSD YMMV.
-ARM requirements (Beta):
+Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
-CPU: Armv8 and NEON, SHA2 & AES are optional
+Mining on mobile devices that meet the requirements is not recommended due to the risk of
-OS: Linux distribution built for AArch64.
+overheating and damaging the battery. Mining has unlimited demand, it will push any device
-Packages: source code only.
+to or beyond its limits. There is also a fire risk with overheated lithium batteries.
 Beware of apps claiming "mobile only mining". There is no such thing, they aren't miners.
 If a mobile CPU can mine it any CPU can.
 See wiki for details.
@@ -73,12 +75,40 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 v23.15
 Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
 ARM: Fugue AES optimizations enabled.
 ARM: quark, qubit, x11gost algos optimized with NEON & AES.
 v23.14
 ARM: Groestl AES optimizations enabled.
 All: Small optimization to Shabal 4way.
 x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
 All: deleted some unused files.
 v23.13
 Added x20r algo.
 Eliminated redundant hash order calculations for x16r family.
 v23.12
 Several bugs fixes and speed improvements for x16r family for all CPU architectures.
 v23.11
 This is a release candidate for full AArch64 support, marking the end of the Beta phase.
 Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
 Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
 v23.10
 x86_64: Fixed scrypt, scryptn2 algos SSE2. 
-Fixed sha512d256d algo AVX2, SSE2, NEON.
+Fixed sha512256d algo AVX2, SSE2, NEON.
 Fixed a bug in Skein N-way that reduced performance.
-ARM: Skein algo optimized for NEON & SHA2.
+ARM: Skein optimized for NEON, SHA2 & SSE2.
 Skein2 algo 2-way optimized for NEON & SSE2.
 v23.9
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -368,6 +368,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_X16RT_VEIL:   rc = register_x16rt_veil_algo    ( gate ); break;
    case ALGO_X16S:         rc = register_x16s_algo          ( gate ); break;
    case ALGO_X17:          rc = register_x17_algo           ( gate ); break;
    case ALGO_X20R:         rc = register_x20r_algo          ( gate ); break;
    case ALGO_X21S:         rc = register_x21s_algo          ( gate ); break;
    case ALGO_X22I:         rc = register_x22i_algo          ( gate ); break;
    case ALGO_X25X:         rc = register_x25x_algo          ( gate ); break;
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -99,7 +99,7 @@ typedef  uint32_t set_t;
 #define AES_OPT          1 <<  7   // Intel Westmere, AArch64
 #define VAES_OPT         1 <<  8   // Icelake, Zen3
 #define SHA_OPT          1 <<  9   // Zen1, Icelake, AArch64 
-#define SHA512_OPT       1 << 10   // AArch64 
+#define SHA512_OPT       1 << 10   // Intel Arrow Lake, AArch64 
 #define NEON_OPT         1 << 11   // AArch64 
 // AVX10 does not have explicit algo features:
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
   blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      blakehash_4way( hash, vdata );
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
   blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_4way_hash( hash, vdata );
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -2,7 +2,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 //#include "sph_keccak.h"
 #include "bmw-hash-4way.h"
 #if defined(BMW512_8WAY)
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
-   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   __m512i  *noncev = (__m512i*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;
+   const int thr_id = mythr->id;
   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
          {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr );
@@ -59,8 +58,6 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
 #elif defined(BMW512_4WAY)
 //#ifdef BMW512_4WAY
 void bmw512hash_4way( void *state, const void *input )
 {
    bmw512_4way_context ctx;
@@ -81,9 +78,9 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 4;
-   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   __m256i  *noncev = (__m256i*)vdata + 9; 
   const uint32_t Htarg = ptarget[7];
-    int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;  
   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
          {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
   return 0;
 }
 #elif defined(BMW512_2WAY)
 void bmw512hash_2x64( void *state, const void *input )
 {
    bmw512_2x64_context ctx;
    bmw512_2x64_init( &ctx );
    bmw512_2x64_update( &ctx, input, 80 );
    bmw512_2x64_close( &ctx, state );
 }
 int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t hash[16*2] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[13]);   // 3*4+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   v128_t *noncev = (v128_t*)vdata + 9;  
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id; 
   v128_bswap32_intrlv80_2x64( vdata, pdata );
   do {
      *noncev = v128_intrlv_blend_32( v128_bswap32(
                                      v128_set32( n+1, 0, n, 0 ) ), *noncev );
      bmw512hash_2x64( hash, vdata );
      for ( int lane = 0; lane < 2; lane++ )
      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
          extr_lane_2x64( lane_hash, hash, lane, 256 );
          if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
          {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr );
          }
      }
      n += 2;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   *hashes_done = n - first_nonce;
   return 0;
 }
 #endif
--- a/algo/bmw/bmw512-gate.c
+++ b/algo/bmw/bmw512-gate.c
@@ -2,7 +2,7 @@
 bool register_bmw512_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 256.0;
 #if defined (BMW512_8WAY)
  gate->scanhash  = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
 #elif defined (BMW512_4WAY)
  gate->scanhash  = (void*)&scanhash_bmw512_4way;
  gate->hash      = (void*)&bmw512hash_4way;
 #elif defined (BMW512_2WAY)
  gate->scanhash  = (void*)&scanhash_bmw512_2x64;
  gate->hash      = (void*)&bmw512hash_2x64;
 #else
  gate->scanhash        = (void*)&scanhash_bmw512;
  gate->hash            = (void*)&bmw512hash;
--- a/algo/bmw/bmw512-gate.h
+++ b/algo/bmw/bmw512-gate.h
@@ -8,6 +8,8 @@
  #define BMW512_8WAY 1
 #elif defined(__AVX2__)
  #define BMW512_4WAY 1
 #elif defined(__SSE2__) || defined(__ARM_NEON)
  #define BMW512_2WAY 1
 #endif
 #if defined(BMW512_8WAY)
@@ -22,6 +24,12 @@ void bmw512hash_4way( void *state, const void *input );
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 #elif defined(BMW512_2WAY)
 void bmw512hash_2x64( void *state, const void *input );
 int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 #else
 void bmw512hash( void *state, const void *input );
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -236,8 +236,6 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
 }
 HashReturn init_echo( hashState_echo *ctx, int nHashSize )
 {
 	int i, j;
@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
 	return SUCCESS;
 }
-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
+HashReturn update_echo( hashState_echo *state, const void *data,
                        uint32_t databitlen )
 {
 	unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
 	return SUCCESS;
 }
-HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
+HashReturn final_echo( hashState_echo *state, void *hashval)
 {
 	v128_t remainingbits;
@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
 	return SUCCESS;
 }
-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
-                              const BitSequence *data, DataLength databitlen )
+                              const void *data, uint32_t databitlen )
 {
   unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
   return SUCCESS;
 }
-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+HashReturn echo_full( hashState_echo *state, void *hashval,
-            int nHashSize, const BitSequence *data, DataLength datalen )
+            int nHashSize, const void *data, uint32_t datalen )
 {
   int i, j;
@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
        {
           // Fill the buffer
           memcpy( state->buffer + state->uBufferBytes,
-                   (void*)data, state->uBlockLength - state->uBufferBytes );
+                   data, state->uBlockLength - state->uBufferBytes );
           // Process buffer
           Compress( state, state->buffer, 1 );
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
        }
        if( uRemainingBytes > 0 )
-        memcpy(state->buffer, (void*)data, uRemainingBytes);
+        memcpy(state->buffer, data, uRemainingBytes);
        state->uBufferBytes = uRemainingBytes;
   }
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
 }
-
+#if 0
 HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
 {
 	HashReturn hRet;
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
 	return SUCCESS;
 }
 #endif
 #endif
--- a/algo/echo/aes_ni/hash_api.h
+++ b/algo/echo/aes_ni/hash_api.h
@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
 HashReturn reinit_echo(hashState_echo *state);
-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
+HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
-HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
+HashReturn final_echo(hashState_echo *state, void *hashval);
-HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
-                              const BitSequence *data, DataLength databitlen );
+                              const void *data, uint32_t databitlen );
-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+HashReturn echo_full( hashState_echo *state, void *hashval,
-            int nHashSize, const BitSequence *data, DataLength databitlen );
+            int nHashSize, const void *data, uint32_t databitlen );
 #endif // HASH_API_H
--- a/algo/echo/sph_echo.c
+++ b/algo/echo/sph_echo.c
@@ -36,7 +36,6 @@
 #include "sph_echo.h"
 #if !defined(__AES__)
 #ifdef __cplusplus
 extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }
 #endif 
 #endif  // !AES
--- a/algo/echo/sph_echo.h
+++ b/algo/echo/sph_echo.h
@@ -36,8 +36,6 @@
 #ifndef SPH_ECHO_H__
 #define SPH_ECHO_H__
 #if !defined(__AES__)
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
 #ifdef __cplusplus
 }
 #endif
 #endif // !AES
 #endif
--- a/algo/fugue/fugue-aesni.c
+++ b/algo/fugue/fugue-aesni.c
@@ -15,237 +15,176 @@
 *
 */
-#if defined(__AES__)
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
 #include <x86intrin.h>
 #include <memory.h>
 #include "fugue-aesni.h"
 static const v128u64_t _supermix1a	__attribute__ ((aligned (16))) =
   { 0x0202010807020100, 0x0a05000f06010c0b };
-MYALIGN const unsigned long long _supermix1a[]	= {0x0202010807020100, 0x0a05000f06010c0b};
+static const v128u64_t _supermix1b	__attribute__ ((aligned (16))) =
-MYALIGN const unsigned long long _supermix1b[]	= {0x0b0d080703060504, 0x0e0a090c050e0f0a};
+   { 0x0b0d080703060504, 0x0e0a090c050e0f0a };
 MYALIGN const unsigned long long _supermix1c[]	= {0x0402060c070d0003, 0x090a060580808080};
 MYALIGN const unsigned long long _supermix1d[]	= {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
 MYALIGN const unsigned long long _supermix2a[]	= {0x07020d0880808080, 0x0b06010c050e0f0a};
 MYALIGN const unsigned long long _supermix4a[]	= {0x000f0a050c0b0601, 0x0302020404030e09};
 MYALIGN const unsigned long long _supermix4b[]	= {0x07020d08080e0d0d, 0x07070908050e0f0a};
 MYALIGN const unsigned long long _supermix4c[]	= {0x0706050403020000, 0x0302000007060504};
 MYALIGN const unsigned long long _supermix7a[]	= {0x010c0b060d080702, 0x0904030e03000104};
 MYALIGN const unsigned long long _supermix7b[]	= {0x8080808080808080, 0x0504070605040f06};
 //MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
 //MYALIGN const unsigned char _shift_one_mask[]   = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
 //MYALIGN const unsigned char _shift_four_mask[]  = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
 //MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
 //MYALIGN const unsigned char _aes_shift_rows[]   = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
 MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
 MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
 MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
 MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
 static const v128u64_t _supermix1c	__attribute__ ((aligned (16))) =
   { 0x0402060c070d0003, 0x090a060580808080 };
-MYALIGN const unsigned int _IV512[] = {		
+static const v128u64_t _supermix1d	__attribute__ ((aligned (16))) =
-	0x00000000, 0x00000000,	0x7ea50788, 0x00000000,
+   { 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
 static const v128u64_t _supermix2a	__attribute__ ((aligned (16))) =
   { 0x07020d0880808080, 0x0b06010c050e0f0a };
 static const v128u64_t _supermix4a	__attribute__ ((aligned (16))) =
   { 0x000f0a050c0b0601, 0x0302020404030e09 };
 static const v128u64_t _supermix4b	__attribute__ ((aligned (16))) =
   { 0x07020d08080e0d0d, 0x07070908050e0f0a };
 static const v128u64_t _supermix4c	__attribute__ ((aligned (16))) =
   { 0x0706050403020000, 0x0302000007060504 };
 static const v128u64_t _supermix7a	__attribute__ ((aligned (16))) =
   { 0x010c0b060d080702, 0x0904030e03000104 };
 static const v128u64_t _supermix7b	__attribute__ ((aligned (16))) =
   { 0x8080808080808080, 0x0504070605040f06 };
 static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
   { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
 static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
   { 0x000000001b1b0000, 0x0000000000000000 };
 static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
   { 0x000000002d361b00, 0x0000000000000000 };
 static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
   { 0x0303030303030303, 0x0303030303030303 };
 static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
 {	0x00000000, 0x00000000,	0x7ea50788, 0x00000000,
 	0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
 	0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
 	0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
 	0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
-	0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
+	0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
 };
-#if defined(__SSE4_1__)
+#if defined(__ARM_NEON)
-#define PACK_S0(s0, s1, t1)\
+#define mask_1000(v)         v128_put32( v, 0, 3 )
   s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
-#define UNPACK_S0(s0, s1, t1)\
+static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
-   s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
-   s0 = mm128_mask_32( s0, 8 )
+
 static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
   { 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
 static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
   { 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
 static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
 #define shuffle_3303(v)      vqtbl1q_u8( v, MASK_3303 )
 #define shuffle_0321(v)      vqtbl1q_u8( v, MASK_0321 )
 #define CMIX( s1, s2, r1, r2, t1, t2 ) \
   t1 = vqtbl1q_u8( s1, MASK_3321 ); \
   t2 = vqtbl1q_u8( s2, MASK_3033 ); \
   t1 = v128_xor( t1, t2 ); \
   r1 = v128_xor( r1, t1 ); \
   r2 = v128_xor( r2, t1 );
 #elif defined(__SSE4_1__)
 #define mask_1000(v)         v128_mask32( v, 8 )
 #define shuffle_3303(v)      _mm_shuffle_epi32( v, 0xf3 )
 #define shuffle_0321(v)      _mm_shuffle_epi32( v, 0x39 )
 #define CMIX( s1, s2, r1, r2, t1, t2 ) \
   t1 = s1; \
-   t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
+   t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
-   r1 = _mm_xor_si128(r1, t1);\
+   r1 = v128_xor( r1, t1 ); \
-   r2 = _mm_xor_si128(r2, t1);
+   r2 = v128_xor( r2, t1 );
 #else   // SSE2
 #define PACK_S0(s0, s1, t1)\
   t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
   s0 = _mm_xor_si128(s0, t1);
 #define UNPACK_S0(s0, s1, t1)\
   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
   s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
   s0 = mm128_mask_32( s0, 8 )
 #define CMIX(s1, s2, r1, r2, t1, t2)\
   t1 = _mm_shuffle_epi32(s1, 0xf9);\
   t2 = _mm_shuffle_epi32(s2, 0xcf);\
   t1 = _mm_xor_si128(t1, t2);\
   r1 = _mm_xor_si128(r1, t1);\
   r2 = _mm_xor_si128(r2, t1)
 #endif
-#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
+#define PACK_S0( s0, s1, t1 ) \
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
+ s0 = v128_movlane32( s0, 3, s1, 0 )
 	s10 = _mm_xor_si128(s10, t1);\
 	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
 	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
 	t1 = _mm_slli_si128(t1, 8);\
 	s8 = _mm_xor_si128(s8, t1);\
 	t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
 	s0 = _mm_xor_si128(s0, t1)
-
+#define UNPACK_S0( s0, s1, t1 ) \
-#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
+   s1 = v128_movlane32( s1, 0, s0, 3 ); \
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
+   s0 = mask_1000( s0 )
 	s16 = _mm_xor_si128(s16, t1);\
 	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
 	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
 	t1 = _mm_slli_si128(t1, 8);\
 	s8 = _mm_xor_si128(s8, t1);\
 	t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
 	s0 = _mm_xor_si128(s0, t1);\
 	t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
 	s4 = _mm_xor_si128(s4, t1)
 #define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
+	t1 = shuffle_3303( s0 ); \
-	s22 = _mm_xor_si128(s22, t1);\
+	s22 = v128_xor(s22, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
+	t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
+	s0 = v128_movlane32( s0, 0, t1, 0 ); \
-	t1 = _mm_slli_si128(t1, 8);\
+	t1 = v128_alignr64( t1, v128_zero, 1 ); \
-	s8 = _mm_xor_si128(s8, t1);\
+	s8 = v128_xor(s8, t1);\
-	t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
+	t1 = shuffle_3303( s24 ); \
-	s0 = _mm_xor_si128(s0, t1);\
+	s0 = v128_xor(s0, t1);\
-	t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
+	t1 = shuffle_3303( s27 ); \
-	s4 = _mm_xor_si128(s4, t1);\
+	s4 = v128_xor(s4, t1);\
-	t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
+	t1 = shuffle_3303( s30 ); \
-	s7 = _mm_xor_si128(s7, t1)
+	s7 = v128_xor(s7, t1)
 #define PRESUPERMIX(t0, t1, t2, t3, t4)\
   t2 = t0;\
   t3 = _mm_add_epi8(t0, t0);\
   t4 = _mm_add_epi8(t3, t3);\
   t1 = _mm_srli_epi16(t0, 6);\
   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
 /*
 #define PRESUPERMIX(x, t1, s1, s2, t2)\
 	s1 = x;\
 	s2 = _mm_add_epi8(x, x);\
 	t2 = _mm_add_epi8(s2, s2);\
 	t1 = _mm_srli_epi16(x, 6);\
 	t1 = _mm_and_si128(t1, M128(_lsbmask2));\
 	s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
 	x  = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
 */
 #define SUBSTITUTE( r0, _t2 ) \
-	_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
+	_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
-	_t2 = _mm_aesenclast_si128( _t2, v128_zero )
+	_t2 = v128_aesenclast_nokey( _t2 )
 #define SUPERMIX(t0, t1, t2, t3, t4)\
   t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
+   t3 = v128_add8( t0, t0 ); \
-   t4 = _mm_add_epi8(t3, t3);\
+   t4 = v128_add8( t3, t3 ); \
-   t1 = _mm_srli_epi16(t0, 6);\
+   t1 = v128_sr16( t0, 6 ); \
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
+   t1 = v128_and( t1, _lsbmask2 ); \
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
+   t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
-   t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
+   t4 = v128_shuffle8( t2, _supermix1b ); \
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
+   t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
+   t1 = v128_shuffle8( t4, _supermix1c ); \
-   t4 = _mm_xor_si128(t4, t1);\
+   t4 = v128_xor( t4, t1 ); \
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
+   t1 = v128_shuffle8( t4, _supermix1d ); \
-   t4 = _mm_xor_si128(t4, t1);\
+   t4 = v128_xor( t4, t1 ); \
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
+   t1 = v128_shuffle8( t2, _supermix1a ); \
   t2 = v128_xor3( t2, t3, t0 ); \
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
+   t2 = v128_shuffle8( t2, _supermix7a ); \
   t4 = v128_xor3( t4, t1, t2 ); \
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
+   t2 = v128_shuffle8( t2, _supermix7b ); \
-   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
+   t3 = v128_shuffle8( t3, _supermix2a ); \
-   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
+   t1 = v128_shuffle8( t0, _supermix4a ); \
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
+   t0 = v128_shuffle8( t0, _supermix4b ); \
   t4 = v128_xor3( t4, t2, t1 ); \
-   t0 = _mm_xor_si128(t0, t3);\
+   t0 = v128_xor( t0, t3 ); \
-   t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
+   t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
 /*
 #define SUPERMIX(t0, t1, t2, t3, t4)\
 	PRESUPERMIX(t0, t1, t2, t3, t4);\
 	POSTSUPERMIX(t0, t1, t2, t3, t4)
 */
 #define POSTSUPERMIX(t0, t1, t2, t3, t4)\
 	t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
 	t4 = t1;\
 	t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
 	t4 = _mm_xor_si128(t4, t1);\
 	t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
 	t4 = _mm_xor_si128(t4, t1);\
 	t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
 	t4 = _mm_xor_si128(t4, t1);\
 	t2 = v128_xor3(t2, t3, t0 );\
 	t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
 	t4 = _mm_xor_si128(t4, t2);\
 	t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
 	t4 = _mm_xor_si128(t4, t2);\
 	t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
 	t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
 	t4 = _mm_xor_si128(t4, t1);\
 	t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
 	t0 = _mm_xor_si128(t0, t3);\
 	t4 = _mm_xor_si128(t4, t0);\
 	t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
 	t4 = _mm_xor_si128(t4, t0)
 #define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
 	CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
 	PACK_S0(r1c, r1a, _t0);\
 	SUBSTITUTE(r1c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
 	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
 	r2c = _mm_xor_si128(r2c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
 	r2d = _mm_xor_si128(r2d, _t0);\
 	UNPACK_S0(r1c, r1a, _t3);\
 	SUBSTITUTE(r2c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
 	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
 	r3c = _mm_xor_si128(r3c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
 	r3d = _mm_xor_si128(r3d, _t0);\
 	UNPACK_S0(r2c, r2a, _t3);\
 	SUBSTITUTE(r3c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
 	UNPACK_S0(r3c, r3a, _t3)
 #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
 	CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
 	PACK_S0(r1c, r1a, _t0);\
 	SUBSTITUTE( r1c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
+	_t0 = shuffle_0321( r1c ); \
-	r2c = _mm_xor_si128(r2c, _t0);\
+	r2c = v128_xor(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
+   _t0 = mask_1000( _t0 ); \
-	r2d = _mm_xor_si128(r2d, _t0);\
+	r2d = v128_xor(r2d, _t0);\
 	UNPACK_S0(r1c, r1a, _t3);\
 	SUBSTITUTE(r2c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
+	_t0 = shuffle_0321( r2c ); \
-	r3c = _mm_xor_si128(r3c, _t0);\
+	r3c = v128_xor(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
+   _t0 = mask_1000( _t0 ); \
-	r3d = _mm_xor_si128(r3d, _t0);\
+	r3d = v128_xor(r3d, _t0);\
 	UNPACK_S0(r2c, r2a, _t3);\
 	SUBSTITUTE( r3c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-	_t0 = _mm_shuffle_epi32(r3c, 0x39);\
+	_t0 = shuffle_0321( r3c ); \
-	r4c = _mm_xor_si128(r4c, _t0);\
+	r4c = v128_xor(r4c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
+   _t0 = mask_1000( _t0 ); \
-	r4d = _mm_xor_si128(r4d, _t0);\
+	r4d = v128_xor(r4d, _t0);\
 	UNPACK_S0(r3c, r3a, _t3);\
 	SUBSTITUTE( r4c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
 	block[1] = col[(base + a + 1) % s];\
 	block[2] = col[(base + a + 2) % s];\
 	block[3] = col[(base + a + 3) % s];\
-	x = _mm_load_si128((__m128i*)block)
+	x = v128_load( (v128_t*)block )
 #define STORECOLUMN(x, s)\
-	_mm_store_si128((__m128i*)block, x);\
+	v128_store((v128_t*)block, x );\
 	col[(base + 0) % s] = block[0];\
 	col[(base + 1) % s] = block[1];\
 	col[(base + 2) % s] = block[2];\
 	col[(base + 3) % s] = block[3]
-void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
+void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
                  unsigned int uBlockCount )
 {
-   __m128i _t0, _t1, _t2, _t3;
+   v128_t _t0, _t1, _t2, _t3;
   switch(ctx->base)
   {
@@ -346,19 +286,18 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
      pmsg += 4;
      uBlockCount--;
   }
 }
-void Final512(hashState_fugue *ctx, BitSequence *hashval)
+void Final512( hashState_fugue *ctx, uint8_t *hashval )
 {
   unsigned int block[4] __attribute__ ((aligned (32)));
   unsigned int col[36] __attribute__ ((aligned (16)));
 	unsigned int i, base;
-	__m128i r0, _t0, _t1, _t2, _t3;
+	v128_t r0, _t0, _t1, _t2, _t3;
 	for( i = 0; i < 12; i++ )
 	{
-		_mm_store_si128((__m128i*)block, ctx->state[i]);
+		v128_store( (v128_t*)block, ctx->state[i] );
 		col[3 * i + 0] = block[0];
 		col[3 * i + 1] = block[1];
@@ -458,22 +397,22 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
 	// Transform to the standard basis and store output; S1 || S2 || S3 || S4
 	LOADCOLUMN( r0, 36, 1 );
-	_mm_store_si128((__m128i*)hashval, r0);
+	v128_store( (v128_t*)hashval, r0 );
 	// Transform to the standard basis and store output; S9 || S10 || S11 || S12
 	LOADCOLUMN( r0, 36, 9 );
-	_mm_store_si128((__m128i*)hashval + 1, r0);
+	v128_store( (v128_t*)hashval + 1, r0 );
 	// Transform to the standard basis and store output; S18 || S19 || S20 || S21
 	LOADCOLUMN( r0, 36, 18 );
-	_mm_store_si128((__m128i*)hashval + 2, r0);
+	v128_store( (v128_t*)hashval + 2, r0 );
 	// Transform to the standard basis and store output; S27 || S28 || S29 || S30
 	LOADCOLUMN( r0, 36, 27 );
-	_mm_store_si128((__m128i*)hashval + 3, r0);
+	v128_store( (v128_t*)hashval + 3, r0 );
 }
-HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
+int fugue512_Init( hashState_fugue *ctx, int nHashSize )
 {
 	int i;
 	ctx->processed_bits = 0;
@@ -487,18 +426,18 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
 	for(i = 0; i < 6; i++)
 		ctx->state[i] = v128_zero;
-	ctx->state[6]  = _mm_load_si128((__m128i*)_IV512 + 0);
+	ctx->state[6]  = casti_v128( _IV512, 0 );
-	ctx->state[7]  = _mm_load_si128((__m128i*)_IV512 + 1);
+	ctx->state[7]  = casti_v128( _IV512, 1 );
-	ctx->state[8]  = _mm_load_si128((__m128i*)_IV512 + 2);
+	ctx->state[8]  = casti_v128( _IV512, 2 );
-	ctx->state[9]  = _mm_load_si128((__m128i*)_IV512 + 3);
+	ctx->state[9]  = casti_v128( _IV512, 3 );
-	ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
+	ctx->state[10] = casti_v128( _IV512, 4 );
-	ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
+	ctx->state[11] = casti_v128( _IV512, 5 );
-	return SUCCESS;
+	return 0;
 }
-
+int fugue512_Update( hashState_fugue *state, const void *data,
-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
+                            uint64_t databitlen )
 {
 	unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
 		if(state->uBufferBytes != 0)
 		{
 			// Fill the buffer
-			memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
+			memcpy( state->buffer + state->uBufferBytes, (void*)data,
                 state->uBlockLength - state->uBufferBytes );
 			// Process the buffer
 			Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
 		state->uBufferBytes += uByteLength;
 	}
-	return SUCCESS;
+	return 0;
 }
-HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
+int fugue512_Final( hashState_fugue *state, void *hashval )
 {
 	unsigned int i;
-	BitSequence lengthbuf[8] __attribute__((aligned(64)));
+	uint8_t lengthbuf[8] __attribute__((aligned(64)));
 	// Update message bit count
 	state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
 	// Finalization
 	Final512(state, hashval);
-	return SUCCESS;
+	return 0;
 }
-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
                   uint64_t databitlen )
 {
 	fugue512_Init( hs, 512 );
 	fugue512_Update( hs, data, databitlen*8 );
 	fugue512_Final( hs, hashval );
-	return SUCCESS;
+	return 0;
 }
 #endif  // AES
--- a/algo/fugue/fugue-aesni.h
+++ b/algo/fugue/fugue-aesni.h
@@ -14,37 +14,31 @@
 #ifndef FUGUE_HASH_API_H
 #define FUGUE_HASH_API_H
-#if defined(__AES__) 
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
 #if !defined(__SSE4_1__)
 #error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
 #endif
 #include "compat/sha3_common.h"
 #include "simd-utils.h"
 typedef struct
 {
-	__m128i			state[12];
+	v128_t			state[12];
 	unsigned int	base;
 	unsigned int	uHashSize;
 	unsigned int	uBlockLength;
 	unsigned int	uBufferBytes;
-	DataLength		processed_bits;
+	uint64_t 		processed_bits;
-	BitSequence		buffer[4];
+	uint8_t  		buffer[4];
 } hashState_fugue __attribute__ ((aligned (64)));
 // These functions are deprecated, use the lower case macro aliases that use
 // the standard interface. This will be cleaned up at a later date.
-HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
+int fugue512_Init( hashState_fugue *state, int hashbitlen );
-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
+int fugue512_Update( hashState_fugue *state, const void *data,
                     uint64_t databitlen );
-HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
+int fugue512_Final( hashState_fugue *state, void *hashval );
 #define fugue512_init( state ) \
   fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
   fugue512_Final
-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
                   uint64_t databitlen);
 #endif // AES
 #endif // HASH_API_H
--- a/algo/gost/sph_gost.c
+++ b/algo/gost/sph_gost.c
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
                                           casti_m256i( b, 0 ) );
   casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
                                           casti_m256i( b, 1 ) );
-#elif defined(__SSE2__)
+#elif defined(__SSE2__) || defined(__ARM_NEON)
-   casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
+   casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
-                                        casti_m128i( b, 0 ) );
+                                  casti_v128( b, 0 ) );
-   casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
+   casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
-                                        casti_m128i( b, 1 ) );
+                                  casti_v128( b, 1 ) );
-   casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
+   casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
-                                        casti_m128i( b, 2 ) );
+                                  casti_v128( b, 2 ) );
-   casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
+   casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
-                                        casti_m128i( b, 3 ) );
+                                  casti_v128( b, 3 ) );
 #else
   const unsigned long long *A=a, *B=b;
 	unsigned long long *C=c;
--- a/algo/groestl/aes_ni/groestl-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl-intr-aes.h
@@ -60,46 +60,10 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
 #if defined(__ARM_NEON)
-// No fast shuffle on NEON
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
-//static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };  
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
 static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
-#define gr_shuffle32( v )      v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
+#define gr_shuffle32(v)       vqtbl1q_u8( v, gr_mask ) 
 /*
 #define TRANSP_MASK \
     0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
 #define SUBSH_MASK0 \
     0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
 #define SUBSH_MASK1 \
     0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
 #define SUBSH_MASK2 \
     0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
 #define SUBSH_MASK3 \
     0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
 #define SUBSH_MASK4  \
     0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
 #define SUBSH_MASK5 \
     0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
 #define SUBSH_MASK6 \
     0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
 #define SUBSH_MASK7 \
     0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
 //#define gr_shuffle8( v, c )    v128_shullfev8( v, c )
 #define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
                        c07, c06, c05, c04, c03, c02, c01, c00 ) \
  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
  v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
    v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
       11, v, c11 ), 10, v, c10 ),  9, v, c09 ),  8, v, c08 ), \
        7, v, c07 ),  6, v, c06 ),  5, v, c05 ),  4, v, c04 ), \
        3, v, c03 ),  2, v, c02 ),  1, v, c01 ),  0, v, c00 )
 */
 #else
@@ -107,7 +71,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
 #endif
 #define tos(a)    #a
 #define tostr(a)  tos(a)
@@ -334,15 +297,14 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
 */
 #define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* SubBytes */\
-  b0 = v128_xor(b0, b0);\
+  a0 = v128_aesenclast_nokey( a0 ); \
-  a0 = v128_aesenclast(a0, b0);\
+  a1 = v128_aesenclast_nokey( a1 ); \
-  a1 = v128_aesenclast(a1, b0);\
+  a2 = v128_aesenclast_nokey( a2 ); \
-  a2 = v128_aesenclast(a2, b0);\
+  a3 = v128_aesenclast_nokey( a3 ); \
-  a3 = v128_aesenclast(a3, b0);\
+  a4 = v128_aesenclast_nokey( a4 ); \
-  a4 = v128_aesenclast(a4, b0);\
+  a5 = v128_aesenclast_nokey( a5 ); \
-  a5 = v128_aesenclast(a5, b0);\
+  a6 = v128_aesenclast_nokey( a6 ); \
-  a6 = v128_aesenclast(a6, b0);\
+  a7 = v128_aesenclast_nokey( a7 ); \
  a7 = v128_aesenclast(a7, b0);\
  /* MixBytes */\
  MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
 }
@@ -365,7 +327,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
     /* SubBytes + MixBytes */\
    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7 ); \
    \
    /* AddRoundConstant P1024 */\
    xmm0 = v128_xor( xmm0, \
             casti_v128( round_const_p, round_counter+1 ) ); \
@@ -467,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
  t1 = v128_unpackhi16(t1, i3);\
  i2 = v128_unpacklo16(i2, i3);\
  i0 = v128_unpacklo16(i0, i1);\
 \
  /* shuffle with immediate */\
  t0 = gr_shuffle32( t0 ); \
  t1 = gr_shuffle32( t1 ); \
@@ -477,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
  i2 = gr_shuffle32( i2 ); \
  i4 = gr_shuffle32( i4 ); \
  i6 = gr_shuffle32( i6 ); \
 \
  /* continue with unpack */\
  t4 = i0;\
  i0 = v128_unpacklo32(i0, i2);\
@@ -584,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
  /* transpose done */\
 }/**/
-
+#if 0
 // not used
 void INIT( v128_t* chaining )
 {
  static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -613,6 +573,7 @@ void INIT( v128_t* chaining )
  chaining[6] = xmm14;
  chaining[7] = xmm15;
 }
 #endif
 void TF1024( v128_t* chaining, const v128_t* message )
 {
--- a/algo/groestl/aes_ni/groestl256-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl256-intr-aes.h
@@ -1,3 +1,6 @@
 #if !defined GROESTL256_INTR_AES_H__
 #define GROESTL256_INTR_AES_H__
 /* groestl-intr-aes.h     Aug 2011
 *
 * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,10 +53,10 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
 #if defined(__ARM_NEON)
-// No fast shuffle on NEON
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
-static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
-#define gr_shuffle32( v )       v128_shufflev32( v, vmask_d8 )
+#define gr_shuffle32(v)       vqtbl1q_u8( v, gr_mask ) 
 #else
@@ -61,7 +64,6 @@ static const uint32x4_t vmask_d8 = {  3, 1, 2, 0 };
 #endif
 #define tos(a)    #a
 #define tostr(a)  tos(a)
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
  chaining[3] = xmm11;
 }
-
+#endif
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );
   OF1024( ctx->chaining );
   // store hash result in output 
--- a/algo/groestl/aes_ni/hash-groestl.h
+++ b/algo/groestl/aes_ni/hash-groestl.h
@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
 int update_and_final_groestl( hashState_groestl*,  void*, const void*, int );
 int groestl512( hashState_groestl*,  void*, const void*, uint64_t );
 #define groestl512_full   groestl512
 #define groestl512_ctx    groestl512
 #endif /* __hash_h */
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
+  b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
  a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
  a1 = _mm256_xor_si256( a1, b1 );\
  a2 = _mm256_xor_si256( a2, b1 );\
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
   v128_bswap32_intrlv80_4x32( vdata, pdata );
   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
      myriad_4way_hash( hash, vdata );
      pdata[19] = n;
--- a/algo/groestl/sph_groestl.c
+++ b/algo/groestl/sph_groestl.c
@@ -35,8 +35,6 @@
 #include "sph_groestl.h"
 #if !defined(__AES__)
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }
 #endif  // !AES
 #endif
--- a/algo/groestl/sph_groestl.h
+++ b/algo/groestl/sph_groestl.h
@@ -42,7 +42,6 @@ extern "C"{
 #include <stddef.h>
 #include "compat/sph_types.h"
 #if !defined(__AES__)   
 /**
 * Output size (in bits) for Groestl-224.
 */
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
 }
 #endif
 #endif  // !AES
 #endif
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   return 0;
 }
 #elif defined(KECCAK_2WAY)
 void keccakhash_2x64(void *state, const void *input)
 {
    keccak256_2x64_context ctx;
    keccak256_2x64_init( &ctx );
    keccak256_2x64_update( &ctx, input, 80 );
    keccak256_2x64_close( &ctx, state );
 }
 int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t hash[16*2] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[13]);   // 3*4+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   v128_t *noncev = (v128_t*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   v128_bswap32_intrlv80_2x64( vdata, pdata );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do {
      keccakhash_2x64( hash, vdata );
      for ( int lane = 0; lane < 2; lane++ )
      if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
      {
          extr_lane_2x64( lane_hash, hash, lane, 256 );
          if ( valid_hash( lane_hash, ptarget ))
          {
              pdata[19] = bswap_32( n + lane );
              submit_solution( work, lane_hash, mythr );
          }
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
   pdata[19] = n;
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
 #endif
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
 #elif defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
 #elif defined (KECCAK_2WAY)
  gate->scanhash  = (void*)&scanhash_keccak_2x64;
  gate->hash      = (void*)&keccakhash_2x64;
 #else
  gate->scanhash  = (void*)&scanhash_keccak;
  gate->hash      = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
 #elif defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
 #elif defined (KECCAK_2WAY)
  gate->scanhash  = (void*)&scanhash_keccak_2x64;
  gate->hash      = (void*)&keccakhash_2x64;
 #else
  gate->scanhash  = (void*)&scanhash_keccak;
  gate->hash      = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
 bool register_sha3d_algo( algo_gate_t* gate )
 {
  hard_coded_eb = 6;
-//  opt_extranonce = false;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  gate->optimizations = AVX2_OPT | AVX512_OPT;
  gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
-#if defined (KECCAK_8WAY)
+#if defined (SHA3D_8WAY)
  gate->scanhash  = (void*)&scanhash_sha3d_8way;
  gate->hash      = (void*)&sha3d_hash_8way;
-#elif defined (KECCAK_4WAY)
+#elif defined (SHA3D_4WAY)
  gate->scanhash  = (void*)&scanhash_sha3d_4way;
  gate->hash      = (void*)&sha3d_hash_4way;
 #elif defined (SHA3D_2WAY)
  gate->scanhash  = (void*)&scanhash_sha3d_2x64;
  gate->hash      = (void*)&sha3d_hash_2x64;
 #else
  gate->scanhash  = (void*)&scanhash_sha3d;
  gate->hash      = (void*)&sha3d_hash;
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -8,6 +8,16 @@
  #define KECCAK_8WAY 1
 #elif defined(__AVX2__)
  #define KECCAK_4WAY 1
 #elif defined(__SSE2__) || defined(__ARM_NEON)
  #define KECCAK_2WAY 1
 #endif
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SHA3D_8WAY 1
 #elif defined(__AVX2__)
  #define SHA3D_4WAY 1
 #elif defined(__SSE2__) || defined(__ARM_NEON)
  #define SHA3D_2WAY 1
 #endif
 extern int hard_coded_eb;
@@ -18,18 +28,16 @@ void keccakhash_8way( void *state, const void *input );
 int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 void sha3d_hash_8way( void *state, const void *input );
 int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
 #elif defined(KECCAK_4WAY)
 void keccakhash_4way( void *state, const void *input );
 int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-void sha3d_hash_4way( void *state, const void *input );
+#elif defined(KECCAK_2WAY)
-int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
+
 void keccakhash_2x64( void *state, const void *input );
 int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 #else
@@ -38,6 +46,28 @@ void keccakhash( void *state, const void *input );
 int scanhash_keccak( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 #if defined(SHA3D_8WAY)
 void sha3d_hash_8way( void *state, const void *input );
 int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
 #elif defined(SHA3D_4WAY)
 void sha3d_hash_4way( void *state, const void *input );
 int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
 #elif defined(SHA3D_2WAY)
 void sha3d_hash_2x64( void *state, const void *input );
 int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
 #else
 void sha3d_hash( void *state, const void *input );
 int scanhash_sha3d( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
--- a/algo/keccak/sha3d-4way.c
+++ b/algo/keccak/sha3d-4way.c
@@ -4,7 +4,7 @@
 #include <stdint.h>
 #include "keccak-hash-4way.h"
-#if defined(KECCAK_8WAY)
+#if defined(SHA3D_8WAY)
 void sha3d_hash_8way(void *state, const void *input)
 {
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
   return 0;
 }
-#elif defined(KECCAK_4WAY)
+#elif defined(SHA3D_4WAY)
 void sha3d_hash_4way(void *state, const void *input)
 {
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
   return 0;
 }
 #elif defined(SHA3D_2WAY)
 void sha3d_hash_2x64(void *state, const void *input)
 {
    uint32_t buffer[16*4] __attribute__ ((aligned (64)));
    keccak256_2x64_context ctx;
    keccak256_2x64_init( &ctx );
    keccak256_2x64_update( &ctx, input, 80 );
    keccak256_2x64_close( &ctx, buffer );
    keccak256_2x64_init( &ctx );
    keccak256_2x64_update( &ctx, buffer, 32 );
    keccak256_2x64_close( &ctx, state );
 }
 int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t hash[16*2] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[13]);   // 3*4+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   v128_t *noncev = (v128_t*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   v128_bswap32_intrlv80_2x64( vdata, pdata );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do {
      sha3d_hash_2x64( hash, vdata );
      for ( int lane = 0; lane < 2; lane++ )
      if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
      {
          extr_lane_2x64( lane_hash, hash, lane, 256 );
          if ( valid_hash( lane_hash, ptarget ) )
          {
              pdata[19] = bswap_32( n + lane );
              submit_solution( work, lane_hash, mythr );
          }
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
 #endif
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -465,12 +465,8 @@ typedef union
 {
   keccak256_2x64_context    keccak;
   cubehashParam             cube;
 //#if defined(__x86_64__)
   skein256_2x64_context     skein;
-//#else
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
 //   sph_skein512_context      skein;
 //#endif
 #if defined(__AES__) // || defined(__ARM_FEATURE_AES)
   hashState_groestl256      groestl;
 #else
   sph_groestl256_context     groestl;
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
 //#if defined(__x86_64__)
   intrlv_2x64( vhashA, hash0, hash1, 256 );
   skein256_2x64_init( &ctx.skein );
   skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
   skein256_2x64_update( &ctx.skein, vhashA, 32 );
   skein256_2x64_close( &ctx.skein, vhashA );
   dintrlv_2x64( hash2, hash3, vhashA, 256 );
-/*
+
-#else
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    sph_skein256_init( &ctx.skein );
    sph_skein256( &ctx.skein, hash0, 32 );
    sph_skein256_close( &ctx.skein, hash0 );
    sph_skein256_init( &ctx.skein );
    sph_skein256( &ctx.skein, hash1, 32 );
    sph_skein256_close( &ctx.skein, hash1 );
    sph_skein256_init( &ctx.skein );
    sph_skein256( &ctx.skein, hash2, 32 );
    sph_skein256_close( &ctx.skein, hash2 );
    sph_skein256_init( &ctx.skein );
    sph_skein256( &ctx.skein, hash3, 32 );
    sph_skein256_close( &ctx.skein, hash3 );
 #endif
 */
 #if defined(__AES__) // || defined(__ARM_FEATURE_AES)
   groestl256_full( &ctx.groestl, hash0, hash0, 256 );
   groestl256_full( &ctx.groestl, hash1, hash1, 256 );
   groestl256_full( &ctx.groestl, hash2, hash2, 256 );
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
   lyra2h_4way_midstate( vdata );
   do {
-     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+     *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      lyra2h_4way_hash( hash, vdata );
      for ( int i = 0; i < 4; i++ )
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
   do
   {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      lyra2rev2_4way_hash( hash, vdata );
--- a/algo/quark/hmq1725-gate.c
+++ b/algo/quark/hmq1725-gate.c
@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_hmq1725_4way;
  gate->hash      = (void*)&hmq1725_4way_hash;
 #else
  init_hmq1725_ctx();
  gate->scanhash  = (void*)&scanhash_hmq1725;
  gate->hash      = (void*)&hmq1725hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT 
                      | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
 };
--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
 void hmq1725hash( void *state, const void *input );
 int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
 void init_hmq1725_ctx();
 #endif
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -4,336 +4,254 @@
 #include <string.h>
 #include <stdint.h>
-#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/fugue/fugue-aesni.h"
 #else
  #include "algo/fugue/sph_fugue.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #else
  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/sph-haval.h"
 #include "algo/sha/sph_sha2.h"
 #if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/fugue/fugue-aesni.h"
 #else
  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
  #include "algo/fugue/sph_fugue.h"
 #endif
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
-typedef struct {
+union _hmq1725_ctx_holder
-  sph_blake512_context    blake1, blake2;
+{
-  sph_bmw512_context      bmw1, bmw2, bmw3;
+   blake512_context        blake;
-  sph_skein512_context    skein1, skein2;
+   sph_bmw512_context      bmw;
-  sph_jh512_context       jh1, jh2;
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-  sph_keccak512_context   keccak1, keccak2;
+   hashState_fugue         fugue;
-  hashState_luffa         luffa1, luffa2;
+#else
   sph_fugue512_context    fugue;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_groestl       groestl;
   hashState_echo          echo;
 #else
   sph_groestl512_context  groestl;
   sph_echo512_context     echo;
 #endif
   sph_skein512_context    skein;
   sph_jh512_context       jh;
   sph_keccak512_context   keccak;
   hashState_luffa         luffa;
   cubehashParam           cube;
-  sph_shavite512_context  shavite1, shavite2;
+   sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
-  sph_simd512_context     simd1, simd2;
+   sph_hamsi512_context    hamsi;
-#else
+   sph_shabal512_context   shabal;
-  hashState_sd            simd1, simd2;
+   sph_whirlpool_context   whirlpool;
-#endif
+   sph_sha512_context      sha;
-  sph_hamsi512_context    hamsi1;
+   sph_haval256_5_context  haval;
-  sph_shabal512_context   shabal1;
+};
-  sph_whirlpool_context   whirlpool1, whirlpool2, whirlpool3, whirlpool4;
+typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
  sph_sha512_context      sha1, sha2;
  sph_haval256_5_context  haval1, haval2;
 #if defined(__AES__)
  hashState_echo          echo1, echo2;
  hashState_groestl       groestl1, groestl2;
  hashState_fugue         fugue1, fugue2;
 #else
  sph_groestl512_context  groestl1, groestl2;
  sph_echo512_context     echo1, echo2;
  sph_fugue512_context    fugue1, fugue2;
 #endif
 } hmq1725_ctx_holder;
 static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
 static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
 void init_hmq1725_ctx()
 {
    sph_blake512_init(&hmq1725_ctx.blake1);
    sph_blake512_init(&hmq1725_ctx.blake2);
    sph_bmw512_init(&hmq1725_ctx.bmw1);
    sph_bmw512_init(&hmq1725_ctx.bmw2);
    sph_bmw512_init(&hmq1725_ctx.bmw3);
    sph_skein512_init(&hmq1725_ctx.skein1);
    sph_skein512_init(&hmq1725_ctx.skein2);
    sph_jh512_init(&hmq1725_ctx.jh1);
    sph_jh512_init(&hmq1725_ctx.jh2);
    sph_keccak512_init(&hmq1725_ctx.keccak1);
    sph_keccak512_init(&hmq1725_ctx.keccak2);
    init_luffa( &hmq1725_ctx.luffa1, 512 );
    init_luffa( &hmq1725_ctx.luffa2, 512 );
    cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
    sph_shavite512_init(&hmq1725_ctx.shavite1);
    sph_shavite512_init(&hmq1725_ctx.shavite2);
 #if defined(__aarch64__)
    sph_simd512_init(&hmq1725_ctx.simd1);
    sph_simd512_init(&hmq1725_ctx.simd2);
 #else    
    init_sd( &hmq1725_ctx.simd1, 512 );
    init_sd( &hmq1725_ctx.simd2, 512 );
 #endif
    sph_hamsi512_init(&hmq1725_ctx.hamsi1);
 #if defined(__AES__)
    fugue512_Init( &hmq1725_ctx.fugue1, 512 );
    fugue512_Init( &hmq1725_ctx.fugue2, 512 );
 #else
    sph_fugue512_init(&hmq1725_ctx.fugue1);
    sph_fugue512_init(&hmq1725_ctx.fugue2);
 #endif
    sph_shabal512_init(&hmq1725_ctx.shabal1);
    sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
    sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
    sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
    sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
    sph_sha512_init( &hmq1725_ctx.sha1 );
    sph_sha512_init( &hmq1725_ctx.sha2 );
    sph_haval256_5_init(&hmq1725_ctx.haval1);
    sph_haval256_5_init(&hmq1725_ctx.haval2);
 #if defined(__AES__)
     init_echo( &hmq1725_ctx.echo1, 512 );
     init_echo( &hmq1725_ctx.echo2, 512 );
     init_groestl( &hmq1725_ctx.groestl1, 64 );
     init_groestl( &hmq1725_ctx.groestl2, 64 );
 #else
     sph_groestl512_init( &hmq1725_ctx.groestl1 );
     sph_groestl512_init( &hmq1725_ctx.groestl2 );
     sph_echo512_init( &hmq1725_ctx.echo1 );
     sph_echo512_init( &hmq1725_ctx.echo2 );
 #endif
 }
 void hmq_bmw512_midstate( const void* input )
 {
    memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
    sph_bmw512( &hmq_bmw_mid, input, 64 );
 }
 __thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
 extern void hmq1725hash(void *state, const void *input)
 {
    const uint32_t mask = 24;
-    uint32_t hashA[32] __attribute__((aligned(64)));
+    uint32_t hashA[32] __attribute__((aligned(32)));
-    uint32_t hashB[32] __attribute__((aligned(64)));
+    uint32_t hashB[32] __attribute__((aligned(32)));
-    const int midlen = 64;            // bytes
+    hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
    const int tail   = 80 - midlen;   // 16
-    memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
+    sph_bmw512_init( &ctx.bmw );
    sph_bmw512( &ctx.bmw, input, 80 );
    sph_bmw512_close( &ctx.bmw, hashA );   //1
-    memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
+    sph_whirlpool_init( &ctx.whirlpool );
-    sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
+    sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //0
-    sph_bmw512_close(&h_ctx.bmw1, hashA);   //1
+    sph_whirlpool_close( &ctx.whirlpool, hashB );   //1
    sph_whirlpool (&h_ctx.whirlpool1, hashA, 64);    //0
    sph_whirlpool_close(&h_ctx.whirlpool1, hashB);   //1
    if ( hashB[0] & mask )   //1
    {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-     update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
+       groestl512_full( &ctx.groestl, hashA, hashB, 512 );
                               (const char*)hashB, 512 );
 #else
-     sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
+       sph_groestl512_init( &ctx.groestl );
-     sph_groestl512_close(&h_ctx.groestl1, hashA); //2
+       sph_groestl512( &ctx.groestl, hashB, 64 ); //1
       sph_groestl512_close( &ctx.groestl, hashA ); //2
 #endif
    }
    else
    {
-      sph_skein512 (&h_ctx.skein1, hashB, 64); //1
+      sph_skein512_init( &ctx.skein );
-      sph_skein512_close(&h_ctx.skein1, hashA); //2
+      sph_skein512( &ctx.skein, hashB, 64 ); //1
      sph_skein512_close( &ctx.skein, hashA ); //2
    }
-    sph_jh512 (&h_ctx.jh1, hashA, 64); //3
+    sph_jh512_init( &ctx.jh );
-    sph_jh512_close(&h_ctx.jh1, hashB); //4
+    sph_jh512( &ctx.jh, hashA, 64 ); //3
    sph_jh512_close( &ctx.jh, hashB ); //4
-    sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
+    sph_keccak512_init( &ctx.keccak );
-    sph_keccak512_close(&h_ctx.keccak1, hashA); //3
+    sph_keccak512( &ctx.keccak, hashB, 64 ); //2
    sph_keccak512_close( &ctx.keccak, hashA ); //3
    if ( hashA[0] & mask ) //4
    {
-        sph_blake512 (&h_ctx.blake1, hashA, 64); //
+        blake512_init( &ctx.blake );
-        sph_blake512_close(&h_ctx.blake1, hashB); //5
+        blake512_update( &ctx.blake, hashA, 64 );
        blake512_close( &ctx.blake, hashB );
    }
    else
    {
-        sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
+        sph_bmw512_init( &ctx.bmw );
-        sph_bmw512_close(&h_ctx.bmw2, hashB);   //5
+        sph_bmw512( &ctx.bmw, hashA, 64 ); //4
        sph_bmw512_close( &ctx.bmw, hashB );   //5
    }
-     update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
+    luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
-     cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
+    cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
    if ( hashB[0] & mask ) //7
    {
-        sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
+        sph_keccak512_init( &ctx.keccak );
-        sph_keccak512_close(&h_ctx.keccak2, hashA); //8
+        sph_keccak512( &ctx.keccak, hashB, 64 ); //
        sph_keccak512_close( &ctx.keccak, hashA ); //8
    }
    else
    {
-        sph_jh512 (&h_ctx.jh2, hashB, 64); //7
+        sph_jh512_init( &ctx.jh );
-        sph_jh512_close(&h_ctx.jh2, hashA); //8
+        sph_jh512( &ctx.jh, hashB, 64 ); //7
        sph_jh512_close( &ctx.jh, hashA ); //8
    }
-    sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
+    sph_shavite512_init( &ctx.shavite );
-    sph_shavite512_close(&h_ctx.shavite1, hashB); //4
+    sph_shavite512( &ctx.shavite, hashA, 64 ); //3
    sph_shavite512_close( &ctx.shavite, hashB ); //4
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hashA, hashB, 64 );
    sph_simd512 (&h_ctx.simd1, hashB, 64); //3
    sph_simd512_close(&h_ctx.simd1, hashA); //4
 #else    
    update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
                                   (const BitSequence *)hashB, 512 );
 #endif
    if ( hashA[0] & mask ) //4
    {
-        sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
+        sph_whirlpool_init( &ctx.whirlpool );
-        sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
+        sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
        sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
    }
    else
    {
-        sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
+        sph_haval256_5_init( &ctx.haval );
-        sph_haval256_5_close(&h_ctx.haval1, hashB);   //5
+        sph_haval256_5( &ctx.haval, hashA, 64 ); //4
        sph_haval256_5_close( &ctx.haval, hashB );   //5
        memset(&hashB[8], 0, 32);
    }
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-    update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
+    echo_full( &ctx.echo, hashA, 512, hashB, 64 );
                        (const BitSequence *)hashB, 512 );
 #else
-    sph_echo512 (&h_ctx.echo1, hashB, 64); //5
+    sph_echo512_init( &ctx.echo );
-    sph_echo512_close(&h_ctx.echo1, hashA); //6
+    sph_echo512( &ctx.echo, hashB, 64 ); //5
    sph_echo512_close( &ctx.echo, hashA ); //6
 #endif
-    sph_blake512 (&h_ctx.blake2, hashA, 64); //6
+    blake512_init( &ctx.blake );
-    sph_blake512_close(&h_ctx.blake2, hashB); //7
+    blake512_update( &ctx.blake, hashA, 64 );
    blake512_close( &ctx.blake, hashB );
    if ( hashB[0] & mask ) //7
    {
-        sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
+       sph_shavite512_init( &ctx.shavite );
-        sph_shavite512_close(&h_ctx.shavite2, hashA); //8
+       sph_shavite512( &ctx.shavite, hashB, 64 ); //
       sph_shavite512_close( &ctx.shavite, hashA ); //8
    }
    else
-    {
+       luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
     update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
    }
-    sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
+    sph_hamsi512_init( &ctx.hamsi );
-    sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
+    sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
    sph_hamsi512_close( &ctx.hamsi, hashB ); //4
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-    fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2   ////
+    fugue512_full( &ctx.fugue, hashA, hashB, 64 );
    fugue512_Final( &h_ctx.fugue1, hashA ); //3 
 #else
-    sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2   ////
+    sph_fugue512_init( &ctx.fugue );
-    sph_fugue512_close(&h_ctx.fugue1, hashA); //3 
+    sph_fugue512( &ctx.fugue, hashB, 64 ); //2   ////
    sph_fugue512_close( &ctx.fugue, hashA ); //3 
 #endif
    if ( hashA[0] & mask ) //4
    {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-     update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
+       echo_full( &ctx.echo, hashB, 512, hashA, 64 );
                         (const BitSequence *)hashA, 512 );
 #else
-     sph_echo512 (&h_ctx.echo2, hashA, 64); //
+       sph_echo512_init( &ctx.echo );
-     sph_echo512_close(&h_ctx.echo2, hashB); //5
+       sph_echo512( &ctx.echo, hashA, 64 ); //
       sph_echo512_close( &ctx.echo, hashB ); //5
 #endif
    }
    else
-    {
+       simd512_ctx( &ctx.simd, hashB, hashA, 64 );
 #if defined(__aarch64__)
    sph_simd512(&h_ctx.simd2, hashA, 64); //6
    sph_simd512_close(&h_ctx.simd2, hashB); //7
 #else
    update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
                      (const BitSequence *)hashA, 512 );
 #endif
    }
-    sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
+    sph_shabal512_init( &ctx.shabal );
-    sph_shabal512_close(&h_ctx.shabal1, hashA); //6
+    sph_shabal512( &ctx.shabal, hashB, 64 ); //5
    sph_shabal512_close( &ctx.shabal, hashA ); //6
-    sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
+    sph_whirlpool_init( &ctx.whirlpool );
-    sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
+    sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
    sph_whirlpool_close( &ctx.whirlpool, hashB ); //7
    if ( hashB[0] & mask ) //7
    {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-        fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
+       fugue512_full( &ctx.fugue, hashA, hashB, 64 );
        fugue512_Final( &h_ctx.fugue2, hashA ); //8
 #else
-        sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
+       sph_fugue512_init( &ctx.fugue );
-        sph_fugue512_close(&h_ctx.fugue2, hashA); //8
+       sph_fugue512( &ctx.fugue, hashB, 64 ); //
       sph_fugue512_close( &ctx.fugue, hashA ); //8
 #endif
    }
    else
    {
-        sph_sha512( &h_ctx.sha1, hashB, 64 );
+       sph_sha512_init( &ctx.sha );
-        sph_sha512_close( &h_ctx.sha1, hashA );
+       sph_sha512( &ctx.sha, hashB, 64 );
       sph_sha512_close( &ctx.sha, hashA );
    }
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-    update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
+    groestl512_full( &ctx.groestl, hashB, hashA, 512 );
                               (const char*)hashA, 512 );
 #else
-    sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
+    sph_groestl512_init( &ctx.groestl );
-    sph_groestl512_close(&h_ctx.groestl2, hashB); //4
+    sph_groestl512( &ctx.groestl, hashA, 64 ); //3
    sph_groestl512_close( &ctx.groestl, hashB ); //4
 #endif
-    sph_sha512( &h_ctx.sha2, hashB, 64 );
+    sph_sha512_init( &ctx.sha );
-    sph_sha512_close( &h_ctx.sha2, hashA );
+    sph_sha512( &ctx.sha, hashB, 64 );
    sph_sha512_close( &ctx.sha, hashA );
    if ( hashA[0] & mask ) //4
    {
-        sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
+        sph_haval256_5_init( &ctx.haval );
-        sph_haval256_5_close(&h_ctx.haval2, hashB); //5
+        sph_haval256_5( &ctx.haval, hashA, 64 ); //
        sph_haval256_5_close( &ctx.haval, hashB ); //5
        memset( &hashB[8], 0, 32 );
    }
    else
    {
-        sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
+        sph_whirlpool_init( &ctx.whirlpool );
-        sph_whirlpool_close(&h_ctx.whirlpool4, hashB);   //5
+        sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
        sph_whirlpool_close( &ctx.whirlpool, hashB );   //5
    }
-    sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
+    sph_bmw512_init( &ctx.bmw );
-    sph_bmw512_close(&h_ctx.bmw3, hashA); //6
+    sph_bmw512( &ctx.bmw, hashB, 64 ); //5
    sph_bmw512_close( &ctx.bmw, hashA ); //6
 	memcpy( state, hashA, 32 );
 }
@@ -341,30 +259,18 @@ extern void hmq1725hash(void *state, const void *input)
 int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-//        uint32_t endiandata[32] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(32)));
-        uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
        uint32_t hash64[8] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   int thr_id = mythr->id;  // thr_id arg is deprecated
 	//const uint32_t Htarg = ptarget[7];
 	//we need bigendian data...
 //        for (int k = 0; k < 32; k++)
   for (int k = 0; k < 20; k++)
         be32enc(&endiandata[k], pdata[k]);
        hmq_bmw512_midstate( endiandata );
 //	if (opt_debug) 
 //	{
 //		applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
 //	}
 	/* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
 	/* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
 	if (ptarget[7]==0) {
 		do {
 			pdata[19] = ++n;
--- a/algo/quark/quark-gate.c
+++ b/algo/quark/quark-gate.c
@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_quark;
  gate->hash      = (void*)&quark_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
  return true;
 };
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -7,12 +7,12 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/groestl/aes_ni/hash-groestl.h"
 #else
  #include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
 void quark_hash(void *state, const void *input)
 {
   uint32_t hash[16] __attribute__((aligned(64)));
-   sph_blake512_context    ctx_blake;
+   blake512_context        ctx_blake;
   sph_bmw512_context      ctx_bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_groestl       ctx_groestl;
 #else
   sph_groestl512_context  ctx_groestl;
@@ -33,9 +33,7 @@ void quark_hash(void *state, const void *input)
   sph_keccak512_context   ctx_keccak;
   uint32_t mask = 8;
-   sph_blake512_init( &ctx_blake );
+   blake512_full( &ctx_blake, hash, input, 80 );
   sph_blake512( &ctx_blake, input, 80 );
   sph_blake512_close( &ctx_blake, hash );
   sph_bmw512_init( &ctx_bmw );
   sph_bmw512( &ctx_bmw, hash, 64 );
@@ -43,7 +41,7 @@ void quark_hash(void *state, const void *input)
   if ( hash[0] & mask )
   {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
      init_groestl( &ctx_groestl, 64 );
      update_and_final_groestl( &ctx_groestl, (char*)hash,
                                        (const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
      sph_skein512_close( &ctx_skein, hash );
   }
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   init_groestl( &ctx_groestl, 64 );
   update_and_final_groestl( &ctx_groestl, (char*)hash,
                                     (const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
   if ( hash[0] & mask )
   {
-      sph_blake512_init( &ctx_blake );
+      blake512_full( &ctx_blake, hash, hash, 64 );
      sph_blake512( &ctx_blake, hash, 64 );
      sph_blake512_close( &ctx_blake, hash );
   }
   else
   {
--- a/algo/qubit/deep-2way.c
+++ b/algo/qubit/deep-2way.c
@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,
     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+     casti_v128(  endiandata, 4 ) = v128_bswap32(   casti_v128(  pdata, 4 ) );
     uint64_t *edata = (uint64_t*)endiandata;
     intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+     casti_v128(  endiandata, 4 ) = v128_bswap32(   casti_v128(  pdata, 4 ) );
     uint64_t *edata = (uint64_t*)endiandata;
     intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
--- a/algo/qubit/qubit-gate.c
+++ b/algo/qubit/qubit-gate.c
@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_qubit;
  gate->hash      = (void*)&qubit_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
  return true;
 };
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -8,13 +8,9 @@
 #include <stdio.h>
 #include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/cubehash_sse2.h" 
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #include "algo/shavite/sph_shavite.h"
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
 #include "algo/echo/aes_ni/hash_api.h"
 #else
 #include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
        hashState_luffa         luffa;
        cubehashParam           cubehash;
        sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+        simd512_context         simd;
-   sph_simd512_context     simd;
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
 #else
   hashState_sd            simd;
 #endif
 #ifdef __AES__
        hashState_echo          echo;
 #else
        sph_echo512_context echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
        init_luffa(&qubit_ctx.luffa,512);
        cubehashInit(&qubit_ctx.cubehash,512,16,32);
        sph_shavite512_init(&qubit_ctx.shavite);
-#if defined(__aarch64__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   sph_simd512_init( &qubit_ctx.simd );
 #else
   init_sd( &qubit_ctx.simd, 512 );
 #endif
 #ifdef __AES__
        init_echo(&qubit_ctx.echo, 512);
 #else
        sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
        sph_shavite512( &ctx.shavite, hash, 64);
        sph_shavite512_close( &ctx.shavite, hash);
-#if defined(__aarch64__)
+        simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
    final_sd( &ctx.simd, (BitSequence *)hash );
 #endif
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        update_final_echo( &ctx.echo, (BitSequence *) hash,
                     (const BitSequence *) hash, 512 );
 #else
--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -45,10 +45,10 @@ static const uint32_t IV[5] =
 #define RR(a, b, c, d, e, f, s, r, k) \
 do{ \
-   a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
+   a = _mm_add_epi32( v128_rol32( _mm_add_epi32( _mm_add_epi32( \
                _mm_add_epi32( a, f( b ,c, d ) ), r ), \
                                 _mm_set1_epi64x( k ) ), s ), e ); \
-   c = mm128_rol_32( c, 10 );\
+   c = v128_rol32( c, 10 );\
 } while (0)
 #define ROUND1(a, b, c, d, e, f, s, r, k)  \
--- a/algo/sha/sha256-hash.c
+++ b/algo/sha/sha256-hash.c
@@ -587,8 +587,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
    // Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
    TMSG0_X = casti_m128i( msg_X, 0 );
    TMSG0_Y = casti_m128i( msg_Y, 0 );
-    TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 );
+    TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
-    TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 );
+    TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
    STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
    STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -34,8 +34,6 @@
 #include <string.h>
 #include "shabal-hash-4way.h"
 //#if defined(__SSE4_1__) || defined(__ARM_NEON)
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 #define DECL_STATE16   \
@@ -47,8 +45,6 @@
           C8, C9, CA, CB, CC, CD, CE, CF; \
   __m512i M0, M1, M2, M3, M4, M5, M6, M7, \
           M8, M9, MA, MB, MC, MD, ME, MF; \
   const __m512i FIVE  = v512_32( 5 ); \
   const __m512i THREE = v512_32( 3 ); \
   uint32_t Wlow, Whigh;
 #define READ_STATE16(state) do \
@@ -292,11 +288,21 @@ do { \
    mm512_swap1024_512( BF, CF ); \
 } while (0)
 static inline __m512i v512_mult_x3( const __m512i x )
 {
   return _mm512_add_epi32( x, _mm512_slli_epi32( x, 1 ) );
 }
 static inline __m512i v512_mult_x5( const __m512i x )
 {
   return _mm512_add_epi32( x, _mm512_slli_epi32( x, 2 ) );
 }
 #define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
   xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
-           _mm512_mullo_epi32( mm512_xor3( xa0, xc, \
+           v512_mult_x3( mm512_xor3( xa0, xc, \
-              _mm512_mullo_epi32( mm512_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
+              v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
           xb3, xb2 ) ); \
   xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
 } while (0)
@@ -644,8 +650,6 @@ shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
           C8, C9, CA, CB, CC, CD, CE, CF; \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7, \
           M8, M9, MA, MB, MC, MD, ME, MF; \
   const __m256i FIVE  = v256_32( 5 ); \
   const __m256i THREE = v256_32( 3 ); \
   uint32_t Wlow, Whigh;
 #define READ_STATE8(state) do \
@@ -889,11 +893,21 @@ do { \
    mm256_swap512_256( BF, CF ); \
 } while (0)
 static inline __m256i v256_mult_x3( const __m256i x )
 {
   return _mm256_add_epi32( x, _mm256_slli_epi32( x, 1 ) );
 }
 static inline __m256i v256_mult_x5( const __m256i x )
 {
   return _mm256_add_epi32( x, _mm256_slli_epi32( x, 2 ) );
 }
 #define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
   xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
-           _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
+           v256_mult_x3( mm256_xor3( xa0, xc, \
-              _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
+              v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
           xb3, xb2 ) ); \
   xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
 } while (0)
@@ -1226,15 +1240,13 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #endif  // AVX2
-#if defined(__SSE4_1__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON)
 #define DECL_STATE   \
 	v128u32_t A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB; \
 	v128u32_t B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \
 	v128u32_t C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \
 	v128u32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \
   const v128u32_t FIVE  = v128_32( 5 ); \
   const v128u32_t THREE = v128_32( 3 ); \
   uint32_t Wlow, Whigh;
 #define READ_STATE( state ) \
@@ -1479,11 +1491,21 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
    v128_swap256_128( BF, CF ); \
 }
 static inline v128_t v128_mult_x3( const v128_t x )
 {
   return v128_add32( x, v128_sl32( x, 1 ) );
 }
 static inline v128_t v128_mult_x5( const v128_t x )
 {
   return v128_add32( x, v128_sl32( x, 2 ) );
 }
 #define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 { \
   xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
-           v128_mul32( v128_xor3( xa0, xc, \
+                               v128_mult_x3( v128_xor3( xa0, xc, \
-              v128_mul32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
+                                   v128_mult_x5( v128_rol32( xa1, 15 ) ) ) ), \
                               xb3, xb2 ) ); \
   xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
 }
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -62,7 +62,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
 #endif
-#if defined(__SSE4_1__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON)
 typedef struct {
 	v128_t buf[16] __attribute__ ((aligned (64)));
--- a/algo/swifftx/Swifftx_sha3.cpp
+++ b/algo/swifftx/Swifftx_sha3.cpp
@@ -1,369 +0,0 @@
 #include "Swifftx_sha3.h"
 extern "C" {
 #include "SWIFFTX.h"
 }
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 // The default salt value.
 // This is the expansion of e (Euler's number) - the 19 digits after 2.71:
 // 8281828459045235360.
 // The above in base 256, from MSB to LSB:
 BitSequence SWIF_saltValueChar[SWIF_HAIFA_SALT_SIZE] = {114, 238, 247, 26, 192, 28, 170, 160};
 // All the IVs here below were produced from the decimal digits of e's expansion.
 // The code can be found in 'ProduceRandomIV.c'.
 // The initial value for 224 digest size.
 const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
 {37, 242, 132,   2, 167,  81, 158, 237, 113,  77, 162,  60,  65, 236, 108, 246,
 101,  72, 190, 109,  58, 205,  99,   6, 114, 169, 104, 114,  38, 146, 121, 142,
 59,  98, 233,  84,  72, 227,  22, 199,  17, 102, 198, 145,  24, 178,  37,   1,
 215, 245,  66, 120, 230, 193, 113, 253, 165, 218,  66, 134,  49, 231, 124, 204,
  0};
 // The initial value for 256 digest size.
 const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
 {250,  50,  42,  40,  14, 233,  53,  48, 227,  42, 237, 187, 211, 120, 209, 234,
  27, 144,   4,  61, 243, 244,  29, 247,  37, 162,  70,  11, 231, 196,  53,   6,
 193, 240,  94, 126, 204, 132, 104,  46, 114,  29,   3, 104, 118, 184, 201,   3,
  57,  77,  91, 101,  31, 155,  84, 199, 228,  39, 198,  42, 248, 198, 201, 178,
   8};
 // The initial value for 384 digest size.
 const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
 {40, 145, 193, 100, 205, 171,  47,  76, 254,  10, 196,  41, 165, 207, 200,  79,
 109,  13,  75, 201,  17, 172,  64, 162, 217,  22,  88,  39,  51,  30, 220, 151,
 133,  73, 216, 233, 184, 203,  77,   0, 248,  13,  28, 199,  30, 147, 232, 242,
 227, 124, 169, 174,  14,  45,  27,  87, 254,  73,  68, 136, 135, 159,  83, 152,
  0};
 // The initial value for 512 digest size.
 const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE] = 
 {195, 126, 197, 167, 157, 114,  99, 126, 208, 105, 200,  90,  71, 195, 144, 138,
 142, 122, 123, 116,  24, 214, 168, 173, 203, 183, 194, 210, 102, 117, 138,  42,
 114, 118, 132,  33,  35, 149, 143, 163, 163, 183, 243, 175,  72,  22, 201, 255,
 102, 243,  22, 187, 211, 167, 239,  76, 164,  70,  80, 182, 181, 212,   9, 185,
   0};
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // NIST API implementation portion.
 ///////////////////////////////////////////////////////////////////////////////////////////////
 int Swifftx::Init(int hashbitlen)
 {
 	switch(hashbitlen)
 	{
 	case 224:
 		swifftxState.hashbitlen = hashbitlen;
 		// Initializes h_0 in HAIFA:
 		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_224, SWIFFTX_OUTPUT_BLOCK_SIZE);
 		break;
 	case 256:
 		swifftxState.hashbitlen = hashbitlen;
 		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_256, SWIFFTX_OUTPUT_BLOCK_SIZE);
 		break;
 	case 384:
 		swifftxState.hashbitlen = hashbitlen;
 		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_384, SWIFFTX_OUTPUT_BLOCK_SIZE);
 		break;
 	case 512:
 		swifftxState.hashbitlen = hashbitlen;
 		memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_512, SWIFFTX_OUTPUT_BLOCK_SIZE);
 		break;
 	default:
 		return BAD_HASHBITLEN;
 	}
 	swifftxState.wasUpdated = false;
 	swifftxState.remainingSize = 0;
 	memset(swifftxState.remaining, 0, SWIF_HAIFA_INPUT_BLOCK_SIZE);
 	memset(swifftxState.numOfBitsChar, 0, SWIF_HAIFA_NUM_OF_BITS_SIZE);
 	// Initialize the salt with the default value.
 	memcpy(swifftxState.salt, SWIF_saltValueChar, SWIF_HAIFA_SALT_SIZE);
 	InitializeSWIFFTX();
 	return SUCCESS;
 }
 int Swifftx::Update(const BitSequence *data, DataLength databitlen)
 {
 	// The size of input in bytes after putting the remaining data from previous invocation.
 	int sizeOfInputAfterRemaining = 0;
 	// The input block to compression function of SWIFFTX:
 	BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
 	// Whether we handled a single block.
 	bool wasSingleBlockHandled = false;
 	swifftxState.wasUpdated = true;
 	// Handle an empty message as required by NIST. Since 'Final()' is oblivious to the input
 	// (but of course uses the output of the compression function from the previous round, 
 	// which is called h_{i-1} in HAIFA article), we have to do nothing here.
 	if (databitlen == 0)
 		return SUCCESS;
    // If we had before an input with unaligned length, return an error
    if (swifftxState.remainingSize % 8)
 	{
    	return INPUT_DATA_NOT_ALIGNED;
    }
    // Convert remaining size to bytes.
    swifftxState.remainingSize /= 8;
 	// As long as we have enough data combined from (remaining + data) to fill input block
 	//NASTAVENIE RUND
 	while (((databitlen / 8) + swifftxState.remainingSize) >= SWIF_HAIFA_INPUT_BLOCK_SIZE)
 	{
 		// Fill the input block with data:
 		// 1. The output of the previous block:
 		memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
 		// 2. The input part of the block:
 		// 2a. The remaining data from the previous 'Update()' call:
 		if (swifftxState.remainingSize)
 			memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining, 
 				   swifftxState.remainingSize);
 		// 2b. The input data that we have place for after the 'remaining':
 		sizeOfInputAfterRemaining = SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE 
 								  - ((int) swifftxState.remainingSize) - SWIF_HAIFA_NUM_OF_BITS_SIZE 
 								  - SWIF_HAIFA_SALT_SIZE;
 		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize, 
 			   data, sizeOfInputAfterRemaining);
 		// 3. The #bits part of the block:
 		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize 
 			 + sizeOfInputAfterRemaining,
 			   swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
 		// 4. The salt part of the block:
 		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize 
 			 + sizeOfInputAfterRemaining + SWIF_HAIFA_NUM_OF_BITS_SIZE,
 			   swifftxState.salt, SWIF_HAIFA_SALT_SIZE);
 		ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, false);
 		// Update the #bits field with SWIF_HAIFA_INPUT_BLOCK_SIZE.
 		AddToCurrInBase256(swifftxState.numOfBitsChar, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
 		wasSingleBlockHandled = true;
 		data += sizeOfInputAfterRemaining;
 		databitlen -= (sizeOfInputAfterRemaining * 8);
   		swifftxState.remainingSize = 0;
 	}
 	// Update the swifftxState.remaining and swifftxState.remainingSize.
    // remainingSize will be in bits after exiting 'Update()'.
 	if (wasSingleBlockHandled)
 	{		
 		swifftxState.remainingSize = (unsigned int) databitlen; // now remaining size is in bits.
        if (swifftxState.remainingSize)
 			memcpy(swifftxState.remaining, data, (swifftxState.remainingSize + 7) / 8);
 	}
 	else
 	{
 		memcpy(swifftxState.remaining + swifftxState.remainingSize, data, 
 			   (size_t) (databitlen + 7) / 8);
 		swifftxState.remainingSize = (swifftxState.remainingSize * 8) + (unsigned short) databitlen;
 	}
 	return SUCCESS;
 }
 int Swifftx::Final(BitSequence *hashval)
 {
    int i;
    // Whether to add one last block. True if the padding appended to the last block overflows
 	// the block size.
    bool toAddFinalBlock = false;
    bool toPutOneInFinalBlock = false;
    unsigned short oneShift = 0;
   	// The size of the last input block before the zeroes padding. We add 1 here because we
    // include the final '1' bit in the calculation and 7 as we round the length to bytes.
 	unsigned short sizeOfLastInputBlock = (swifftxState.remainingSize + 1 + 7) / 8;
    // The number of bytes of zero in the padding part.
 	// The padding contains:
 	// 1. A single 1 bit.
 	// 2. As many zeroes as needed.
 	// 3. The message length in bits. Occupies SWIF_HAIFA_NUM_OF_BITS_SIZE bytes.
 	// 4. The digest size. Maximum is 512, so we need 2 bytes.
 	// If the total number achieved is negative, add an additional block, as HAIFA specifies.
 	short numOfZeroBytesInPadding = (short) SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE 
 								  - sizeOfLastInputBlock - (2 * SWIF_HAIFA_NUM_OF_BITS_SIZE) - 2 
 								  - SWIF_HAIFA_SALT_SIZE;
   	// The input block to compression function of SWIFFTX:
 	BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
 	// The message length in base 256.
 	BitSequence messageLengthChar[SWIF_HAIFA_NUM_OF_BITS_SIZE] = {0};
   	// The digest size used for padding:
 	unsigned char digestSizeLSB = swifftxState.hashbitlen % 256;
 	unsigned char digestSizeMSB = (swifftxState.hashbitlen - digestSizeLSB) / 256;
 	if (numOfZeroBytesInPadding < 1)
 		toAddFinalBlock = true;
 	// Fill the input block with data:
 	// 1. The output of the previous block:
 	memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
 	// 2a. The input part of the block, which is the remaining data from the previous 'Update()'
    //     call, if exists and an extra '1' bit (maybe all we have is this extra 1):
    // Add the last 1 in big-endian convention ...
    if (swifftxState.remainingSize % 8 == 0)
 	{
       swifftxState.remaining[sizeOfLastInputBlock - 1] = 0x80;
    }
    else 
 	{
       swifftxState.remaining[sizeOfLastInputBlock - 1] |= (1 << (7 - (swifftxState.remainingSize % 8)));
    }
 	if (sizeOfLastInputBlock)
 		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining, 
 			   sizeOfLastInputBlock);
   	// Compute the message length in base 256:
 	for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
        messageLengthChar[i] = swifftxState.numOfBitsChar[i];
    if (sizeOfLastInputBlock)
 		AddToCurrInBase256(messageLengthChar, sizeOfLastInputBlock * 8);
 	if (!toAddFinalBlock)
 	{
 		// 2b. Put the zeroes:
 		memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
 			   0, numOfZeroBytesInPadding);
 		// 2c. Pad the message length:
 		for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
 			currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock 
 						 + numOfZeroBytesInPadding + i] = messageLengthChar[i];
 		// 2d. Pad the digest size:
 		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock 
 					 + numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE] = digestSizeMSB;
 		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock 
 					 + numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE + 1] = digestSizeLSB;
 	}
 	else
 	{
 		// 2b. Put the zeroes, if at all:
 		if ((SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock) > 0)
 		{
 			 memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
 					0, SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock);
 		}
 	}
   	// 3. The #bits part of the block: 
 	memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE, 
           swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
 	// 4. The salt part of the block:
 	memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE 
 		 + SWIF_HAIFA_NUM_OF_BITS_SIZE, 
           swifftxState.salt, 
 		   SWIF_HAIFA_SALT_SIZE);
    ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, !toAddFinalBlock); 
 	// If we have to add one more block, it is now:
 	if (toAddFinalBlock)
 	{
 		// 1. The previous output block, as usual.
 		memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
 		// 2a. Instead of the input, zeroes:
 		memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE , 0, 
 			   SWIF_HAIFA_INPUT_BLOCK_SIZE - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2);
 		// 2b. Instead of the input, the message length:
 		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE 
 			 - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2,
 			   messageLengthChar,
 			   SWIF_HAIFA_NUM_OF_BITS_SIZE);
 		// 2c. Instead of the input, the digest size:
 		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 2] = digestSizeMSB;
 		currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 1] = digestSizeLSB;
 		// 3. The #bits part of the block, which is zero in case of additional block:
 		memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
 			   0,
 			   SWIF_HAIFA_NUM_OF_BITS_SIZE);
 		// 4. The salt part of the block:
 		memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE 
 			 + SWIF_HAIFA_NUM_OF_BITS_SIZE, 
               swifftxState.salt, 
 			   SWIF_HAIFA_SALT_SIZE);
        ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, true); 
 	}
 	// Finally, copy the result into 'hashval'. In case the digest size is not 512bit, copy the
 	// first hashbitlen of them:
    for (i = 0; i < (swifftxState.hashbitlen / 8); ++i)
 		hashval[i] = swifftxState.currOutputBlock[i];
 	return SUCCESS;
 }
 int Swifftx::Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, 
 				BitSequence *hashval)
 {
 	int result;
 	//hashState state;
   	// The pointer to the current place in the input we take into the compression function.
 	DataLength currInputIndex = 0;
    result = Swifftx::Init(hashbitlen);
 	if (result != SUCCESS)
 		return result;
 	for ( ; (databitlen / 8) >  SWIF_HAIFA_INPUT_BLOCK_SIZE; 
         currInputIndex += SWIF_HAIFA_INPUT_BLOCK_SIZE, databitlen -= (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8))
 	{
 		result = Swifftx::Update(data + currInputIndex, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8); 
 		if (result != SUCCESS)
 			return result;
 	}
 	// The length of the last block may be shorter than (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8)
 	result = Swifftx::Update(data + currInputIndex, databitlen); 
 	if (result != SUCCESS)
 	{
 		return result;
 	}
    return Swifftx::Final(hashval);
 }
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // Helper fuction implementation portion.
 ///////////////////////////////////////////////////////////////////////////////////////////////
 void Swifftx::AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], 
 							   unsigned short toAdd)
 {
 	unsigned char remainder = 0;
 	short i;
 	BitSequence currValueInBase256[8] = {0};
 	unsigned short currIndex = 7;
 	unsigned short temp = 0;
 	do
 	{
 		remainder = toAdd % 256;
 		currValueInBase256[currIndex--] = remainder;
 		toAdd -= remainder;
 		toAdd /= 256;
 	}
 	while(toAdd != 0);
 	for (i = 7; i >= 0; --i)
 	{
 		temp = value[i] + currValueInBase256[i];
 		if (temp > 255)
 		{
 			value[i] = temp % 256;
 			currValueInBase256[i - 1]++;
 		}
 		else
 			value[i] = (unsigned char) temp;
 	}
 }
--- a/algo/swifftx/Swifftx_sha3.h
+++ b/algo/swifftx/Swifftx_sha3.h
@@ -1,79 +0,0 @@
 #ifndef SWIFFTX_SHA3_H
 #define SWIFFTX_SHA3_H
 #include "sha3_interface.h"
 #include "stdbool.h"
 #include "stdint.h"
 class Swifftx : public SHA3 {
 #define SWIFFTX_INPUT_BLOCK_SIZE 256
 #define SWIFFTX_OUTPUT_BLOCK_SIZE 65
 #define SWIF_HAIFA_SALT_SIZE 8
 #define SWIF_HAIFA_NUM_OF_BITS_SIZE 8
 #define SWIF_HAIFA_INPUT_BLOCK_SIZE (SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE \
 							  - SWIF_HAIFA_NUM_OF_BITS_SIZE - SWIF_HAIFA_SALT_SIZE)
 	typedef unsigned char BitSequence;
 //const DataLength SWIF_SALT_VALUE;
 #define SWIF_HAIFA_IV 0
 /*const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE];
 const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE];
 const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE];
 const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE];*/
 typedef enum 
 { 
 	SUCCESS = 0,
 	FAIL = 1,
 	BAD_HASHBITLEN = 2,
 	BAD_SALT_SIZE = 3,
 	SET_SALT_VALUE_FAILED = 4,
 	INPUT_DATA_NOT_ALIGNED = 5
 } HashReturn;
 typedef struct hashState {
 	unsigned short hashbitlen;
 	// The data remained after the recent call to 'Update()'. 
 	BitSequence remaining[SWIF_HAIFA_INPUT_BLOCK_SIZE + 1];
 	// The size of the remaining data in bits.
 	// Is 0 in case there is no remaning data at all.
 	unsigned int remainingSize;
 	// The current output of the compression function. At the end will contain the final digest
 	// (which may be needed to be truncated, depending on hashbitlen).
 	BitSequence currOutputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE];
 	// The value of '#bits hashed so far' field in HAIFA, in base 256.
 	BitSequence numOfBitsChar[SWIF_HAIFA_NUM_OF_BITS_SIZE];
 	// The salt value currently in use:
 	BitSequence salt[SWIF_HAIFA_SALT_SIZE];
 	// Indicates whether a single 'Update()' occured. 
 	// Ater a call to 'Update()' the key and the salt values cannot be changed.
 	bool wasUpdated;
 } hashState;
 private:
 int swifftxNumRounds;
 hashState swifftxState;
 public:
 int Init(int hashbitlen);
 int Update(const BitSequence *data, DataLength databitlen);
 int Final(BitSequence *hashval);
 int Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, 
 				BitSequence *hashval);
 private:
 static void AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], unsigned short toAdd);
 };
 #endif
--- a/algo/swifftx/hash_interface.h
+++ b/algo/swifftx/hash_interface.h
@@ -1,21 +0,0 @@
 #pragma once
 #include <cstdint>
 namespace hash {
 using BitSequence = unsigned char;
 using DataLength = unsigned long long;
 struct hash_interface {
    virtual ~hash_interface() = default;
    virtual int Init(int hash_bitsize) = 0;
    virtual int Update(const BitSequence *data, DataLength data_bitsize) = 0;
    virtual int Final(BitSequence *hash) = 0;
    virtual int
    Hash(int hash_bitsize, const BitSequence *data, DataLength data_bitsize, BitSequence *hash) = 0;
 };
 } // namespace hash
--- a/algo/swifftx/sha3_interface.h
+++ b/algo/swifftx/sha3_interface.h
@@ -1,14 +0,0 @@
 #pragma once
 #include <cstdint>
 //#include <streams/hash/hash_interface.h>
 #include "hash_interface.h"
 namespace sha3 {
 using BitSequence = hash::BitSequence;
 using DataLength = hash::DataLength;
 struct sha3_interface : hash::hash_interface {};
 } // namespace sha3
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -506,4 +506,156 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
     return 0;
 }
 #elif defined (X11GOST_2WAY)
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
 #endif
 union _x11gost_context_overlay
 {
        blake512_2x64_context   blake;
        bmw512_2x64_context     bmw;
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_groestl       groestl;
 #else
        sph_groestl512_context  groestl;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_echo          echo;
 #else
        sph_echo512_context     echo;
 #endif
        jh512_2x64_context      jh;
        keccak512_2x64_context  keccak;
        skein512_2x64_context   skein;
        hashState_luffa         luffa;
        cubehashParam           cube;
        sph_shavite512_context  shavite;
        simd512_context         simd;
        sph_gost512_context     gost;
 };
 typedef union _x11gost_context_overlay x11gost_context_overlay;
 int x11gost_2x64_hash( void *state, const void *input, int thr_id )
 {
    uint8_t vhash[80*2] __attribute__((aligned(64)));
    uint8_t hash0[64]   __attribute__((aligned(64)));
    uint8_t hash1[64]   __attribute__((aligned(64)));
    x11gost_context_overlay ctx;
    intrlv_2x64( vhash, input, input+80, 640 );
    blake512_2x64_full( &ctx.blake, vhash, vhash, 80 );
    bmw512_2x64_init( &ctx.bmw );
    bmw512_2x64_update( &ctx.bmw, vhash, 64 );
    bmw512_2x64_close( &ctx.bmw, vhash );
    dintrlv_2x64( hash0, hash1, vhash, 512 );
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    groestl512_full( &ctx.groestl, hash0, hash0, 512 );
    groestl512_full( &ctx.groestl, hash1, hash1, 512 );
 #else
    sph_groestl512_init( &ctx.groestl );
    sph_groestl512( &ctx.groestl, hash0, 64 );
    sph_groestl512_close( &ctx.groestl, hash0 );
    sph_groestl512_init( &ctx.groestl );
    sph_groestl512( &ctx.groestl, hash1, 64 );
    sph_groestl512_close( &ctx.groestl, hash1 );
 #endif
    intrlv_2x64( vhash, hash0, hash1, 512 );
    skein512_2x64_full( &ctx.skein, vhash, vhash, 64 );
    jh512_2x64_ctx( &ctx.jh, vhash, vhash, 64 );
    keccak512_2x64_ctx( &ctx.keccak, vhash, vhash, 64 );
    dintrlv_2x64( hash0, hash1, vhash, 512 );
    sph_gost512_init( &ctx.gost );
    sph_gost512( &ctx.gost, hash0, 64 );
    sph_gost512_close( &ctx.gost, hash0 );
    sph_gost512_init( &ctx.gost );
    sph_gost512( &ctx.gost, hash1, 64 );
    sph_gost512_close( &ctx.gost, hash1 );
    luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
    luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
    cubehash_full( &ctx.cube, hash0, 512, hash0, 64 );
    cubehash_full( &ctx.cube, hash1, 512, hash1, 64 );
    sph_shavite512_init( &ctx.shavite );
    sph_shavite512( &ctx.shavite, hash0, 64 );
    sph_shavite512_close( &ctx.shavite, hash0 );
    sph_shavite512_init( &ctx.shavite );
    sph_shavite512( &ctx.shavite, hash1, 64 );
    sph_shavite512_close( &ctx.shavite, hash1 );
    simd512_ctx( &ctx.simd, hash0, hash0, 64 );
    simd512_ctx( &ctx.simd, hash1, hash1, 64 );
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    echo_full( &ctx.echo, hash0, 512, hash0, 64 );
    echo_full( &ctx.echo, hash1, 512, hash1, 64 );
 #else
    sph_echo512_init( &ctx.echo );
    sph_echo512( &ctx.echo, hash0, 64 );
    sph_echo512_close( &ctx.echo, hash0 );
    sph_echo512_init( &ctx.echo );
    sph_echo512( &ctx.echo, hash1, 64 );
    sph_echo512_close( &ctx.echo, hash1 );
 #endif
    memcpy( state,    hash0, 32 );
    memcpy( state+32, hash1, 32 );
    return 1;
 }
 int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*2]   __attribute__((aligned(64)));
   uint32_t edata[20*2]   __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   v128_bswap32_80( edata, pdata );
   memcpy( edata+20, edata, 80 );
   do
   {
      edata[19] = n;
      edata[39] = n+1;
      if ( likely( x11gost_2x64_hash( hash, edata, thr_id ) ) )
      {
         if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
         {
            pdata[19] = bswap_32( n );
            submit_solution( work, hash, mythr );
         }
         if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
         {
            pdata[19] = bswap_32( n+1 );
            submit_solution( work, hash+8, mythr );
         }
      }
      n += 2;
   } while ( n < last_nonce && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce;
   pdata[19] = n;
   return 0;
 }
 #endif
--- a/algo/x11/x11gost-gate.c
+++ b/algo/x11/x11gost-gate.c
@@ -10,12 +10,16 @@ bool register_x11gost_algo( algo_gate_t* gate )
  init_x11gost_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x11gost_4way;
  gate->hash      = (void*)&x11gost_4way_hash;
 #elif defined(X11GOST_2WAY)
  gate->scanhash  = (void*)&scanhash_x11gost_2x64;
  gate->hash      = (void*)&x11gost_2x64_hash;
 #else
  init_x11gost_ctx();
  gate->scanhash  = (void*)&scanhash_x11gost;
  gate->hash      = (void*)&x11gost_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT 
                      | NEON_OPT;
  return true;
 };
--- a/algo/x11/x11gost-gate.h
+++ b/algo/x11/x11gost-gate.h
@@ -8,6 +8,8 @@
  #define X11GOST_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X11GOST_4WAY 1
 #elif defined(__SSE2__) || defined(__ARM_NEON)
  #define X11GOST_2WAY 1
 #endif
 bool register_x11gost_algo( algo_gate_t* gate );
@@ -26,6 +28,12 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_x11gost_4way_ctx();
 #elif defined(X11GOST_2WAY)
 int x11gost_2x64_hash( void *state, const void *input, int thr_id );
 int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 #else
 void x11gost_hash( void *state, const void *input );
--- a/algo/x11/x11gost.c
+++ b/algo/x11/x11gost.c
@@ -1,6 +1,8 @@
 #include "x11gost-gate.h"
-#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY)
+// no longer used, not working when last used.
 #if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY) && !defined(X11GOST_2WAY)
 #include <stdlib.h>
 #include <stdint.h>
--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -155,13 +155,13 @@ void skunk_4way_hash( void *output, const void *input )
     skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-     cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
+     cubehashUpdateDigest( &ctx.cube, hash0, hash0, 64 );
     memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+     cubehashUpdateDigest( &ctx.cube, hash1, hash1, 64 );
     memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
+     cubehashUpdateDigest( &ctx.cube, hash2, hash2, 64 );
     memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
+     cubehashUpdateDigest( &ctx.cube, hash3, hash3, 64 );
     fugue512_full( &ctx.fugue, hash0, hash0, 64 );
     fugue512_full( &ctx.fugue, hash1, hash1, 64 );
--- a/algo/x16/hex.c
+++ b/algo/x16/hex.c
@@ -23,13 +23,12 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
   *sptr = '\0';
 }
 static __thread x16r_context_overlay hex_ctx;
 int hex_hash( void* output, const void* input, int thrid )
 {
   uint32_t _ALIGN(128) hash[16];
   x16r_context_overlay ctx;
-   memcpy( &ctx, &hex_ctx, sizeof(ctx) );
+   memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
   void *in = (void*) input;
   int size = 80;
@@ -52,7 +51,7 @@ int hex_hash( void* output, const void* input, int thrid )
         break;
         case GROESTL:
 #if defined(__AES__)
-            groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
+            groestl512_full( &ctx.groestl, hash, in, size<<3 );
 #else
            sph_groestl512_init( &ctx.groestl );
            sph_groestl512( &ctx.groestl, in, size );
@@ -87,7 +86,7 @@ int hex_hash( void* output, const void* input, int thrid )
         case LUFFA:
            if ( i == 0 )
            {
-              update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
+              update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
            }
            else
            {
@@ -97,7 +96,7 @@ int hex_hash( void* output, const void* input, int thrid )
            break;
         case CUBEHASH:
            if ( i == 0 )
-               cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
+               cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
            else
            {
               cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -108,22 +107,11 @@ int hex_hash( void* output, const void* input, int thrid )
            shavite512_full( &ctx.shavite, hash, in, size );
         break;
         case SIMD:
-#if defined(__aarch64__)
+            simd512_ctx( &ctx.simd, hash, in, size<<3 );
            sph_simd512_init( &ctx.simd );
            sph_simd512(&ctx.simd, (const void*) hash, 64);
            sph_simd512_close(&ctx.simd, hash);
 #else
            simd_full( &ctx.simd, (BitSequence *)hash,
                             (const BitSequence*)in, size<<3 );
             init_sd( &ctx.simd, 512 );
             update_final_sd( &ctx.simd, (BitSequence *)hash,
                              (const BitSequence*)in, size<<3 );
 #endif
         break;
         case ECHO:
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES) 
-            echo_full( &ctx.echo, (BitSequence *)hash, 512,
+            echo_full( &ctx.echo, hash, 512, in, size );
                              (const BitSequence *)in, size );
 #else
            sph_echo512_init( &ctx.echo );
            sph_echo512( &ctx.echo, in, size );
@@ -216,32 +204,32 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
   switch ( algo )
   {
      case JH:
-         sph_jh512_init( &hex_ctx.jh );
+         sph_jh512_init( &x16r_ref_ctx.jh );
-         sph_jh512( &hex_ctx.jh, edata, 64 );
+         sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
      break;
      case SKEIN:
-         sph_skein512_init( &hex_ctx.skein );
+         sph_skein512_init( &x16r_ref_ctx.skein );
-         sph_skein512( &hex_ctx.skein, edata, 64 );
+         sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
      break;
      case LUFFA:
-         init_luffa( &hex_ctx.luffa, 512 );
+         init_luffa( &x16r_ref_ctx.luffa, 512 );
-         update_luffa( &hex_ctx.luffa, edata, 64 );
+         update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
      break;
      case CUBEHASH:
-         cubehashInit( &hex_ctx.cube, 512, 16, 32 );
+         cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &hex_ctx.cube, edata, 64 );
+         cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
      break;
      case HAMSI:
-         sph_hamsi512_init( &hex_ctx.hamsi );
+         sph_hamsi512_init( &x16r_ref_ctx.hamsi );
-         sph_hamsi512( &hex_ctx.hamsi, edata, 64 );
+         sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 64 );
      break;
      case SHABAL:
-         sph_shabal512_init( &hex_ctx.shabal );
+         sph_shabal512_init( &x16r_ref_ctx.shabal );
-         sph_shabal512( &hex_ctx.shabal, edata, 64 );
+         sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
      break;
      case WHIRLPOOL:
-         sph_whirlpool_init( &hex_ctx.whirlpool );
+         sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
-         sph_whirlpool( &hex_ctx.whirlpool, edata, 64 );
+         sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
      break;
   }
--- a/algo/x16/minotaur.c
+++ b/algo/x16/minotaur.c
@@ -14,9 +14,6 @@
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #if defined(__aarch64__)
  #include "algo/simd/sph_simd.h"
 #endif
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
@@ -24,11 +21,15 @@
 #include "algo/yespower/yespower.h"
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/groestl/aes_ni/hash-groestl.h"
+#else
 #endif
  #include "algo/echo/sph_echo.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #else
  #include "algo/groestl/sph_groestl.h"
-#if defined(__AES__)
+#endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/fugue/fugue-aesni.h"
 #else
  #include "algo/fugue/sph_fugue.h"
@@ -46,7 +47,7 @@ typedef struct TortureGarden TortureGarden;
 // Graph of hash algos plus SPH contexts
 struct TortureGarden
 {
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_groestl       groestl;
 #else
   sph_groestl512_context  groestl;
@@ -56,7 +57,7 @@ struct TortureGarden
 #else
   sph_echo512_context     echo;
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_fugue         fugue;
 #else
   sph_fugue512_context    fugue;
@@ -112,14 +113,14 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
 #endif
 	         break;
        case 4:
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            fugue512_full( &garden->fugue, hash, input, 64 );
 #else
            sph_fugue512_full( &garden->fugue, hash, input, 64 );
 #endif
 	         break;
        case 5:
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            groestl512_full( &garden->groestl, hash, input, 512 );
 #else
            sph_groestl512_init( &garden->groestl) ;
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -19,12 +19,12 @@
 // Perform midstate prehash of hash functions with block size <= 72 bytes,
 // 76 bytes for hash functions that operate on 32 bit data.
-void x16r_8way_prehash( void *vdata, void *pdata )
+void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
 {
   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
-   const char elem = x16r_hash_order[0];
+   const char elem = hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
   switch ( algo )
@@ -110,7 +110,8 @@ void x16r_8way_prehash( void *vdata, void *pdata )
 // Called by wrapper hash function to optionally continue hashing and
 // convert to final hash.
-int x16r_8way_hash_generic( void* output, const void* input, int thrid )
+int x16r_8way_hash_generic( void* output, const void* input, int thrid,
     const char *hash_order, const int func_count )
 {
   uint32_t vhash[20*8] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (16)));
@@ -136,9 +137,9 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 input, 640 );
-   for ( int i = 0; i < 16; i++ )
+   for ( int i = 0; i < func_count; i++ )
   {
-      const char elem = x16r_hash_order[i];
+      const char elem = hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
      switch ( algo )
@@ -474,7 +475,8 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
 int x16r_8way_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*8] __attribute__ ((aligned (128)));
-   if ( !x16r_8way_hash_generic( hash, input, thrid ) )
+   if ( !x16r_8way_hash_generic( hash, input, thrid, x16r_hash_order, 
                                 X16R_HASH_FUNC_COUNT ) )
      return 0;
   memcpy( output,     hash,     32 );
@@ -495,7 +497,6 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -508,21 +509,18 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
   if ( bench )   ptarget[7] = 0x0cff;
-   bedata1[0] = bswap_32( pdata[1] );
+   static __thread uint32_t saved_height = UINT32_MAX;
-   bedata1[1] = bswap_32( pdata[2] );
+   if ( work->height != saved_height )
   static __thread uint32_t s_ntime = UINT32_MAX;
   const uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      vdata[1] = bswap_32( pdata[1] );
-      s_ntime = ntime;
+      vdata[2] = bswap_32( pdata[2] );
-
+      saved_height = work->height;
-      if ( opt_debug && !thr_id )
+      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
-          applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
+      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x16r_hash_order );
   }
-   x16r_8way_prehash( vdata, pdata );
+   x16r_8way_prehash( vdata, pdata, x16r_hash_order );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
@@ -546,12 +544,12 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
 #elif defined (X16R_4WAY)
-void x16r_4way_prehash( void *vdata, void *pdata )
+void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
 {
   uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
-   const char elem = x16r_hash_order[0];
+   const char elem = hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
   switch ( algo )
@@ -627,7 +625,8 @@ void x16r_4way_prehash( void *vdata, void *pdata )
   }
 }
-int x16r_4way_hash_generic( void* output, const void* input, int thrid )
+int x16r_4way_hash_generic( void* output, const void* input, int thrid,
                            const char *hash_order, const int func_count )
 {
   uint32_t vhash[20*4] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -644,9 +643,9 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
-   for ( int i = 0; i < 16; i++ )
+   for ( int i = 0; i < func_count; i++ )
   {
-      const char elem = x16r_hash_order[i];
+      const char elem = hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
      switch ( algo )
@@ -908,7 +907,8 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
 int x16r_4way_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*4] __attribute__ ((aligned (64)));
-   if ( !x16r_4way_hash_generic( hash, input, thrid ) )
+   if ( !x16r_4way_hash_generic( hash, input, thrid, x16r_hash_order,
                                 X16R_HASH_FUNC_COUNT ) )
      return 0;
   memcpy( output,     hash,     32 );
@@ -924,7 +924,6 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -937,20 +936,18 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
   if ( bench )  ptarget[7] = 0x0cff;
-   bedata1[0] = bswap_32( pdata[1] );
+   static __thread uint32_t saved_height = UINT32_MAX;
-   bedata1[1] = bswap_32( pdata[2] );
+   if ( work->height != saved_height )
   static __thread uint32_t s_ntime = UINT32_MAX;
   const uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      vdata[1] = bswap_32( pdata[1] );
-      s_ntime = ntime;
+      vdata[2] = bswap_32( pdata[2] );
-      if ( opt_debug && !thr_id )
+      saved_height = work->height;
-         applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
+      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x16r_hash_order );
   }
-   x16r_4way_prehash( vdata, pdata );
+   x16r_4way_prehash( vdata, pdata, x16r_hash_order );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
@@ -971,4 +968,404 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
   return 0;
 }
 #elif defined (X16R_2WAY)
 void x16r_2x64_prehash( void *vdata, void *pdata, const char *hash_order )
 {
   uint32_t edata[20] __attribute__ ((aligned (64)));
   const char elem = hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
   switch ( algo )
   {
      case JH:
         v128_bswap32_intrlv80_2x64( vdata, pdata );
         jh512_2x64_init( &x16r_ctx.jh );
         jh512_2x64_update( &x16r_ctx.jh, vdata, 64 );
      break;
      case KECCAK:
         v128_bswap32_intrlv80_2x64( vdata, pdata );
         keccak512_2x64_init( &x16r_ctx.keccak );
         keccak512_2x64_update( &x16r_ctx.keccak, vdata, 72 );
      break;
      case SKEIN:
         v128_bswap32_intrlv80_2x64( vdata, pdata );
         skein512_2x64_prehash64( &x16r_ctx.skein, vdata );
      break;
      case LUFFA:
      {
         v128_bswap32_80( edata, pdata );
         init_luffa( &x16r_ctx.luffa, 512 );
         update_luffa( &x16r_ctx.luffa, edata, 64 );
         intrlv_2x64( vdata, edata, edata, 640 );
      }
      break;
      case CUBEHASH:
      {
         v128_bswap32_80( edata, pdata );
         cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
         cubehashUpdate( &x16r_ctx.cube, edata, 64 );
         intrlv_2x64( vdata, edata, edata, 640 );
      }
      break;
      case HAMSI:
 #if defined(__SSE4_2__) || defined(__ARM_NEON)
         v128_bswap32_intrlv80_2x64( vdata, pdata );
         hamsi512_2x64_init( &x16r_ctx.hamsi );
         hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 );
 #else
         v128_bswap32_80( edata, pdata );
         sph_hamsi512_init( &x16r_ctx.hamsi );
         sph_hamsi512( &x16r_ctx.hamsi, edata, 72 );
         intrlv_2x64( vdata, edata, edata, 640 );
 #endif
      break;
      case FUGUE:
         v128_bswap32_80( edata, pdata );
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
         fugue512_init( &x16r_ctx.fugue );
         fugue512_update( &x16r_ctx.fugue, edata, 76 );
 #else         
         sph_fugue512_init( &x16r_ctx.fugue );
         sph_fugue512( &x16r_ctx.fugue, edata, 76 );
 #endif
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      case SHABAL:
         v128_bswap32_80( edata, pdata );
         sph_shabal512_init( &x16r_ctx.shabal );
         sph_shabal512( &x16r_ctx.shabal, edata, 64);
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      case WHIRLPOOL:
         v128_bswap32_80( edata, pdata );
         sph_whirlpool_init( &x16r_ctx.whirlpool );
         sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      default:
         v128_bswap32_intrlv80_2x64( vdata, pdata );
   }
 }
 int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
                            const char *hash_order, const int func_count )
 {
   uint32_t vhash[20*2] __attribute__ ((aligned (64)));
   uint32_t hash0[20] __attribute__ ((aligned (32)));
   uint32_t hash1[20] __attribute__ ((aligned (32)));
   x16r_2x64_context_overlay ctx;
   memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   int size = 80;
   dintrlv_2x64( hash0, hash1, input, 640 );
   for ( int i = 0; i < func_count; i++ )
   {
      const char elem = hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
      switch ( algo )
      {
         case BLAKE:
            if ( i == 0 )
               blake512_2x64_full( &ctx.blake, vhash, input, size );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               blake512_2x64_full( &ctx.blake, vhash, vhash, size );
            }
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case BMW:
            bmw512_2x64_init( &ctx.bmw );
            if ( i == 0 )
               bmw512_2x64_update( &ctx.bmw, input, size );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               bmw512_2x64_update( &ctx.bmw, vhash, size );
            }
            bmw512_2x64_close( &ctx.bmw, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case GROESTL:
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
            groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
            groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
 #else
            sph_groestl512_init( &ctx.groestl );
            sph_groestl512( &ctx.groestl, in0, size );
            sph_groestl512_close( &ctx.groestl, hash0 );
            sph_groestl512_init( &ctx.groestl );
            sph_groestl512( &ctx.groestl, in1, size );
            sph_groestl512_close( &ctx.groestl, hash1 );
 #endif
          break;
         case JH:
            if ( i == 0 )
               jh512_2x64_update( &ctx.jh, input + (64*2), 16 );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               jh512_2x64_init( &ctx.jh );
               jh512_2x64_update( &ctx.jh, vhash, size );
            }
            jh512_2x64_close( &ctx.jh, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case KECCAK:
           if ( i == 0 )
               keccak512_2x64_update( &ctx.keccak, input + (72*2), 8 );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               keccak512_2x64_init( &ctx.keccak );
               keccak512_2x64_update( &ctx.keccak, vhash, size );
            }
            keccak512_2x64_close( &ctx.keccak, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case SKEIN:
            if ( i == 0 )
               skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               skein512_2x64_full( &ctx.skein, vhash, vhash, size );
            }
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case LUFFA:
            if ( i == 0 )
            {
               update_and_final_luffa( &ctx.luffa, hash0, in0 + 64, 16 );
               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
               update_and_final_luffa( &ctx.luffa, hash1, in1 + 64, 16 );
            }
            else
            {
               luffa_full( &ctx.luffa, hash0, 512, hash0, size );
               luffa_full( &ctx.luffa, hash1, 512, hash1, size );
            }
         break;
         case CUBEHASH:
            if ( i == 0 )
            {
               cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
               cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
            }
            else
            {
               cubehash_full( &ctx.cube, hash0, 512, hash0, size );
               cubehash_full( &ctx.cube, hash1, 512, hash1, size );
            }
         break;
         case SHAVITE:
            shavite512_full( &ctx.shavite, hash0, in0, size );
            shavite512_full( &ctx.shavite, hash1, in1, size );
          break;
         case SIMD:
            simd512_ctx( &ctx.simd, hash0, in0, size );
            simd512_ctx( &ctx.simd, hash1, in1, size );
         break;
         case ECHO:
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
            echo_full( &ctx.echo, hash0, 512, in0, size );
            echo_full( &ctx.echo, hash1, 512, in1, size );
 #else
            sph_echo512_init( &ctx.echo );
            sph_echo512( &ctx.echo, in0, size );
            sph_echo512_close( &ctx.echo, hash0 );
            sph_echo512_init( &ctx.echo );
            sph_echo512( &ctx.echo, in1, size );
            sph_echo512_close( &ctx.echo, hash1 );
 #endif
          break;
         case HAMSI:
 #if defined(__SSE4_2__) || defined(__ARM_NEON)
            if ( i == 0 )
               hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
            else
            {
               intrlv_2x64( vhash, hash0, hash1, size<<3 );
               hamsi512_2x64_init( &ctx.hamsi );
               hamsi512_2x64_update( &ctx.hamsi, vhash, size );
            }
            hamsi512_2x64_close( &ctx.hamsi, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
 #else
            if ( i == 0 )
            {
               sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
               sph_hamsi512_close( &ctx.hamsi, hash0 );
               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
               sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
               sph_hamsi512_close( &ctx.hamsi, hash1 );
            }
            else
            {
               sph_hamsi512_init( &ctx.hamsi );
               sph_hamsi512( &ctx.hamsi, hash0, size );
               sph_hamsi512_close( &ctx.hamsi, hash0 );
               sph_hamsi512_init( &ctx.hamsi );
               sph_hamsi512( &ctx.hamsi, hash1, size );
               sph_hamsi512_close( &ctx.hamsi, hash1 );
             }
 #endif
            break;
         case FUGUE:
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
            if ( i == 0 )
            {
               fugue512_update( &ctx.fugue, in0 + 76, 4 );
               fugue512_final( &ctx.fugue, hash0 );
               memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in1 + 76, 4 );
               fugue512_final( &ctx.fugue, hash1 );
            }
            else
            {
               fugue512_full( &ctx.fugue, hash0, hash0, size );
               fugue512_full( &ctx.fugue, hash1, hash1, size );
            }
 #else
            if ( i == 0 )
            {
               sph_fugue512( &ctx.fugue, in0 + 76, 4 );
               sph_fugue512_close( &ctx.fugue, hash0 );
               memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) );
               sph_fugue512( &ctx.fugue, in1 + 76, 4 );
               sph_fugue512_close( &ctx.fugue, hash1 );
            }
             else
             {
                sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
                sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
             }
 #endif
             break;
         case SHABAL:
            if ( i == 0 )
            {
               sph_shabal512( &ctx.shabal, in0 + 64, 16 );
               sph_shabal512_close( &ctx.shabal, hash0 );
               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
               sph_shabal512( &ctx.shabal, in1 + 64, 16 );
               sph_shabal512_close( &ctx.shabal, hash1 );
            }
            else
            {
               sph_shabal512_init( &ctx.shabal );
               sph_shabal512( &ctx.shabal, hash0, size );
               sph_shabal512_close( &ctx.shabal, hash0 );
               sph_shabal512_init( &ctx.shabal );
               sph_shabal512( &ctx.shabal, hash1, size );
               sph_shabal512_close( &ctx.shabal, hash1 );
             }
         break;
         case WHIRLPOOL:
            if ( i == 0 )
            {
               sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
               sph_whirlpool_close( &ctx.whirlpool, hash0 );
               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
               sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
               sph_whirlpool_close( &ctx.whirlpool, hash1 );
               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
            }
            else
            {
               sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, size );
               sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, size );
            }
         break;
         case SHA_512:
            sha512_2x64_init( &ctx.sha512 );
            if ( i == 0 )
               sha512_2x64_update( &ctx.sha512, input, size );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               sha512_2x64_init( &ctx.sha512 );
               sha512_2x64_update( &ctx.sha512, vhash, size );
            }
            sha512_2x64_close( &ctx.sha512, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
      }
      if ( work_restart[thrid].restart ) return 0;
      size = 64;
   }
   memcpy( output,     hash0, 64 );
   memcpy( output+64,  hash1, 64 );
   return 1;
 }
 int x16r_2x64_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*2] __attribute__ ((aligned (64)));
   if ( !x16r_2x64_hash_generic( hash, input, thrid, x16r_hash_order,
                                 X16R_HASH_FUNC_COUNT ) )
      return 0;
   memcpy( output,     hash,     32 );
   memcpy( output+32,  hash+64,  32 );
   return 1;
 }
 int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[16*2] __attribute__ ((aligned (64)));
   uint32_t vdata[20*2] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   uint32_t n = first_nonce;
   v128_t *noncev = (v128_t*)vdata + 9;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   if ( bench )  ptarget[7] = 0x0cff;
   static __thread uint32_t saved_height = UINT32_MAX;
   if ( work->height != saved_height )
   {
      vdata[1] = bswap_32( pdata[1] );
      vdata[2] = bswap_32( pdata[2] );
      saved_height = work->height;
      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x16r_hash_order );
   }
   x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
      if ( x16r_2x64_hash( hash, vdata, thr_id ) );
      for ( int i = 0; i < 2; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n+i );
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
 #endif
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -13,10 +13,13 @@ __thread x16r_8way_context_overlay x16r_ctx;
 __thread x16r_4way_context_overlay x16r_ctx;
 #elif defined(X16R_2WAY)
 __thread x16r_2x64_context_overlay x16r_ctx;
 #endif
-__thread x16r_context_overlay x16_ctx;
+__thread x16r_context_overlay x16r_ref_ctx;
 void x16r_getAlgoString( const uint8_t* prevblock, char *output )
 {
@@ -58,11 +61,15 @@ bool register_x16r_algo( algo_gate_t* gate )
 #elif defined(X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #elif defined(X16R_2WAY)
  gate->scanhash  = (void*)&scanhash_x16r_2x64;
  gate->hash      = (void*)&x16r_2x64_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -76,11 +83,15 @@ bool register_x16rv2_algo( algo_gate_t* gate )
 #elif defined(X16RV2_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
  gate->hash      = (void*)&x16rv2_4way_hash;
 #elif defined(X16RV2_2WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_2x64;
  gate->hash      = (void*)&x16rv2_2x64_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rv2;
  gate->hash      = (void*)&x16rv2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -94,11 +105,15 @@ bool register_x16s_algo( algo_gate_t* gate )
 #elif defined(X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #elif defined(X16R_2WAY)
  gate->scanhash  = (void*)&scanhash_x16r_2x64;
  gate->hash      = (void*)&x16r_2x64_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
  x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -108,7 +123,6 @@ bool register_x16s_algo( algo_gate_t* gate )
 //
 //   X16RT
 void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
 {
    int32_t maskedTime = timeStamp & 0xffffff80;
@@ -221,34 +235,42 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 bool register_x16rt_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
+#if defined(X16RT_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16r_8way_hash;
-#elif defined (X16R_4WAY)
+#elif defined(X16RT_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #elif defined(X16RT_2WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_2x64;
  gate->hash      = (void*)&x16r_2x64_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
  opt_target_factor = 256.0;
  return true;
 };
 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
+#if defined(X16RT_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16r_8way_hash;
-#elif defined (X16R_4WAY)
+#elif defined(X16RT_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #elif defined(X16RT_2WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_2x64;
  gate->hash      = (void*)&x16r_2x64_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
  gate->build_extraheader = (void*)&veil_build_extraheader;
  opt_target_factor = 256.0;
  return true;
@@ -262,7 +284,7 @@ bool register_hex_algo( algo_gate_t* gate )
 {
  gate->scanhash        = (void*)&scanhash_hex;
  gate->hash            = (void*)&x16r_hash;
-  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
  opt_target_factor = 128.0;
  return true;
@@ -274,20 +296,25 @@ bool register_hex_algo( algo_gate_t* gate )
 bool register_x21s_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
+#if defined(X21S_8WAY)
  gate->scanhash          = (void*)&scanhash_x21s_8way;
  gate->hash              = (void*)&x21s_8way_hash;
  gate->miner_thread_init = (void*)&x21s_8way_thread_init;
-#elif defined (X16R_4WAY)
+#elif defined(X21S_4WAY)
  gate->scanhash          = (void*)&scanhash_x21s_4way;
  gate->hash              = (void*)&x21s_4way_hash;
  gate->miner_thread_init = (void*)&x21s_4way_thread_init;
 #elif defined(X21S_2WAY)
  gate->scanhash          = (void*)&scanhash_x21s_2x64;
  gate->hash              = (void*)&x21s_2x64_hash;
  gate->miner_thread_init = (void*)&x21s_2x64_thread_init;
 #else
  gate->scanhash          = (void*)&scanhash_x21s;
  gate->hash              = (void*)&x21s_hash;
  gate->miner_thread_init = (void*)&x21s_thread_init;
 #endif
-  gate->optimizations  = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
  x16_r_s_getAlgoString   = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -7,13 +7,15 @@
 #include <unistd.h>
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/groestl/sph_groestl.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/groestl/sph_groestl.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/sph_simd.h"
 #include "algo/simd/nist.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
@@ -21,13 +23,13 @@
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/sha/sha512-hash.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/fugue/fugue-aesni.h"
 #endif
-#if defined (__AVX2__)
+//#if defined (__AVX2__)
  #include "algo/bmw/bmw-hash-4way.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/skein/skein-hash-4way.h"
@@ -39,7 +41,7 @@
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/hamsi/hamsi-hash-4way.h"
  #include "algo/shabal/shabal-hash-4way.h"
-#endif
+//#endif
 #if defined(__VAES__)
  #include "algo/groestl/groestl512-hash-4way.h"
@@ -48,28 +50,41 @@
  #include "algo/echo/echo-hash-4way.h"
 #endif
-#if defined(__aarch64__)
+// X16R, X16S
-  #include "algo/simd/sph_simd.h"
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-#else
+  #define X16R_8WAY   1
-  #include "algo/simd/nist.h"
+#elif defined(__AVX2__) && defined(__AES__)
  #define X16R_4WAY   1
 #elif defined(__SSE2__) || defined(__ARM_NEON)
  #define X16R_2WAY   1
 #endif
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define X16R_8WAY   1
  #define X16RV2_8WAY 1
  #define X16RT_8WAY  1
  #define X21S_8WAY   1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X16RV2_4WAY 1
-  #define X16RT_4WAY  1
+#elif defined(__SSE2__) || defined(__ARM_NEON)
-  #define X21S_4WAY   1
+  #define X16RV2_2WAY 1
  #define X16R_4WAY   1
 #endif
 // X16RT, VEIL
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define X16RT_8WAY  1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X16RT_4WAY  1
 #elif defined(__SSE2__) || defined(__ARM_NEON)
  #define X16RT_2WAY  1
 #endif
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define X21S_8WAY   1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X21S_4WAY   1
 #elif defined(__SSE2__) || defined(__ARM_NEON)
  #define X21S_2WAY   1
 #endif
 enum x16r_Algo {
        BLAKE = 0,
        BMW,
@@ -134,18 +149,23 @@ union _x16r_8way_context_overlay
    hashState_echo          echo;
 #endif
 } __attribute__ ((aligned (64)));
 #define  _x16r_8x64_context_overlay _x16r_8way_context_overlay
 typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
 #define  x16r_8x64_context_overlay x16r_8way_context_overlay
 extern __thread x16r_8way_context_overlay x16r_ctx;
-void x16r_8way_prehash( void *, void * );
+void x16r_8way_prehash( void *, void *, const char * );
-int x16r_8way_hash_generic( void *, const void *, int );
+int x16r_8way_hash_generic( void *, const void *, int, const char*, const int );
 int x16r_8way_hash( void *, const void *, int );
 int scanhash_x16r_8way( struct work *, uint32_t ,
                        uint64_t *, struct thr_info * );
 extern __thread x16r_8way_context_overlay x16r_ctx;
 #define x16r_8x64_prehash         x16r_8way_prehash
 #define x16r_8x64_hash_generic    x16r_8way_hash_generic
 #define x16r_8x64_hash            x16r_8way_hash
 #define scanhash_x16r_8x64        scanhash_x16r_8x64
 #elif defined(X16R_4WAY)
@@ -167,7 +187,6 @@ union _x16r_4way_context_overlay
    keccak512_4way_context  keccak;
    luffa_2way_context      luffa;
    cube_2way_context       cube;
    hashState_luffa         luffa1;
    simd_2way_context       simd;
    hamsi512_4way_context   hamsi;
    hashState_fugue         fugue;
@@ -175,46 +194,102 @@ union _x16r_4way_context_overlay
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
 } __attribute__ ((aligned (64)));
 #define  _x16r_4x64_context_overlay _x16r_4way_context_overlay
 typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
 #define  x16r_4x64_context_overlay x16r_4way_context_overlay
 extern __thread x16r_4way_context_overlay x16r_ctx;
-void x16r_4way_prehash( void *, void * );
+void x16r_4way_prehash( void *, void *, const char * );
-int x16r_4way_hash_generic( void *, const void *, int );
+int x16r_4way_hash_generic( void *, const void *, int, const char*, const int );
 int x16r_4way_hash( void *, const void *, int );
 int scanhash_x16r_4way( struct work *, uint32_t,
                        uint64_t *, struct thr_info * );
-extern __thread x16r_4way_context_overlay x16r_ctx;
+
 #define x16r_4x64_prehash         x16r_4way_prehash
 #define x16r_4x64_hash_generic    x16r_4way_hash_generic
 #define x16r_4x64_hash            x16r_4way_hash
 #define scanhash_x16r_4x64        scanhash_x16r_4x64
 #elif defined(X16R_2WAY)
 union _x16r_2x64_context_overlay
 {
    blake512_2x64_context   blake;
    bmw512_2x64_context     bmw;
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_groestl       groestl;
 #else
    sph_groestl512_context  groestl;
 #endif
    skein512_2x64_context   skein;
    jh512_2x64_context      jh;
    keccak512_2x64_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
    shavite512_context      shavite;
    simd512_context         simd;
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_echo          echo;
 #else
    sph_echo512_context     echo;
 #endif
 #if defined(__SSE4_2__) || defined(__ARM_NEON)
    hamsi_2x64_context      hamsi;
 #else
    sph_hamsi512_context    hamsi;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_fugue         fugue;
 #else
    sph_fugue512_context    fugue;
 #endif
    sph_shabal512_context   shabal;
    sph_whirlpool_context   whirlpool;
    sha512_2x64_context     sha512;
 } __attribute__ ((aligned (64)));
 typedef union _x16r_2x64_context_overlay x16r_2x64_context_overlay;
 void x16r_2x64_prehash( void *, void *, const char * );
 int x16r_2x64_hash_generic( void *, const void *, int, const char*, const int );
 int x16r_2x64_hash( void *, const void *, int );
 int scanhash_x16r_2x64( struct work *, uint32_t,
                        uint64_t *, struct thr_info * );
 extern __thread x16r_2x64_context_overlay x16r_ctx;
 #endif
 // need a reference, add hooks for SSE2.
 // needed for hex
 union _x16r_context_overlay
 {
 #if defined(__AES__)
        hashState_echo          echo;
        hashState_groestl       groestl;
        hashState_fugue         fugue;
 #else
        sph_groestl512_context   groestl;
        sph_echo512_context      echo;
        sph_fugue512_context    fugue;
 #endif
        blake512_context        blake;
        sph_bmw512_context      bmw;
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_groestl       groestl;
 #else
        sph_groestl512_context  groestl;
 #endif
        sph_skein512_context    skein;
        sph_jh512_context       jh;
        sph_keccak512_context   keccak;
        hashState_luffa         luffa;
        cubehashParam           cube;
        shavite512_context      shavite;
-#if defined(__aarch64__)
+        simd512_context         simd;
-        sph_simd512_context     simd;
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_echo          echo;
 #else
-        hashState_sd            simd;
+        sph_echo512_context     echo;
 #endif
        sph_hamsi512_context    hamsi;
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_fugue         fugue;
 #else
        sph_fugue512_context    fugue;
 #endif
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
        sph_sha512_context      sha512;
@@ -222,10 +297,10 @@ union _x16r_context_overlay
 typedef union _x16r_context_overlay x16r_context_overlay;
-extern __thread x16r_context_overlay x16_ctx;
+extern __thread x16r_context_overlay x16r_ref_ctx;
-void x16r_prehash( void *, void * );
+void x16r_prehash( void *, void *, const char * );
-int x16r_hash_generic( void *, const void *, int );
+int x16r_hash_generic( void *, const void *, int, const char*, const int );
 int x16r_hash( void *, const void *, int );
 int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );
@@ -242,6 +317,12 @@ int x16rv2_4way_hash( void *state, const void *input, int thrid );
 int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 #elif defined(X16RV2_2WAY)
 int x16rv2_2x64_hash( void *state, const void *input, int thrid );
 int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 #else
 int x16rv2_hash( void *state, const void *input, int thr_id );
@@ -251,18 +332,24 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
 #endif
 // x16rt, veil
-#if defined(X16R_8WAY)
+#if defined(X16RT_8WAY)
 //void x16rt_8way_hash( void *state, const void *input );
 int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
-#elif defined(X16R_4WAY)
+#elif defined(X16RT_4WAY)
 //void x16rt_4way_hash( void *state, const void *input );
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 #elif defined(X16RT_2WAY)
 //void x16rt_4way_hash( void *state, const void *input );
 int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 #else
 //void x16rt_hash( void *state, const void *input );
@@ -272,20 +359,27 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
 #endif
 // x21s
-#if defined(X16R_8WAY)
+#if defined(X21S_8WAY)
 int x21s_8way_hash( void *state, const void *input, int thrid );
 int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_8way_thread_init();
-#elif defined(X16R_4WAY)
+#elif defined(X21S_4WAY)
 int x21s_4way_hash( void *state, const void *input, int thrid );
 int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_4way_thread_init();
 #elif defined(X21S_2WAY)
 int x21s_2x64_hash( void *state, const void *input, int thrid );
 int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_2x64_thread_init();
 #else
 int x21s_hash( void *state, const void *input, int thr_id );
--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -10,55 +10,60 @@
 #include <stdlib.h>
 #include <string.h>
-void x16r_prehash( void *edata, void *pdata )
+void x16r_prehash( void *edata, void *pdata, const char *hash_order )
 {
-   const char elem = x16r_hash_order[0];
+   const char elem = hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
   switch ( algo )
   {
      case JH:
-         sph_jh512_init( &x16_ctx.jh );
+         sph_jh512_init( &x16r_ref_ctx.jh );
-         sph_jh512( &x16_ctx.jh, edata, 64 );
+         sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
      break;
      case SKEIN:
-         sph_skein512_init( &x16_ctx.skein );
+         sph_skein512_init( &x16r_ref_ctx.skein );
-         sph_skein512( &x16_ctx.skein, edata, 64 );
+         sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
      break;
      case KECCAK:
         sph_keccak512_init( &x16r_ref_ctx.keccak );
         sph_keccak512( &x16r_ref_ctx.keccak, edata, 72 );
      break;
      case LUFFA:
-         init_luffa( &x16_ctx.luffa, 512 );
+         init_luffa( &x16r_ref_ctx.luffa, 512 );
-         update_luffa( &x16_ctx.luffa, edata, 64 );
+         update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
      break;
      case CUBEHASH:
-         cubehashInit( &x16_ctx.cube, 512, 16, 32 );
+         cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x16_ctx.cube, edata, 64 );
+         cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
      break;
      case HAMSI:
-         sph_hamsi512_init( &x16_ctx.hamsi );
+         sph_hamsi512_init( &x16r_ref_ctx.hamsi );
-         sph_hamsi512( &x16_ctx.hamsi, edata, 64 );
+         sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 72 );
         break;
      case SHABAL:
-         sph_shabal512_init( &x16_ctx.shabal );
+         sph_shabal512_init( &x16r_ref_ctx.shabal );
-         sph_shabal512( &x16_ctx.shabal, edata, 64 );
+         sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
      break;
      case WHIRLPOOL:
-         sph_whirlpool_init( &x16_ctx.whirlpool );
+         sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
-         sph_whirlpool( &x16_ctx.whirlpool, edata, 64 );
+         sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
      break;
   }
 }
-int x16r_hash_generic( void* output, const void* input, int thrid )
+int x16r_hash_generic( void* output, const void* input, int thrid, 
                       const char *hash_order, const int func_count )
 {
-   uint32_t _ALIGN(128) hash[16];
+   uint32_t _ALIGN(32) hash[16];
   x16r_context_overlay ctx;
-   memcpy( &ctx, &x16_ctx, sizeof(ctx) );
+   memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
   void *in = (void*) input;
   int size = 80;
-   for ( int i = 0; i < 16; i++ )
+   for ( int i = 0; i < func_count; i++ )
   {
-      const char elem = x16r_hash_order[i];
+      const char elem = hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
      switch ( algo )
@@ -74,8 +79,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
            sph_bmw512_close( &ctx.bmw, hash );
         break;
         case GROESTL:
-#if defined(__AES__)
+#if defined(__AES__)  // || defined(__ARM_FEATURE_AES)
-            groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
+            groestl512_full( &ctx.groestl, hash, in, size<<3 );
 #else
            sph_groestl512_init( &ctx.groestl );
            sph_groestl512( &ctx.groestl, in, size );
@@ -93,8 +98,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
            sph_jh512_close( &ctx.jh, hash );
         break;
         case KECCAK:
            if ( i == 0 )
               sph_keccak512( &ctx.keccak, in+72, 8 );
            else
            {
               sph_keccak512_init( &ctx.keccak );
               sph_keccak512( &ctx.keccak, in, size );
            }
            sph_keccak512_close( &ctx.keccak, hash );
         break;
         case SKEIN:
@@ -109,13 +119,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
         break;
         case LUFFA:
            if ( i == 0 )
-               update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
+               update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
            else
               luffa_full( &ctx.luffa, hash, 512, in, size );
            break;
         case CUBEHASH:
            if ( i == 0 )
-               cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
+               cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
            else
               cubehash_full( &ctx.cube, hash, 512, in, size );
         break;
@@ -123,19 +133,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
            shavite512_full( &ctx.shavite, hash, in, size );
         break;
         case SIMD:
 #if defined(__aarch64__)
            sph_simd512_init( &ctx.simd );
-            sph_simd512(&ctx.simd, (const void*) hash, 64);
+            sph_simd512( &ctx.simd, hash, size );
            sph_simd512_close( &ctx.simd, hash );
 #else
            simd_full( &ctx.simd, (BitSequence *)hash,
                             (const BitSequence*)in, size<<3 );
 #endif
         break;
         case ECHO:
 #if defined(__AES__)
-            echo_full( &ctx.echo, (BitSequence*)hash, 512,
+            echo_full( &ctx.echo, hash, 512, in, size );
                            (const BitSequence*)in, size );
 #else
            sph_echo512_init( &ctx.echo );
            sph_echo512( &ctx.echo, in, size );
@@ -144,7 +148,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
         break;
         case HAMSI:
            if ( i == 0 )
-               sph_hamsi512( &ctx.hamsi, in+64, 16 );
+               sph_hamsi512( &ctx.hamsi, in+72, 8 );
            else
            {
               sph_hamsi512_init( &ctx.hamsi );
@@ -153,11 +157,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
            sph_hamsi512_close( &ctx.hamsi, hash );
         break;
         case FUGUE:
 #if defined(__AES__)
         fugue512_full( &ctx.fugue, hash, in, size );
 #else
 	         sph_fugue512_full( &ctx.fugue, hash, in, size );
 #endif
 	      break;
         case SHABAL:
            if ( i == 0 )
@@ -197,7 +197,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
 int x16r_hash( void* output, const void* input, int thrid )
 {  
   uint8_t hash[64] __attribute__ ((aligned (64)));
-   if ( !x16r_hash_generic( hash, input, thrid ) )
+   if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order, 
                            X16R_HASH_FUNC_COUNT ) )
      return 0;
    memcpy( output, hash, 32 );
@@ -207,8 +208,8 @@ int x16r_hash( void* output, const void* input, int thrid )
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(32) hash32[8];
-   uint32_t _ALIGN(128) edata[20];
+   uint32_t _ALIGN(32) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -230,7 +231,7 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
           applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }
-   x16r_prehash( edata, pdata );
+   x16r_prehash( edata, pdata, x16r_hash_order );
   do
   {
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -3,7 +3,7 @@
 #include <stdlib.h>
 #include <string.h>
-#if defined (X16R_8WAY)
+#if defined (X16RT_8WAY)
 int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
@@ -30,12 +30,12 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
-      if ( !thr_id )
+      if ( !opt_quiet && !thr_id )
-          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
+          applog( LOG_INFO, "Hash order %s, Ntime %08x",
-                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
+                            x16r_hash_order, bswap_32( pdata[17] ) );
   }
-   x16r_8way_prehash( vdata, pdata );
+   x16r_8way_prehash( vdata, pdata, x16r_hash_order );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
@@ -57,7 +57,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
   return 0;
 }
-#elif defined (X16R_4WAY)
+#elif defined (X16RT_4WAY)
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
@@ -84,12 +84,12 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
-      if ( !thr_id )
+      if ( !opt_quiet && !thr_id )
-          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
+          applog( LOG_INFO, "Hash order %s, Ntime %08x",
-                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
+                            x16r_hash_order, bswap_32( pdata[17] ) );
   }
-   x16r_4way_prehash( vdata, pdata );
+   x16r_4way_prehash( vdata, pdata, x16r_hash_order );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
@@ -110,4 +110,55 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
   return 0;
 }
 #elif defined (X16RT_2WAY)
 int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[2*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) timeHash[4*8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   v128_t *noncev = (v128_t*)vdata + 9;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   const bool bench = opt_benchmark;
   if ( bench )  ptarget[7] = 0x0cff;
   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
   if ( s_ntime != masked_ntime )
   {
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
      if ( !opt_quiet && !thr_id )
          applog( LOG_INFO, "Hash order %s, Ntime %08x",
                            x16r_hash_order, bswap_32( pdata[17] ) );
   }
   x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
      if ( x16r_2x64_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 2; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n+i );
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( (  n < last_nonce ) && !(*restart) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
 #endif
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -1,6 +1,6 @@
 #include "x16r-gate.h"
-#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
+#if !defined(X16RT_8WAY) && !defined(X16RT_4WAY)
 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
@@ -31,7 +31,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                        x16r_hash_order, swab32( pdata[17] ), timeHash );
   }
-   x16r_prehash( edata, pdata );
+   x16r_prehash( edata, pdata, x16r_hash_order );
   do
   {
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -395,7 +395,7 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
         break;
         case HAMSI:
            if ( i == 0 )
-               hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
+               hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -409,14 +409,43 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
                          hash7, vhash );
         break;
         case FUGUE:
-            fugue512_full( &ctx.fugue, hash0, in0, size );
+            if ( i == 0 )
-            fugue512_full( &ctx.fugue, hash1, in1, size );
+            {
-            fugue512_full( &ctx.fugue, hash2, in2, size );
+               fugue512_update( &ctx.fugue, in0 + 76, 4 );
-            fugue512_full( &ctx.fugue, hash3, in3, size );
+               fugue512_final( &ctx.fugue, hash0 );
-            fugue512_full( &ctx.fugue, hash4, in4, size );
+               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
-            fugue512_full( &ctx.fugue, hash5, in5, size );
+               fugue512_update( &ctx.fugue, in1 + 76, 4 );
-            fugue512_full( &ctx.fugue, hash6, in6, size );
+               fugue512_final( &ctx.fugue, hash1 );
-            fugue512_full( &ctx.fugue, hash7, in7, size );
+               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in2 + 76, 4 );
               fugue512_final( &ctx.fugue, hash2 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in3 + 76, 4 );
               fugue512_final( &ctx.fugue, hash3 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) ); 
               fugue512_update( &ctx.fugue, in4 + 76, 4 ); 
               fugue512_final( &ctx.fugue, hash4 ); 
               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) ); 
               fugue512_update( &ctx.fugue, in5 + 76, 4 );
               fugue512_final( &ctx.fugue, hash5 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in6 + 76, 4 );
               fugue512_final( &ctx.fugue, hash6 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in7 + 76, 4 );
               fugue512_final( &ctx.fugue, hash7 );
            }
            else
            {
               fugue512_full( &ctx.fugue, hash0, hash0, size );
               fugue512_full( &ctx.fugue, hash1, hash1, size );
               fugue512_full( &ctx.fugue, hash2, hash2, size );
               fugue512_full( &ctx.fugue, hash3, hash3, size );
               fugue512_full( &ctx.fugue, hash4, hash4, size );
               fugue512_full( &ctx.fugue, hash5, hash5, size );
               fugue512_full( &ctx.fugue, hash6, hash6, size );
               fugue512_full( &ctx.fugue, hash7, hash7, size );
            }
         break;
         case SHABAL:
            intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -564,7 +593,6 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -577,19 +605,15 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
   if ( bench ) ptarget[7] = 0x0cff;
-   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   static __thread uint32_t saved_height = UINT32_MAX;
-
+   if ( work->height != saved_height )
   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );
   static __thread uint32_t s_ntime = UINT32_MAX;
   const uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      vdata[1] = bswap_32( pdata[1] );
-      s_ntime = ntime;
+      vdata[2] = bswap_32( pdata[2] );
-      if ( opt_debug && !thr_id )
+      saved_height = work->height;
-         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x16r_hash_order );
   }
   // Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -626,7 +650,14 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
      case HAMSI:
         mm512_bswap32_intrlv80_8x64( vdata, pdata );
         hamsi512_8way_init( &x16rv2_ctx.hamsi );
-         hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
+         hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
      break;
      case FUGUE:
         v128_bswap32_80( edata, pdata );
         fugue512_init( &x16rv2_ctx.fugue );
         fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
         intrlv_8x64( vdata, edata, edata, edata, edata,
                             edata, edata, edata, edata, 640 );
      break;
      case SHABAL:
         mm256_bswap32_intrlv80_8x32( vdata2, pdata );
@@ -824,8 +855,8 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               skein512_4way_init( &ctx.skein );
               skein512_4way_update( &ctx.skein, vhash, size );
            }
               skein512_4way_close( &ctx.skein, vhash );
            }
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case LUFFA:
@@ -945,7 +976,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
         break;
         case HAMSI:
            if ( i == 0 )
-               hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
+               hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -956,10 +987,27 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case FUGUE:
-            fugue512_full( &ctx.fugue, hash0, in0, size );
+            if ( i == 0 )
-            fugue512_full( &ctx.fugue, hash1, in1, size );
+            {
-            fugue512_full( &ctx.fugue, hash2, in2, size );
+               fugue512_update( &ctx.fugue, in0 + 76, 4 );
-            fugue512_full( &ctx.fugue, hash3, in3, size );
+               fugue512_final( &ctx.fugue, hash0 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in1 + 76, 4 );
               fugue512_final( &ctx.fugue, hash1 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in2 + 76, 4 );
               fugue512_final( &ctx.fugue, hash2 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in3 + 76, 4 );
               fugue512_final( &ctx.fugue, hash3 );
            }
            else
            {
               fugue512_full( &ctx.fugue, hash0, hash0, size );
               fugue512_full( &ctx.fugue, hash1, hash1, size );
               fugue512_full( &ctx.fugue, hash2, hash2, size );
               fugue512_full( &ctx.fugue, hash3, hash3, size );
            }
         break;
         case SHABAL:
             intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
@@ -1055,7 +1103,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
   uint32_t edata[20];
   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -1068,17 +1115,15 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
   if ( bench )  ptarget[7] = 0x0fff;
-   bedata1[0] = bswap_32( pdata[1] );
+   static __thread uint32_t saved_height = UINT32_MAX;
-   bedata1[1] = bswap_32( pdata[2] );
+   if ( work->height != saved_height )
   static __thread uint32_t s_ntime = UINT32_MAX;
   const uint32_t ntime = bswap_32(pdata[17]);
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      vdata[1] = bswap_32( pdata[1] );
-      s_ntime = ntime;
+      vdata[2] = bswap_32( pdata[2] );
-      if ( opt_debug && !thr_id )
+      saved_height = work->height;
-         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x16r_hash_order );
   }
   // Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -1101,7 +1146,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
      break;
      case SKEIN:
         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         skein512_4way_prehash64( &x16r_ctx.skein, vdata );
+         skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
      break;
      case CUBEHASH:
         v128_bswap32_80( edata, pdata );
@@ -1112,7 +1157,13 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
      case HAMSI:
         mm256_bswap32_intrlv80_4x64( vdata, pdata );
         hamsi512_4way_init( &x16rv2_ctx.hamsi );
-         hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
+         hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
      break;
      case FUGUE:
         v128_bswap32_80( edata, pdata );
         fugue512_init( &x16rv2_ctx.fugue );
         fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
      break;
      case SHABAL:
         v128_bswap32_intrlv80_4x32( vdata32, pdata );
@@ -1151,4 +1202,450 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
   return 0;
 }
 #elif defined (X16RV2_2WAY)
 union _x16rv2_2x64_context_overlay
 {
    blake512_2x64_context   blake;
    bmw512_2x64_context     bmw;
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_groestl       groestl;
 #else
    sph_groestl512_context  groestl;
 #endif
    skein512_2x64_context   skein;
    jh512_2x64_context      jh;
    keccak512_2x64_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
    shavite512_context      shavite;
    simd512_context         simd;
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_echo          echo;
 #else
    sph_echo512_context     echo;
 #endif
 #if defined(__SSE4_2__) || defined(__ARM_NEON)
    hamsi_2x64_context      hamsi;
 #else
    sph_hamsi512_context    hamsi;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_fugue         fugue;
 #else
    sph_fugue512_context    fugue;
 #endif
    sph_shabal512_context   shabal;
    sph_whirlpool_context   whirlpool;
    sha512_2x64_context     sha512;
    sph_tiger_context       tiger;
 } __attribute__ ((aligned (64)));
 typedef union _x16rv2_2x64_context_overlay x16rv2_2x64_context_overlay;
 static __thread x16rv2_2x64_context_overlay x16rv2_ctx;
 // Pad the 24 bytes tiger hash to 64 bytes
 static inline void padtiger512( uint32_t* hash )
 {
  for ( int i = 6; i < 16; i++ ) hash[i] = 0;
 }
 int x16rv2_2x64_hash( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*2] __attribute__ ((aligned (64)));
   uint32_t hash0[20] __attribute__ ((aligned (32)));
   uint32_t hash1[20] __attribute__ ((aligned (32)));
   x16rv2_2x64_context_overlay ctx;
   memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   int size = 80;
   dintrlv_2x64( hash0, hash1, input, 640 );
   for ( int i = 0; i < 16; i++ )
   {
      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
      switch ( algo )
      {
         case BLAKE:
            if ( i == 0 )
               blake512_2x64_full( &ctx.blake, vhash, input, size );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               blake512_2x64_full( &ctx.blake, vhash, vhash, size );
            }
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case BMW:
            bmw512_2x64_init( &ctx.bmw );
            if ( i == 0 )
               bmw512_2x64_update( &ctx.bmw, input, size );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               bmw512_2x64_update( &ctx.bmw, vhash, size );
            }
            bmw512_2x64_close( &ctx.bmw, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case GROESTL:
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
            groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
            groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
 #else
            sph_groestl512_init( &ctx.groestl );
            sph_groestl512( &ctx.groestl, in0, size );
            sph_groestl512_close( &ctx.groestl, hash0 );
            sph_groestl512_init( &ctx.groestl );
            sph_groestl512( &ctx.groestl, in1, size );
            sph_groestl512_close( &ctx.groestl, hash1 );
 #endif
         break;
         case JH:
            if ( i == 0 )
               jh512_2x64_update( &ctx.jh, input + (64<<1), 16 );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               jh512_2x64_init( &ctx.jh );
               jh512_2x64_update( &ctx.jh, vhash, size );
            }
            jh512_2x64_close( &ctx.jh, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case KECCAK:
            if ( i == 0 )
            {
               sph_tiger( &ctx.tiger, in0 + 64, 16 );
               sph_tiger_close( &ctx.tiger, hash0 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
               sph_tiger( &ctx.tiger, in1 + 64, 16 );
               sph_tiger_close( &ctx.tiger, hash1 );
            }
            else
            {
               sph_tiger_init( &ctx.tiger );
               sph_tiger( &ctx.tiger, in0, size );
               sph_tiger_close( &ctx.tiger, hash0 );
               sph_tiger_init( &ctx.tiger );
               sph_tiger( &ctx.tiger, in1, size );
               sph_tiger_close( &ctx.tiger, hash1 );
            }
            for ( int i = (24/4); i < (64/4); i++ )
                hash0[i] = hash1[i] = 0;
            intrlv_2x64( vhash, hash0, hash1, 512 );
            keccak512_2x64_init( &ctx.keccak );
            keccak512_2x64_update( &ctx.keccak, vhash, 64 );
            keccak512_2x64_close( &ctx.keccak, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case SKEIN:
            if ( i == 0 )
               skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
            else
            {
               intrlv_2x64( vhash, in0, in1, size<<3 );
               skein512_2x64_full( &ctx.skein, vhash, vhash,  size );
            }
            dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
         case LUFFA:
            if ( i == 0 )
            {
               sph_tiger( &ctx.tiger, in0 + 64, 16 );
               sph_tiger_close( &ctx.tiger, hash0 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
               sph_tiger( &ctx.tiger, in1 + 64, 16 );
               sph_tiger_close( &ctx.tiger, hash1 );
            }
            else
            {
               sph_tiger_init( &ctx.tiger );
               sph_tiger( &ctx.tiger, in0, size );
               sph_tiger_close( &ctx.tiger, hash0 );
               sph_tiger_init( &ctx.tiger );
               sph_tiger( &ctx.tiger, in1, size );
               sph_tiger_close( &ctx.tiger, hash1 );
            }
            for ( int i = (24/4); i < (64/4); i++ )
                hash0[i] = hash1[i] = 0;
            luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
            luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
         break;
         case CUBEHASH:
            if ( i == 0 )
            {
               cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
               cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
            }
            else
            {
               cubehash_full( &ctx.cube, hash0, 512, hash0, size );
               cubehash_full( &ctx.cube, hash1, 512, hash1, size );
            }
         break;
         case SHAVITE:
            shavite512_full( &ctx.shavite, hash0, in0, size );
            shavite512_full( &ctx.shavite, hash1, in1, size );
         break;
         case SIMD:
            simd512_ctx( &ctx.simd, hash0, in0, size );
            simd512_ctx( &ctx.simd, hash1, in1, size );
         break;
         case ECHO:
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
            echo_full( &ctx.echo, hash0, 512, in0, size );
            echo_full( &ctx.echo, hash1, 512, in1, size );
 #else
            sph_echo512_init( &ctx.echo );
            sph_echo512( &ctx.echo, in0, size );
            sph_echo512_close( &ctx.echo, hash0 );
            sph_echo512_init( &ctx.echo );
            sph_echo512( &ctx.echo, in1, size );
            sph_echo512_close( &ctx.echo, hash1 );
 #endif
         break;
         case HAMSI:
 #if defined(__SSE4_2__) || defined(__ARM_NEON)
            if ( i == 0 )
               hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
            else
            {
               intrlv_2x64( vhash, hash0, hash1, size<<3 );
               hamsi512_2x64_init( &ctx.hamsi );
               hamsi512_2x64_update( &ctx.hamsi, vhash, size );
            }
            hamsi512_2x64_close( &ctx.hamsi, vhash );
            dintrlv_2x64( hash0, hash1, vhash, 512 );
 #else
            if ( i == 0 )
            {
               sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
               sph_hamsi512_close( &ctx.hamsi, hash0 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
               sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
               sph_hamsi512_close( &ctx.hamsi, hash1 );
            }
            else
            {
               sph_hamsi512_init( &ctx.hamsi );
               sph_hamsi512( &ctx.hamsi, hash0, size );
               sph_hamsi512_close( &ctx.hamsi, hash0 );
               sph_hamsi512_init( &ctx.hamsi );
               sph_hamsi512( &ctx.hamsi, hash1, size );
               sph_hamsi512_close( &ctx.hamsi, hash1 );
             }
 #endif
         break;
         case FUGUE:
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
            if ( i == 0 )
            {
               fugue512_update( &ctx.fugue, in0 + 76, 4 );
               fugue512_final( &ctx.fugue, hash0 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
               fugue512_update( &ctx.fugue, in1 + 76, 4 );
               fugue512_final( &ctx.fugue, hash1 );
            }
            else
            {
               fugue512_full( &ctx.fugue, hash0, hash0, size );
               fugue512_full( &ctx.fugue, hash1, hash1, size );
            }
 #else
            if ( i == 0 )
            {
               sph_fugue512( &ctx.fugue, in0 + 76, 4 );
               sph_fugue512_close( &ctx.fugue, hash0 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(sph_fugue512_context) );
               sph_fugue512( &ctx.fugue, in1 + 76, 4 );
               sph_fugue512_close( &ctx.fugue, hash1 );
            }
             else
             {
                sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
                sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
             }
 #endif
             break;
         case SHABAL:
            if ( i == 0 )
            {
               sph_shabal512( &ctx.shabal, in0 + 64, 16 );
               sph_shabal512_close( &ctx.shabal, hash0 );
               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
               sph_shabal512( &ctx.shabal, in1 + 64, 16 );
               sph_shabal512_close( &ctx.shabal, hash1 );
            }
            else
            {
               sph_shabal512_init( &ctx.shabal );
               sph_shabal512( &ctx.shabal, hash0, size );
               sph_shabal512_close( &ctx.shabal, hash0 );
               sph_shabal512_init( &ctx.shabal );
               sph_shabal512( &ctx.shabal, hash1, size );
               sph_shabal512_close( &ctx.shabal, hash1 );
             }
          break;
          case WHIRLPOOL:
            sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
            sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
         break;
         case SHA_512:
             if ( i == 0 )
             {
                sph_tiger( &ctx.tiger, in0 + 64, 16 );
                sph_tiger_close( &ctx.tiger, hash0 );
                memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
                sph_tiger( &ctx.tiger, in1 + 64, 16 );
                sph_tiger_close( &ctx.tiger, hash1 );
             }
             else
             {
                sph_tiger_init( &ctx.tiger );
                sph_tiger( &ctx.tiger, in0, size );
                sph_tiger_close( &ctx.tiger, hash0 );
                sph_tiger_init( &ctx.tiger );
                sph_tiger( &ctx.tiger, in1, size );
                sph_tiger_close( &ctx.tiger, hash1 );
             }
             for ( int i = (24/4); i < (64/4); i++ )
                hash0[i] = hash1[i] = 0;
             intrlv_2x64( vhash, hash0, hash1, 512 );
             sha512_2x64_init( &ctx.sha512 );
             sha512_2x64_update( &ctx.sha512, vhash, 64 );
             sha512_2x64_close( &ctx.sha512, vhash );
             dintrlv_2x64( hash0, hash1, vhash, 512 );
         break;
      }
      if ( work_restart[thrid].restart ) return 0;
      size = 64;
   }
   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
   return 1;
 }
 int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[2*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   v128_t *noncev = (v128_t*)vdata + 9;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   const bool bench = opt_benchmark;
   if ( bench )  ptarget[7] = 0x0fff;
   static __thread uint32_t saved_height = UINT32_MAX;
   if ( work->height != saved_height )
   {
      vdata[1] = bswap_32( pdata[1] );
      vdata[2] = bswap_32( pdata[2] );
      saved_height = work->height;
      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x16r_hash_order );
   }
   // Do midstate prehash on hash functions with block size <= 64 bytes.
   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
   switch ( algo )
   {
      case JH:
         v128_bswap32_intrlv80_2x64( vdata, pdata );
         jh512_2x64_init( &x16rv2_ctx.jh );
         jh512_2x64_update( &x16rv2_ctx.jh, vdata, 64 );
      break;
      case KECCAK:
      case LUFFA:
      case SHA_512:
         v128_bswap32_80( edata, pdata );
         sph_tiger_init( &x16rv2_ctx.tiger );
         sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      case SKEIN:
         v128_bswap32_intrlv80_2x64( vdata, pdata );
         skein512_2x64_prehash64( &x16rv2_ctx.skein, vdata );
      break;
      case CUBEHASH:
         v128_bswap32_80( edata, pdata );
         cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
         cubehashUpdate( &x16rv2_ctx.cube, edata, 64 );
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      case HAMSI:
 #if defined(__SSE4_2__) || defined(__ARM_NEON)
         v128_bswap32_intrlv80_2x64( vdata, pdata );
         hamsi512_2x64_init( &x16rv2_ctx.hamsi );
         hamsi512_2x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
 #else
         v128_bswap32_80( edata, pdata );
         sph_hamsi512_init( &x16rv2_ctx.hamsi );
         sph_hamsi512( &x16rv2_ctx.hamsi, edata, 72 );
         intrlv_2x64( vdata, edata, edata, 640 );
 #endif
      break;
      case FUGUE:
         v128_bswap32_80( edata, pdata );
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
         fugue512_init( &x16rv2_ctx.fugue );
         fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
 #else
         sph_fugue512_init( &x16rv2_ctx.fugue );
         sph_fugue512( &x16rv2_ctx.fugue, edata, 76 );
 #endif
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      case SHABAL:
         v128_bswap32_80( edata, pdata );
         sph_shabal512_init( &x16rv2_ctx.shabal );
         sph_shabal512( &x16rv2_ctx.shabal, edata, 64);
         intrlv_2x64( vdata, edata, edata, 640 );
      break;
      default:
         v128_bswap32_intrlv80_2x64( vdata, pdata );
   }
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
      if ( x16rv2_2x64_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 2; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n+i );
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
 #endif
--- a/algo/x16/x16rv2.c
+++ b/algo/x16/x16rv2.c
@@ -6,21 +6,15 @@
 */
 #include "x16r-gate.h"
-#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
+#if !defined(X16RV2_8WAY) && !defined(X16RV2_4WAY) && !defined(X16RV2_2WAY)
 #include "algo/tiger/sph_tiger.h"
 union _x16rv2_context_overlay
 {
 #if defined(__AES__)
        hashState_echo          echo;
        hashState_groestl       groestl;
        hashState_fugue         fugue;
 #else
        sph_groestl512_context   groestl;
        sph_echo512_context      echo;
        sph_fugue512_context    fugue;
 #endif
        blake512_context        blake;
        sph_bmw512_context      bmw;
        sph_skein512_context    skein;
@@ -29,11 +23,7 @@ union _x16rv2_context_overlay
        hashState_luffa         luffa;
        cubehashParam           cube;
        shavite512_context      shavite;
 #if defined(__aarch64__)
        sph_simd512_context     simd;
 #else
        hashState_sd            simd;
 #endif        
        sph_hamsi512_context    hamsi;
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
@@ -72,15 +62,9 @@ int x16rv2_hash( void* output, const void* input, int thrid )
            sph_bmw512_close(&ctx.bmw, hash);
         break;
         case GROESTL:
 #if defined(__AES__)
            init_groestl( &ctx.groestl, 64 );
            update_and_final_groestl( &ctx.groestl, (char*)hash,
                                      (const char*)in, size<<3 );
 #else
            sph_groestl512_init( &ctx.groestl );
            sph_groestl512( &ctx.groestl, in, size );
            sph_groestl512_close(&ctx.groestl, hash);
 #endif
         break;
         case SKEIN:
            sph_skein512_init( &ctx.skein );
@@ -117,25 +101,14 @@ int x16rv2_hash( void* output, const void* input, int thrid )
            shavite512_full( &ctx.shavite, hash, in, size );
         break;
         case SIMD:
 #if defined(__aarch64__)
            sph_simd512_init( &ctx.simd );
-            sph_simd512(&ctx.simd, (const void*) hash, 64);
+            sph_simd512(&ctx.simd, hash, 64);
            sph_simd512_close(&ctx.simd, hash);
 #else
            simd_full( &ctx.simd, (BitSequence *)hash,
                             (const BitSequence*)in, size<<3 );
 #endif
         break;
         case ECHO:
 #if defined(__AES__)
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash,
                                (const BitSequence*)in, size<<3 );
 #else
             sph_echo512_init( &ctx.echo );
             sph_echo512( &ctx.echo, in, size );
             sph_echo512_close( &ctx.echo, hash );
 #endif
         break;
         case HAMSI:
             sph_hamsi512_init( &ctx.hamsi );
@@ -143,11 +116,7 @@ int x16rv2_hash( void* output, const void* input, int thrid )
             sph_hamsi512_close( &ctx.hamsi, hash );
         break;
         case FUGUE:
 #if defined(__AES__)
             fugue512_full( &ctx.fugue, hash, in, size );
 #else
             sph_fugue512_full( &ctx.fugue, hash, in, size );
 #endif
 	     break;
         case SHABAL:
             sph_shabal512_init( &ctx.shabal );
--- a/algo/x16/x20r.c
+++ b/algo/x16/x20r.c
@@ -0,0 +1,362 @@
 #include "miner.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/groestl/sph_groestl.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/luffa/sph_luffa.h"
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/sph_simd.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/sha/sph_sha2.h"
 #include "x16r-gate.h"
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define X20R_8WAY   1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X20R_4WAY   1
 #elif defined(__SSE2__) || defined(__ARM_NEON)
  #define X20R_2WAY   1
 #endif
 // X20R is not what it seems. It does not permute 20 functions over 20 rounds,
 // it only permutes 16 of them. The last 4 functions are victims of trying to
 // fit 20 elements in the space for only 16. Arithmetic overflow recycles the
 // first 4 functions.  Otherwise it's identical to X16R. 
 // Welcome to the real X20R.
 #define X20R_HASH_FUNC_COUNT 20
 /*
 enum x20r_algo
 {
 	BLAKE = 0,
 	BMW,
 	GROESTL,
 	JH,
 	KECCAK,
 	SKEIN,
 	LUFFA,
 	CUBEHASH,
 	SHAVITE,
 	SIMD,
 	ECHO,
 	HAMSI,
 	FUGUE,
 	SHABAL,
 	WHIRLPOOL,
 	SHA512,
 	HAVAL,       // Last 4 names are meaningless and not used
 	GOST,
 	RADIOGATUN,
 	PANAMA,   
 	X20R_HASH_FUNC_COUNT
 };
 */
 static __thread char x20r_hash_order[ X20R_HASH_FUNC_COUNT + 1 ] = {0};
 static void x20r_getAlgoString(const uint8_t* prevblock, char *output)
 {
 	char *sptr = output;
 	for (int j = 0; j < X20R_HASH_FUNC_COUNT; j++) {
 		uint8_t b = (19 - j) >> 1; // 16 ascii hex chars, reversed
 		uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
 		if (algoDigit >= 10)
 			sprintf(sptr, "%c", 'A' + (algoDigit - 10));
 		else
 			sprintf(sptr, "%u", (uint32_t) algoDigit);
 		sptr++;
 	}
 	*sptr = '\0';
 }
 #if defined(X20R_8WAY)
 int x20r_8x64_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*8] __attribute__ ((aligned (128)));
   if ( !x16r_8x64_hash_generic( hash, input, thrid, x20r_hash_order,
                                 X20R_HASH_FUNC_COUNT ) )
      return 0;
   memcpy( output,     hash,     32 );
   memcpy( output+32,  hash+64,  32 );
   memcpy( output+64,  hash+128, 32 );
   memcpy( output+96,  hash+192, 32 );
   memcpy( output+128, hash+256, 32 );
   memcpy( output+160, hash+320, 32 );
   memcpy( output+192, hash+384, 32 );
   memcpy( output+224, hash+448, 32 );
   return 1;
 }
 int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
   const int thr_id = mythr->id;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   const bool bench = opt_benchmark;
   if ( bench )   ptarget[7] = 0x0cff;
   static __thread uint32_t saved_height = UINT32_MAX;
   if ( work->height != saved_height )
   {
      vdata[1] = bswap_32( pdata[1] );
      vdata[2] = bswap_32( pdata[2] );
      vdata[3] = bswap_32( pdata[3] );
      saved_height = work->height;
      x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x20r_hash_order );
   }
   x16r_8x64_prehash( vdata, pdata, x20r_hash_order );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
      if( x20r_8x64_hash( hash, vdata, thr_id ) );
      for ( int i = 0; i < 8; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n+i );
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm512_add_epi32( *noncev,
                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
 #elif defined(X20R_4WAY)
 int x20r_4x64_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*4] __attribute__ ((aligned (64)));
   if ( !x16r_4x64_hash_generic( hash, input, thrid, x20r_hash_order,
                                 X20R_HASH_FUNC_COUNT ) )
      return 0;
   memcpy( output,     hash,     32 );
   memcpy( output+32,  hash+64,  32 );
   memcpy( output+64,  hash+128, 32 );
   memcpy( output+96,  hash+192, 32 );
   return 1;
 }
 int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   if ( bench )  ptarget[7] = 0x0cff;
   static __thread uint32_t saved_height = UINT32_MAX;
   if ( work->height != saved_height )
   {
      vdata[1] = bswap_32( pdata[1] );
      vdata[2] = bswap_32( pdata[2] );
      vdata[3] = bswap_32( pdata[3] );
      saved_height = work->height;
      x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x20r_hash_order );
   }
   x16r_4x64_prehash( vdata, pdata, x20r_hash_order );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
      if ( x20r_4x64_hash( hash, vdata, thr_id ) );
      for ( int i = 0; i < 4; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n+i );
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm256_add_epi32( *noncev,
                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
 #elif defined(X20R_2WAY)
 int x20r_2x64_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*2] __attribute__ ((aligned (64)));
   if ( !x16r_2x64_hash_generic( hash, input, thrid, x20r_hash_order,
                                 X20R_HASH_FUNC_COUNT ) )
      return 0;
   memcpy( output,     hash,     32 );
   memcpy( output+32,  hash+64,  32 );
   return 1;
 }
 int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[16*2] __attribute__ ((aligned (64)));
   uint32_t vdata[20*2] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   uint32_t n = first_nonce;
   v128_t *noncev = (v128_t*)vdata + 9;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   if ( bench )  ptarget[7] = 0x0cff;
   static __thread uint32_t saved_height = UINT32_MAX;
   if ( work->height != saved_height )
   {
      vdata[1] = bswap_32( pdata[1] );
      vdata[2] = bswap_32( pdata[2] );
      vdata[3] = bswap_32( pdata[3] );
      saved_height = work->height;
      x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x20r_hash_order );
   }
   x16r_2x64_prehash( vdata, pdata, x20r_hash_order );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
      if ( x20r_2x64_hash( hash, vdata, thr_id ) );
      for ( int i = 0; i < 2; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n+i );
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
 #else
 int x20r_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64] __attribute__ ((aligned (64)));
   if ( !x16r_hash_generic( hash, input, thrid, x20r_hash_order, 
                            X20R_HASH_FUNC_COUNT ) )
      return 0;
    memcpy( output, hash, 32 );
    return 1;
 }
 int scanhash_x20r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(32) hash32[8];
   uint32_t _ALIGN(32) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const int thr_id = mythr->id;
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &( work_restart[thr_id].restart );
   const bool bench = opt_benchmark;
   if ( bench )  ptarget[7] = 0x0cff;
   static __thread uint32_t saved_height = UINT32_MAX;
   if ( work->height != saved_height )
   {
      edata[1] = bswap_32( pdata[1] );
      edata[2] = bswap_32( pdata[2] );
      edata[3] = bswap_32( pdata[3] );
      saved_height = work->height;
      x20r_getAlgoString( (const uint8_t*)(&edata[1]), x20r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x20r_hash_order );
   }
   x16r_prehash( edata, pdata, x20r_hash_order );
   do
   {
      edata[19] = nonce;
      if ( x20r_hash( hash32, edata, thr_id ) )
      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( nonce );
         submit_solution( work, hash32, mythr );
      }
      nonce++;
   } while ( nonce < max_nonce && !(*restart) );
   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce;
   return 0;
 }
 #endif
 bool register_x20r_algo( algo_gate_t* gate )
 {
 #if defined (X20R_8WAY)
  gate->scanhash          = (void*)&scanhash_x20r_8x64;
 #elif defined (X20R_4WAY)
  gate->scanhash          = (void*)&scanhash_x20r_4x64;
 #elif defined (X20R_2WAY)
  gate->scanhash          = (void*)&scanhash_x20r_2x64;
 #else
  gate->scanhash          = (void*)&scanhash_x20r;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
  opt_target_factor = 256.0;
  return true;
 };
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "algo/haval/haval-hash-4way.h"
 #include "algo/haval/sph-haval.h"
 #include "algo/tiger/sph_tiger.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/lyra2/lyra2.h"
@@ -42,7 +43,8 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
   uint32_t *hash7 = (uint32_t*)( shash+448 );
   x21s_8way_context_overlay ctx;
-   if ( !x16r_8way_hash_generic( shash, input, thrid ) )
+   if ( !x16r_8way_hash_generic( shash, input, thrid, x16r_hash_order, 
                                 X16R_HASH_FUNC_COUNT ) )
      return 0;
   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
@@ -134,7 +136,6 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &hash[7<<3];
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -148,20 +149,18 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
   if ( bench )   ptarget[7] = 0x0cff;
-   bedata1[0] = bswap_32( pdata[1] );
+   static __thread uint32_t saved_height = UINT32_MAX;
-   bedata1[1] = bswap_32( pdata[2] );
+   if ( work->height != saved_height )
   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      vdata[1] = bswap_32( pdata[1] );
-      s_ntime = ntime;
+      vdata[2] = bswap_32( pdata[2] );
-      if ( opt_debug && !thr_id )
+      saved_height = work->height;
-              applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x16r_hash_order );
   }
-   x16r_8way_prehash( vdata, pdata );
+   x16r_8way_prehash( vdata, pdata, x16r_hash_order );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
@@ -223,7 +222,8 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
   uint32_t *hash2 = (uint32_t*)( shash+128 );
   uint32_t *hash3 = (uint32_t*)( shash+192 );
-   if ( !x16r_4way_hash_generic( shash, input, thrid ) )
+   if ( !x16r_4way_hash_generic( shash, input, thrid, x16r_hash_order,
                                 X16R_HASH_FUNC_COUNT ) )
      return 0;
   intrlv_4x32( vhash, hash0, hash1, hash2, hash3,  512 );
@@ -294,7 +294,6 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -307,20 +306,18 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
   if ( bench )  ptarget[7] = 0x0cff;
-   bedata1[0] = bswap_32( pdata[1] );
+   static __thread uint32_t saved_height = UINT32_MAX;
-   bedata1[1] = bswap_32( pdata[2] );
+   if ( work->height != saved_height )
   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      vdata[1] = bswap_32( pdata[1] );
-      s_ntime = ntime;
+      vdata[2] = bswap_32( pdata[2] );
-      if ( opt_debug && !thr_id )
+      saved_height = work->height;
-              applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
+      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x16r_hash_order );
   }
-   x16r_4way_prehash( vdata, pdata );
+   x16r_4way_prehash( vdata, pdata, x16r_hash_order );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
@@ -351,4 +348,117 @@ bool x21s_4way_thread_init()
   return x21s_4way_matrix;
 }
 #elif defined (X21S_2WAY)
 static __thread uint64_t* x21s_2x64_matrix;
 union _x21s_2x64_context_overlay
 {
    sph_haval256_5_context  haval;
    sph_tiger_context       tiger;
    sph_gost512_context     gost;
 } __attribute__ ((aligned (64)));
 typedef union _x21s_2x64_context_overlay x21s_2x64_context_overlay;
 int x21s_2x64_hash( void* output, const void* input, int thrid )
 {
   uint8_t  shash[64*2] __attribute__ ((aligned (64)));
   x21s_2x64_context_overlay ctx;
   uint32_t *hash0 = (uint32_t*)  shash;
   uint32_t *hash1 = (uint32_t*)( shash+64  );
   if ( !x16r_2x64_hash_generic( shash, input, thrid, x16r_hash_order, 
                                 X16R_HASH_FUNC_COUNT ) )
      return 0;
   sph_haval256_5_init( &ctx.haval );
   sph_haval256_5( &ctx.haval, hash0, 64 );
   sph_haval256_5_close( &ctx.haval, hash0 );
   sph_haval256_5_init( &ctx.haval );
   sph_haval256_5( &ctx.haval, hash1, 64 );
   sph_haval256_5_close( &ctx.haval, hash1 );
   sph_tiger_init( &ctx.tiger );
   sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
   sph_tiger_close( &ctx.tiger, (void*) hash0 );
   sph_tiger_init( &ctx.tiger );
   sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
   sph_tiger_close( &ctx.tiger, (void*) hash1 );
   LYRA2REV2( x21s_2x64_matrix, (void*) hash0, 32, (const void*) hash0, 32,
            (const void*) hash0, 32, 1, 4, 4 );
   LYRA2REV2( x21s_2x64_matrix, (void*) hash1, 32, (const void*) hash1, 32,
            (const void*) hash1, 32, 1, 4, 4 );
   sph_gost512_init( &ctx.gost );
   sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
   sph_gost512_close( &ctx.gost, (void*) hash0 );
   sph_gost512_init( &ctx.gost );
   sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
   sph_gost512_close( &ctx.gost, (void*) hash1 );
   sha256_full( output,    hash0, 64 );
   sha256_full( output+32, hash1, 64 );
   return 1;
 }
 int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[16*2] __attribute__ ((aligned (64)));
   uint32_t vdata[20*2] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   v128_t *noncev = (v128_t*)vdata + 9;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   if ( bench )  ptarget[7] = 0x0cff;
   static __thread uint32_t saved_height = UINT32_MAX;
   if ( work->height != saved_height )
   {
      vdata[1] = bswap_32( pdata[1] );
      vdata[2] = bswap_32( pdata[2] );
      saved_height = work->height;
      x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
      if ( !opt_quiet && !thr_id )
           applog( LOG_INFO, "hash order %s", x16r_hash_order );
   }
   x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
      if ( x21s_2x64_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 2; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n+i );
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( likely( (  n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
 bool x21s_2x64_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   x21s_2x64_matrix = mm_malloc( size, 64 );
   return x21s_2x64_matrix;
 }
 #endif
--- a/algo/x16/x21s.c
+++ b/algo/x16/x21s.c
@@ -15,7 +15,7 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/lyra2/lyra2.h"
-#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
+#if !defined(X21S_8WAY) && !defined(X21S_4WAY)
 static __thread uint64_t* x21s_matrix;
@@ -33,7 +33,8 @@ int x21s_hash( void* output, const void* input, int thrid )
   uint32_t _ALIGN(128) hash[16];
   x21s_context_overlay ctx;
-   if ( !x16r_hash_generic( hash, input, thrid ) )
+   if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order,
                            X16R_HASH_FUNC_COUNT ) )
      return 0;
   sph_haval256_5_init( &ctx.haval );
@@ -84,7 +85,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
          applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }
-   x16r_prehash( edata, pdata );
+   x16r_prehash( edata, pdata, x16r_hash_order );
   do
   {
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -928,29 +928,24 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
 #elif defined(X17_2X64)
 // Need sph in some cases
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 //#include "algo/simd/sph_simd.h"
 //#include "algo/simd/nist.h"
 #if !( defined(__SSE4_2__) || defined(__ARM_NEON) )
  #include "algo/hamsi/sph_hamsi.h"
 #endif
 #include "algo/shabal/sph_shabal.h"
 #include "algo/haval/sph-haval.h"
 #if !( defined(__AES__) ) //|| defined(__ARM_FEATURE_AES) )
  #include "algo/groestl/sph_groestl.h"
 #endif
 #if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
 #endif
  #include "algo/fugue/sph_fugue.h"
 #endif
 union _x17_context_overlay
 {
        blake512_2x64_context   blake;
        bmw512_2x64_context     bmw;
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_groestl       groestl;
 #else
        sph_groestl512_context  groestl;
@@ -960,7 +955,7 @@ union _x17_context_overlay
 #else
        sph_echo512_context     echo;
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_fugue         fugue;
 #else
        sph_fugue512_context    fugue;
@@ -1000,7 +995,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
    dintrlv_2x64( hash0, hash1, vhash, 512 );
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    groestl512_full( &ctx.groestl, hash0, hash0, 512 );
    groestl512_full( &ctx.groestl, hash1, hash1, 512 );
 #else
@@ -1061,7 +1056,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
    sph_hamsi512_close( &ctx.hamsi, hash1 );
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    fugue512_full( &ctx.fugue, hash0, hash0, 64 );
    fugue512_full( &ctx.fugue, hash1, hash1, 64 );
 #else
@@ -1133,14 +1128,12 @@ int scanhash_x17_2x64( struct work *work, uint32_t max_nonce,
      {
         if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
         {
 applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,0);
              pdata[19] = bswap_32( n );
 //            pdata[19] = n;
            submit_solution( work, hash, mythr );
         }
         if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
         {
 applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,1);            
            pdata[19] = bswap_32( n+1 );
            submit_solution( work, hash+8, mythr );
         }
--- a/algo/x22/x22i.c
+++ b/algo/x22/x22i.c
@@ -4,25 +4,24 @@
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/fugue/fugue-aesni.h"
 #else
  #include "algo/fugue/sph_fugue.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #else
  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
  #include "algo/fugue/sph_fugue.h"
 #endif
 #include "algo/skein/sph_skein.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
@@ -39,14 +38,17 @@ union _x22i_context_overlay
 {
        blake512_context       blake;
        sph_bmw512_context     bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_fugue         fugue;
 #else
        sph_fugue512_context    fugue;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_groestl       groestl;
        hashState_echo          echo;
        hashState_fugue         fugue;
 #else
        sph_groestl512_context  groestl;
        sph_echo512_context     echo;
        sph_fugue512_context    fugue;
 #endif
        sph_jh512_context       jh;
        sph_keccak512_context   keccak;
@@ -54,11 +56,7 @@ union _x22i_context_overlay
        hashState_luffa         luffa;
        cubehashParam           cube;
        sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+        simd512_context         simd;
        sph_simd512_context     simd;
 #else
        hashState_sd            simd;
 #endif
        sph_hamsi512_context    hamsi;
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
@@ -83,10 +81,8 @@ int x22i_hash( void *output, const void *input, int thrid )
   sph_bmw512(&ctx.bmw, (const void*) hash, 64);
   sph_bmw512_close(&ctx.bmw, hash);
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-   init_groestl( &ctx.groestl, 64 );
+   groestl512_full( &ctx.groestl, hash, hash, 512 );
   update_and_final_groestl( &ctx.groestl, (char*)hash,
                                  (const char*)hash, 512 );
 #else
   sph_groestl512_init( &ctx.groestl );
   sph_groestl512( &ctx.groestl, hash, 64 );
@@ -109,26 +105,16 @@ int x22i_hash( void *output, const void *input, int thrid )
   luffa_full( &ctx.luffa, hash, 512, hash, 64 );
-   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehash_full( &ctx.cube, hash, 512, hash, 64 );
   cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
   sph_shavite512_init(&ctx.shavite);
   sph_shavite512(&ctx.shavite, (const void*) hash, 64);
   sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512_init(&ctx.simd );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    simd_full( &ctx.simd, (BitSequence *)hash,
                       (const BitSequence *)hash, 512 );
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-   init_echo( &ctx.echo, 512 );
+   echo_full( &ctx.echo, hash, 512, hash, 64 );
   update_final_echo ( &ctx.echo, (BitSequence*)hash,
                            (const BitSequence*)hash, 512 );
 #else
   sph_echo512_init( &ctx.echo );
   sph_echo512( &ctx.echo, hash, 64 );
@@ -141,7 +127,7 @@ int x22i_hash( void *output, const void *input, int thrid )
   sph_hamsi512(&ctx.hamsi, (const void*) hash, 64);
   sph_hamsi512_close(&ctx.hamsi, hash);
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   fugue512_full( &ctx.fugue, hash, hash, 64 );
 #else
   sph_fugue512_init(&ctx.fugue);
@@ -192,8 +178,8 @@ int x22i_hash( void *output, const void *input, int thrid )
 int scanhash_x22i( struct work *work, uint32_t max_nonce,
             uint64_t *hashes_done, struct thr_info *mythr)
 {
-   uint32_t edata[20] __attribute__((aligned(64)));
+   uint32_t edata[20] __attribute__((aligned(32)));
-   uint32_t hash64[8] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
--- a/algo/x22/x25x.c
+++ b/algo/x22/x25x.c
@@ -4,25 +4,24 @@
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/fugue/fugue-aesni.h"
 #else
  #include "algo/fugue/sph_fugue.h"
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #else
  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
  #include "algo/fugue/sph_fugue.h"
 #endif
 #include "algo/skein/sph_skein.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
@@ -42,14 +41,17 @@ union _x25x_context_overlay
 {
        blake512_context        blake;
        sph_bmw512_context      bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_fugue         fugue;
 #else
        sph_fugue512_context    fugue;
 #endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_groestl       groestl;
        hashState_echo          echo;
        hashState_fugue         fugue;
 #else
        sph_groestl512_context  groestl;
        sph_echo512_context     echo;
        sph_fugue512_context    fugue;
 #endif
        sph_jh512_context       jh;
        sph_keccak512_context   keccak;
@@ -57,11 +59,7 @@ union _x25x_context_overlay
        hashState_luffa         luffa;
        cubehashParam           cube;
        sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+        simd512_context         simd;
        sph_simd512_context     simd;
 #else
        hashState_sd            simd;
 #endif
        sph_hamsi512_context    hamsi;
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
@@ -88,10 +86,8 @@ int x25x_hash( void *output, const void *input, int thrid )
   sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64);
   sph_bmw512_close(&ctx.bmw, &hash[1]);
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-   init_groestl( &ctx.groestl, 64 );
+   groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 );
   update_and_final_groestl( &ctx.groestl, (char*)&hash[2],
                                  (const char*)&hash[1], 512 );
 #else
   sph_groestl512_init( &ctx.groestl );
   sph_groestl512( &ctx.groestl, &hash[1], 64 );
@@ -112,28 +108,18 @@ int x25x_hash( void *output, const void *input, int thrid )
   if ( work_restart[thrid].restart ) return 0;
-   init_luffa( &ctx.luffa, 512 );
+   luffa_full( &ctx.luffa, (void*)&hash[6], 512, (const void*)&hash[5], 64 );
   luffa_full( &ctx.luffa, &hash[6], 512, &hash[5], 64 );
-   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehash_full( &ctx.cube, (void*)&hash[7], 512, (const void*)&hash[6], 64 );
   cubehashUpdateDigest( &ctx.cube, &hash[7], &hash[6], 64 );
   sph_shavite512_init(&ctx.shavite);
   sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64);
   sph_shavite512_close(&ctx.shavite, &hash[8]);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 ); 
    sph_simd512(&ctx.simd, (const void*) &hash[8], 64);
    sph_simd512_close(&ctx.simd, &hash[9] );
 #else
    update_final_sd( &ctx.simd, (BitSequence *)&hash[9],
                       (const BitSequence *)&hash[8], 512 );
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
-   init_echo( &ctx.echo, 512 );
+   echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 );
   update_final_echo ( &ctx.echo, (BitSequence*)&hash[10],
                            (const BitSequence*)&hash[9], 512 );
 #else
   sph_echo512_init( &ctx.echo );
   sph_echo512( &ctx.echo, &hash[9], 64 );
@@ -146,7 +132,7 @@ int x25x_hash( void *output, const void *input, int thrid )
   sph_hamsi512(&ctx.hamsi, (const void*) &hash[10], 64);
   sph_hamsi512_close(&ctx.hamsi, &hash[11]);
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   fugue512_full( &ctx.fugue, &hash[12], &hash[11], 64 );
 #else
   sph_fugue512_init(&ctx.fugue);
@@ -227,8 +213,8 @@ int x25x_hash( void *output, const void *input, int thrid )
 int scanhash_x25x( struct work *work, uint32_t max_nonce,
             uint64_t *hashes_done, struct thr_info *mythr)
 {
-   uint32_t edata[20] __attribute__((aligned(64)));
+   uint32_t edata[20] __attribute__((aligned(32)));
-   uint32_t hash64[8] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
@@ -245,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
   do
   {
      edata[19] = n;
-      if ( x25x_hash( hash64, edata, thr_id ) )
+      if ( x25x_hash( hash64, edata, thr_id ) );
      if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n );
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.10.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.15.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='23.10'
+PACKAGE_VERSION='23.15'
-PACKAGE_STRING='cpuminer-opt 23.10'
+PACKAGE_STRING='cpuminer-opt 23.15'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 23.10 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 23.15 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 23.10:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 23.15:";;
   esac
  cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 23.10
+cpuminer-opt configure 23.15
 generated by GNU Autoconf 2.71
 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 23.10, which was
+It was created by cpuminer-opt $as_me 23.15, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='23.10'
+ VERSION='23.15'
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 23.10, which was
+This file was extended by cpuminer-opt $as_me 23.15, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 23.10
+cpuminer-opt config.status 23.15
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [23.10])
+AC_INIT([cpuminer-opt], [23.15])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.9.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.14.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='23.9'
+PACKAGE_VERSION='23.14'
-PACKAGE_STRING='cpuminer-opt 23.9'
+PACKAGE_STRING='cpuminer-opt 23.14'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 23.9 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 23.9:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 23.14:";;
   esac
  cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 23.9
+cpuminer-opt configure 23.14
 generated by GNU Autoconf 2.71
 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 23.9, which was
+It was created by cpuminer-opt $as_me 23.14, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='23.9'
+ VERSION='23.14'
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 23.9, which was
+This file was extended by cpuminer-opt $as_me 23.14, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 23.9
+cpuminer-opt config.status 23.14
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -2837,15 +2837,6 @@ static void show_credits()
 #define check_cpu_capability() cpu_capability( false )
 #define display_cpu_capability() cpu_capability( true )
 #if defined(__aarch64__)
 #define XSTR(x) STR(x)
 #define STR(x) #x
 //#pragma message "Building for armv" XSTR(__ARM_ARCH)  
 #endif
 static bool cpu_capability( bool display_only )
 {
     char cpu_brand[0x40];
@@ -3675,11 +3666,6 @@ static int thread_create(struct thr_info *thr, void* func)
 void get_defconfig_path(char *out, size_t bufsize, char *argv0);
 #include "simd-utils.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "compat/aes_helper.c"
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
--- a/miner.h
+++ b/miner.h
@@ -672,6 +672,7 @@ enum algos {
        ALGO_X16RT_VEIL,
        ALGO_X16S,
        ALGO_X17,
        ALGO_X20R,
        ALGO_X21S,
        ALGO_X22I,
        ALGO_X25X,
@@ -767,6 +768,7 @@ static const char* const algo_names[] = {
        "x16rt-veil",
        "x16s",
        "x17",
        "x20r",
        "x21s",
        "x22i",
        "x25x",
@@ -930,6 +932,7 @@ Options:\n\
                          x16rt-veil    Veil (VEIL)\n\
                          x16s\n\
                          x17\n\
                          x20r\n\
                          x21s\n\
                          x22i\n\
                          x25x\n\
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -381,7 +381,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
   d0[15] = s[ 60];   d1[15] = s[ 61];    d2[15] = s[ 62];   d3[15] = s[ 63];
 }
-#endif   // SSE4_1 else SSE2 or NEON
+#endif   // SSE4_1 or NEON else SSE2
 static inline void extr_lane_4x32( void *d, const void *s,
                                   const int lane, const int bit_len )
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -207,7 +207,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
 #endif
-// broadcast lane l to all lanes
+// broadcast (replicate) lane l to all lanes
 #define v128_replane64( v, l ) \
   ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
                : _mm_shuffle_epi32( v, 0xee )
@@ -319,7 +319,7 @@ static inline __m128i v128_neg1_fn()
 //    c[7:6] source element selector
 // Convert type and abbreviate name: eXtract Insert Mask = XIM
-#define mm128_xim_32( v1, v0, c ) \
+#define v128_xim32( v1, v0, c ) \
   _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
                                    _mm_castsi128_ps( v0 ), c ) )
@@ -327,20 +327,19 @@ static inline __m128i v128_neg1_fn()
 /*
 // Copy i32 to element c of dest and copy remaining elemnts from v.
 #define v128_put32( v, i32, c ) \
-      mm128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
+      v128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
 */
-#define mm128_mask_32( v, m )    mm128_xim_32( v, v, m )
+#define v128_mask32( v, m )    v128_xim32( v, v, m & 0xf )
 // Zero 32 bit elements when corresponding bit in 4 bit mask is set.
-//static inline __m128i mm128_mask_32( const __m128i v, const int m ) 
+//static inline __m128i v128_mask32( const __m128i v, const int m ) 
-//{   return mm128_xim_32( v, v, m ); }
+//{   return v128_xim32( v, v, m ); }
 #define v128_mask32    mm128_mask_32
-// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
+// Copy element l0 of v0 to element l1 of dest and copy remaining elements from v1.
 #define v128_movlane32( v1, l1, v0, l0 ) \
-  mm128_xim_32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
+  v128_xim32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
 #endif  // SSE4_1
@@ -451,7 +450,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_orand( a, b, c )     _mm_or_si128( a, _mm_and_si128( b, c ) )
-#define v128_xnor( a, b )         mm128_not( _mm_xor_si128( a, b ) )
+#define v128_xnor( a, b )         v128_not( _mm_xor_si128( a, b ) )
 #endif
@@ -482,7 +481,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_qrev16(v)      v128_shuffle16( v, 0x1b )
 #define v128_lrev16(v)      v128_shuffle16( v, 0xb1 )
-// These should never be callled from application code, use rol/ror.
+// Internal use only, should never be callled from application code.
 #define v128_ror64_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -497,14 +496,14 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #if defined(__AVX512VL__)
-// AVX512 fastest all rotations.
+// AVX512 fastest for all rotations.
 #define v128_ror64                _mm_ror_epi64
 #define v128_rol64                _mm_rol_epi64
 #define v128_ror32                _mm_ror_epi32
 #define v128_rol32                _mm_rol_epi32
 // ror/rol will always find the fastest but these names may fit better with
-// application code performing shuffles rather than bit rotations.
+// application code performing byte operations rather than bit rotations.
 #define v128_shuflr64_8( v)         _mm_ror_epi64( v,  8 )
 #define v128_shufll64_8( v)         _mm_rol_epi64( v,  8 )
 #define v128_shuflr64_16(v)         _mm_ror_epi64( v, 16 )
@@ -576,7 +575,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 : v128_rol32_sse2( v, c )
 #elif defined(__SSE2__)
-// SSE2: fastest 32 bit, very fast 16
+// SSE2: fastest 32 bit, very fast 16, all else slow
 #define v128_ror64( v, c ) \
   ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
@@ -607,9 +606,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #endif
-//#define v128_ror64            mm128_ror_64
+// deprecated
 //#define v128_rol64            mm128_rol_64
 //#define v128_ror32            mm128_ror_32
 #define mm128_rol_32        v128_rol32
 /* not used
@@ -632,7 +629,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
   _mm_ror_epi32( v0, c ); \
   _mm_ror_epi32( v1, c )
-#define mm128_2rol32( v1, v0, c ) \
+#define v128_2rol32( v1, v0, c ) \
   _mm_rol_epi32( v0, c ); \
   _mm_rol_epi32( v1, c )
@@ -683,11 +680,13 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 // Cross lane shuffles
 // No NEON version
 #define v128_shuffle32     _mm_shuffle_epi32
-// shuffle using vector mask, for compatibility with NEON
+/* Not used, exists only for compatibility with NEON if ever needed.
 #define v128_shufflev32( v, vmask ) \
  v128_shuffle32( v, mm128_movmask_32( vmask ) )
 */
 #define v128_shuffle8     _mm_shuffle_epi8
@@ -696,12 +695,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_shuffle2_64( v1, v2, c ) \
   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
                                     _mm_castsi128_pd( v2 ), c ) ); 
 #define mm128_shuffle2_64   v128_shuffle2_64
 #define v128_shuffle2_32( v1, v2, c ) \
   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
                                     _mm_castsi128_ps( v2 ), c ) ); 
 #define mm128_shuffle2_32   v128_shuffle2_32
 // Rotate vector elements accross all lanes
@@ -733,6 +730,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_bswap32( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
                                        0x0405060700010203 ) )
 // deprecated
 #define mm128_bswap_32      v128_bswap32
 #define v128_bswap16( v ) \
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -40,7 +40,7 @@
 #define v128u8_load( p )              vld1q_u16( (uint8_t*)(p) )
 #define v128u8_store( p, v )          vst1q_u16( (uint8_t*)(p), v )
-// load & set1 combined
+// load & set1 combined, doesn't work
 #define v128_load1_64(p)              vld1q_dup_u64( (uint64_t*)(p) )
 #define v128_load1_32(p)              vld1q_dup_u32( (uint32_t*)(p) )
 #define v128_load1_16(p)              vld1q_dup_u16( (uint16_t*)(p) )
@@ -68,7 +68,7 @@
 #define v128_mul32                    vmulq_u32
 #define v128_mul16                    vmulq_u16
-// slow, tested with argon2d
+// Widening multiply, align source elements with Intel
 static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 {
   return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
@@ -86,7 +86,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 // Not yet needed
 //#define v128_cmpeq1
-
+// Signed
 #define v128_cmpgt64( v1, v0 )        vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
 #define v128_cmpgt32( v1, v0 )        vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
 #define v128_cmpgt16( v1, v0 )        vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
@@ -97,7 +97,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_cmplt16( v1, v0 )        vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
 #define v128_cmplt8( v1, v0 )         vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 )
-// bit shift
+// Logical bit shift
 #define v128_sl64                     vshlq_n_u64
 #define v128_sl32                     vshlq_n_u32
 #define v128_sl16                     vshlq_n_u16
@@ -108,7 +108,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_sr16                     vshrq_n_u16
 #define v128_sr8                      vshrq_n_u8
-// Unit tested, working.
+// Arithmetic shift.
 #define v128_sra64( v, c )            vshrq_n_s64( (int64x2_t)v, c )
 #define v128_sra32( v, c )            vshrq_n_s32( (int32x4_t)v, c )
 #define v128_sra16( v, c )            vshrq_n_s16( (int16x8_t)v, c )
@@ -406,34 +406,17 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 v1 = vorrq_u32( v1, t1 ); \
 }
-// Cross lane shuffles, no programmable shuffle in NEON
+/* not used anywhere and hopefully never will
-
+// vector mask, use as last resort. prefer tbl, rev, alignr, etc
 // vector mask, use as last resort. prefer rev, alignr, etc
 #define v128_shufflev32( v, vmask ) \
  v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \
 */
 // compatible with x86_64, but very slow, avoid
 #define v128_shuffle8( v, vmask ) \
-   v128_set8( ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[15] ], \
+     vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask )
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[14] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[13] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[12] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[11] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[10] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 9] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 8] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 7] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 6] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 5] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 4] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 3] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 2] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
              ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )
 // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
 // Bit rotation already promotes faster widths. Usage is context sensitive.
@@ -551,20 +534,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
   casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
 }
 // Prograsmmable shuffles
 // no compatible shuffles with x86_64, will require targeted user code.
 #define v128_extractmask8( df, de, dd, dc, db, da, d9, d8, \
                           d7, d6, d5, d4, d3, d2, d1, d0, vmask )   \
  d0 = ((uint8_t*)(&vmask))[0];   d1 = ((uint8_t*)(&vmask))[1]; \
  d2 = ((uint8_t*)(&vmask))[2];   d3 = ((uint8_t*)(&vmask))[3]; \
  d4 = ((uint8_t*)(&vmask))[0];   d5 = ((uint8_t*)(&vmask))[1]; \
  d6 = ((uint8_t*)(&vmask))[2];   d7 = ((uint8_t*)(&vmask))[3]; \
  d8 = ((uint8_t*)(&vmask))[0];   d9 = ((uint8_t*)(&vmask))[1]; \
  da = ((uint8_t*)(&vmask))[2];   db = ((uint8_t*)(&vmask))[3]; \
  dc = ((uint8_t*)(&vmask))[0];   dd = ((uint8_t*)(&vmask))[1]; \
  de = ((uint8_t*)(&vmask))[2];   df = ((uint8_t*)(&vmask))[3]; 
 // Blendv
 #define v128_blendv( v1, v0, mask ) \
   v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -930,7 +930,9 @@ static inline void cpu_brand_string( char* s )
 #elif defined(__arm__) || defined(__aarch64__)
-    sprintf( s, "ARM 64 bit CPU" );
+    unsigned int cpu_info[4] = { 0 };
    cpuid( 0, 0, cpu_info );
    sprintf( s, "ARM 64 bit CPU, HWCAP %08x", cpu_info[0] );
 #else
Author	SHA1	Message	Date
Jay D Dee	9d3a46c355	v23.15	2023-11-30 14:36:47 -05:00
Jay D Dee	4e3f1b926f	v23.14	2023-11-28 00:58:43 -05:00
Jay D Dee	045b42babf	v23.13	2023-11-21 14:18:15 -05:00
Jay D Dee	fc696dbbe5	v23.12	2023-11-20 11:51:57 -05:00