This commit is contained in:
Jay D Dee
2023-11-20 11:51:57 -05:00
parent f3fde95f27
commit fc696dbbe5
36 changed files with 3597 additions and 3152 deletions

View File

@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
- A x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
- Arm CPU supporting AArch64 and NEON.
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
32 bit CPUs are not supported.
ARM requirements (Beta):
Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
CPU: Armv8 and NEON, SHA2 & AES are optional
OS: Linux distribution built for AArch64.
Packages: source code only.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand; it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.
Beware of apps claiming "mobile only mining". There is no such thing, they aren't miners.
If a mobile CPU can mine it, any CPU can.
See wiki for details.
@@ -73,12 +75,22 @@ If not what makes it happen or not happen?
Change Log
----------
v23.12
Several bug fixes and speed improvements for the x16r family for all CPU architectures.
v23.11
This is a release candidate for full AArch64 support, marking the end of the Beta phase.
Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
v23.10
x86_64: Fixed scrypt, scryptn2 algos SSE2.
Fixed sha512d256d algo AVX2, SSE2, NEON.
Fixed sha512256d algo AVX2, SSE2, NEON.
Fixed a bug in Skein N-way that reduced performance.
ARM: Skein algo optimized for NEON & SHA2.
ARM: Skein optimized for NEON, SHA2 & SSE2.
Skein2 algo 2-way optimized for NEON & SSE2.
v23.9

View File

@@ -99,7 +99,7 @@ typedef uint32_t set_t;
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
// AVX10 does not have explicit algo features:

View File

@@ -2,12 +2,11 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
//#include "sph_keccak.h"
#include "bmw-hash-4way.h"
#if defined(BMW512_8WAY)
void bmw512hash_8way(void *state, const void *input)
void bmw512hash_8way( void *state, const void *input )
{
bmw512_8way_context ctx;
bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
const int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
#elif defined(BMW512_4WAY)
//#ifdef BMW512_4WAY
void bmw512hash_4way(void *state, const void *input)
void bmw512hash_4way( void *state, const void *input )
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(BMW512_2WAY)
/* Hash one 80-byte block header for two lanes at once using the
 * 2x64-bit (SSE2 / NEON) BMW-512 implementation.
 * input:  two interleaved 80-byte headers (2x64 lane layout)
 * state:  receives the two interleaved 512-bit digests            */
void bmw512hash_2x64( void *state, const void *input )
{
   bmw512_2x64_context c;

   bmw512_2x64_init( &c );
   bmw512_2x64_update( &c, input, 80 );
   bmw512_2x64_close( &c, state );
}
/* Scan a nonce range for bmw512, two nonces per iteration using the
 * 2x64-bit SIMD path (SSE2 / NEON).  Candidate solutions are reported
 * via submit_solution(); *hashes_done receives the number of nonces
 * tried.  Always returns 0.                                          */
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));   // 80-byte header, 2-way 64-bit interleaved
uint32_t hash[16*2] __attribute__ ((aligned (32)));    // two interleaved 512-bit hashes
uint32_t lane_hash[8] __attribute__ ((aligned (32)));  // one lane's hash, de-interleaved
// 32-bit word 7 of each lane's hash = high half of interleaved 64-bit word 3,
// hence index 3*4+1 for lane 0 (lane 1 is 2 words further: hash7[lane<<1]).
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];                    // nonce lives in header word 19
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2; // headroom: 2 nonces consumed per pass
v128_t *noncev = (v128_t*)vdata + 9;       // vector slot holding both lanes' nonces
const uint32_t Htarg = ptarget[7];         // cheap pre-filter on top hash word
const int thr_id = mythr->id;

v128_bswap32_intrlv80_2x64( vdata, pdata ); // byte-swap header & interleave both lanes

do {
// Insert byte-swapped nonces n and n+1 into the interleaved header.
*noncev = v128_intrlv_blend_32( v128_bswap32(
v128_set32( n+1, 0, n, 0 ) ), *noncev );

bmw512hash_2x64( hash, vdata );

for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )    // quick reject before full test
{
extr_lane_2x64( lane_hash, hash, lane, 256 );   // de-interleave this lane
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -2,7 +2,7 @@
bool register_bmw512_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 256.0;
#if defined (BMW512_8WAY)
gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
#elif defined (BMW512_4WAY)
gate->scanhash = (void*)&scanhash_bmw512_4way;
gate->hash = (void*)&bmw512hash_4way;
#elif defined (BMW512_2WAY)
gate->scanhash = (void*)&scanhash_bmw512_2x64;
gate->hash = (void*)&bmw512hash_2x64;
#else
gate->scanhash = (void*)&scanhash_bmw512;
gate->hash = (void*)&bmw512hash;

View File

@@ -8,19 +8,27 @@
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define BMW512_2WAY 1
#endif
#if defined(BMW512_8WAY)
void bmw512hash_8way( void *state, const void *input );
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_4WAY)
void bmw512hash_4way( void *state, const void *input );
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input );
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else

View File

@@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
}
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
HashReturn init_echo( hashState_echo *ctx, int nHashSize )
{
int i, j;
@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
return SUCCESS;
}
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
HashReturn update_echo( hashState_echo *state, const void *data,
uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
return SUCCESS;
}
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
HashReturn final_echo( hashState_echo *state, void *hashval)
{
v128_t remainingbits;
@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
return SUCCESS;
}
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
return SUCCESS;
}
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength datalen )
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t datalen )
{
int i, j;
@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
if( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
memcpy(state->buffer, data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
#if 0
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
HashReturn hRet;
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
return SUCCESS;
}
#endif
#endif

View File

@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
HashReturn reinit_echo(hashState_echo *state);
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
HashReturn final_echo(hashState_echo *state, void *hashval);
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength databitlen );
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen );
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t databitlen );
#endif // HASH_API_H

View File

@@ -36,7 +36,6 @@
#include "sph_echo.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif
#endif // !AES

View File

@@ -36,8 +36,6 @@
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif

View File

@@ -35,8 +35,6 @@
#include "sph_groestl.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif // !AES
#endif

View File

@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "compat/sph_types.h"
#if !defined(__AES__)
/**
* Output size (in bits) for Groestl-224.
*/
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif
#endif // !AES
#endif

View File

@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_2WAY)
/* Keccak-256 of one 80-byte block header for two lanes at once using
 * the 2x64-bit (SSE2 / NEON) implementation.
 * input:  two interleaved 80-byte headers (2x64 lane layout)
 * state:  receives the two interleaved 256-bit digests             */
void keccakhash_2x64( void *state, const void *input )
{
   keccak256_2x64_context c;

   keccak256_2x64_init( &c );
   keccak256_2x64_update( &c, input, 80 );
   keccak256_2x64_close( &c, state );
}
/* Scan a nonce range for keccak, two nonces per iteration using the
 * 2x64-bit SIMD path (SSE2 / NEON).  Candidate solutions are reported
 * via submit_solution(); *hashes_done receives the number of nonces
 * tried.  Always returns 0.                                          */
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));   // 80-byte header, 2-way 64-bit interleaved
uint32_t hash[16*2] __attribute__ ((aligned (32)));    // two interleaved hashes
uint32_t lane_hash[8] __attribute__ ((aligned (32)));  // one lane's hash, de-interleaved
// 32-bit word 7 of each lane's hash = high half of interleaved 64-bit word 3,
// hence index 3*4+1 for lane 0 (lane 1 is 2 words further: hash7[lane<<1]).
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];                 // nonce lives in header word 19
const uint32_t first_nonce = pdata[19];
v128_t *noncev = (v128_t*)vdata + 9;    // vector slot holding both lanes' nonces
const uint32_t Htarg = ptarget[7];      // cheap pre-filter on top hash word
const int thr_id = mythr->id;
const bool bench = opt_benchmark;

v128_bswap32_intrlv80_2x64( vdata, pdata ); // byte-swap header & interleave both lanes
// NOTE(review): unlike scanhash_bmw512_2x64 the nonces are inserted WITHOUT
// v128_bswap32 and the submitted nonce is bswap_32'd below — presumably the
// keccak algo convention; confirm against the 4-way/8-way variants.
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
keccakhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )  // quick reject before full test
{
extr_lane_2x64( lane_hash, hash, lane, 256 );       // de-interleave this lane
if ( valid_hash( lane_hash, ptarget ))
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
// Advance both lane nonces by 2 (upper 32 bits of each 64-bit lane word).
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
pdata[19] = n;
// NOTE(review): the "+ 1" differs from the sibling 2x64 scanners which
// report n - first_nonce — verify which count the caller expects.
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
#if defined (SHA3D_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
#elif defined (SHA3D_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#elif defined (SHA3D_2WAY)
gate->scanhash = (void*)&scanhash_sha3d_2x64;
gate->hash = (void*)&sha3d_hash_2x64;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;

View File

@@ -8,6 +8,16 @@
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA3D_2WAY 1
#endif
extern int hard_coded_eb;
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_2WAY)
void keccakhash_2x64( void *state, const void *input );
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA3D_8WAY)
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64( void *state, const void *input );
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
#if defined(SHA3D_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_4WAY)
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA3D_2WAY)
/* SHA3d = double Keccak-256, computed for two lanes at once with the
 * 2x64-bit (SSE2 / NEON) implementation: keccak256(keccak256(header)).
 * input:  two interleaved 80-byte headers (2x64 lane layout)
 * state:  receives the two interleaved 256-bit final digests
 * NOTE(review): midhash is 16*4 words although the interleaved first-pass
 * digests need only 16 — presumably just headroom; confirm.            */
void sha3d_hash_2x64( void *state, const void *input )
{
   uint32_t midhash[16*4] __attribute__ ((aligned (64)));
   keccak256_2x64_context c;

   // First pass: hash the 80-byte headers.
   keccak256_2x64_init( &c );
   keccak256_2x64_update( &c, input, 80 );
   keccak256_2x64_close( &c, midhash );

   // Second pass: hash the 32-byte first-pass digests.
   keccak256_2x64_init( &c );
   keccak256_2x64_update( &c, midhash, 32 );
   keccak256_2x64_close( &c, state );
}
/* Scan a nonce range for sha3d, two nonces per iteration using the
 * 2x64-bit SIMD path (SSE2 / NEON).  Candidate solutions are reported
 * via submit_solution(); *hashes_done receives the number of nonces
 * tried.  Always returns 0.                                          */
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));   // 80-byte header, 2-way 64-bit interleaved
uint32_t hash[16*2] __attribute__ ((aligned (32)));    // two interleaved hashes
uint32_t lane_hash[8] __attribute__ ((aligned (32)));  // one lane's hash, de-interleaved
// 32-bit word 7 of each lane's hash = high half of interleaved 64-bit word 3,
// hence index 3*4+1 for lane 0 (lane 1 is 2 words further: hash7[lane<<1]).
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];                    // nonce lives in header word 19
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2; // headroom: 2 nonces consumed per pass
v128_t *noncev = (v128_t*)vdata + 9;       // vector slot holding both lanes' nonces
const uint32_t Htarg = ptarget[7];         // cheap pre-filter on top hash word
const int thr_id = mythr->id;
const bool bench = opt_benchmark;

v128_bswap32_intrlv80_2x64( vdata, pdata ); // byte-swap header & interleave both lanes
// NOTE(review): nonces inserted without v128_bswap32 and submitted via
// bswap_32 below, matching scanhash_keccak_2x64 — confirm this is the
// intended convention for keccak-family algos.
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )  // quick reject
{
extr_lane_2x64( lane_hash, hash, lane, 256 );           // de-interleave this lane
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
// Advance both lane nonces by 2 (upper 32 bits of each 64-bit lane word).
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();
#endif

View File

@@ -4,346 +4,267 @@
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake1, blake2;
sph_bmw512_context bmw1, bmw2, bmw3;
sph_skein512_context skein1, skein2;
sph_jh512_context jh1, jh2;
sph_keccak512_context keccak1, keccak2;
hashState_luffa luffa1, luffa2;
cubehashParam cube;
sph_shavite512_context shavite1, shavite2;
#if defined(__aarch64__)
sph_simd512_context simd1, simd2;
#else
hashState_sd simd1, simd2;
#endif
sph_hamsi512_context hamsi1;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
sph_sha512_context sha1, sha2;
sph_haval256_5_context haval1, haval2;
#if defined(__AES__)
hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2;
hashState_fugue fugue1, fugue2;
#else
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
sph_fugue512_context fugue1, fugue2;
#endif
} hmq1725_ctx_holder;
static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
void init_hmq1725_ctx()
union _hmq1725_ctx_holder
{
sph_blake512_init(&hmq1725_ctx.blake1);
sph_blake512_init(&hmq1725_ctx.blake2);
sph_bmw512_init(&hmq1725_ctx.bmw1);
sph_bmw512_init(&hmq1725_ctx.bmw2);
sph_bmw512_init(&hmq1725_ctx.bmw3);
sph_skein512_init(&hmq1725_ctx.skein1);
sph_skein512_init(&hmq1725_ctx.skein2);
sph_jh512_init(&hmq1725_ctx.jh1);
sph_jh512_init(&hmq1725_ctx.jh2);
sph_keccak512_init(&hmq1725_ctx.keccak1);
sph_keccak512_init(&hmq1725_ctx.keccak2);
init_luffa( &hmq1725_ctx.luffa1, 512 );
init_luffa( &hmq1725_ctx.luffa2, 512 );
cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
sph_shavite512_init(&hmq1725_ctx.shavite1);
sph_shavite512_init(&hmq1725_ctx.shavite2);
#if defined(__aarch64__)
sph_simd512_init(&hmq1725_ctx.simd1);
sph_simd512_init(&hmq1725_ctx.simd2);
#else
init_sd( &hmq1725_ctx.simd1, 512 );
init_sd( &hmq1725_ctx.simd2, 512 );
#endif
sph_hamsi512_init(&hmq1725_ctx.hamsi1);
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
fugue512_Init( &hmq1725_ctx.fugue1, 512 );
fugue512_Init( &hmq1725_ctx.fugue2, 512 );
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_fugue512_init(&hmq1725_ctx.fugue1);
sph_fugue512_init(&hmq1725_ctx.fugue2);
sph_groestl512_context groestl;
sph_fugue512_context fugue;
#endif
sph_shabal512_init(&hmq1725_ctx.shabal1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
sph_sha512_init( &hmq1725_ctx.sha1 );
sph_sha512_init( &hmq1725_ctx.sha2 );
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
#if defined(__AES__)
init_echo( &hmq1725_ctx.echo1, 512 );
init_echo( &hmq1725_ctx.echo2, 512 );
init_groestl( &hmq1725_ctx.groestl1, 64 );
init_groestl( &hmq1725_ctx.groestl2, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
sph_echo512_context echo;
#endif
}
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha;
sph_haval256_5_context haval;
};
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
void hmq_bmw512_midstate( const void* input )
{
memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
sph_bmw512( &hmq_bmw_mid, input, 64 );
}
__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
//static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
//static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
uint32_t hashA[32] __attribute__((aligned(32)));
uint32_t hashB[32] __attribute__((aligned(32)));
hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, input, 80 );
sph_bmw512_close( &ctx.bmw, hashA ); //1
memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
sph_bmw512_close(&h_ctx.bmw1, hashA); //1
sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0
sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //0
sph_whirlpool_close( &ctx.whirlpool, hashB ); //1
if ( hashB[0] & mask ) //1
{
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
#else
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashB, 64 ); //1
sph_groestl512_close( &ctx.groestl, hashA ); //2
#endif
}
else
{
sph_skein512 (&h_ctx.skein1, hashB, 64); //1
sph_skein512_close(&h_ctx.skein1, hashA); //2
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, hashB, 64 ); //1
sph_skein512_close( &ctx.skein, hashA ); //2
}
sph_jh512 (&h_ctx.jh1, hashA, 64); //3
sph_jh512_close(&h_ctx.jh1, hashB); //4
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashA, 64 ); //3
sph_jh512_close( &ctx.jh, hashB ); //4
sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
sph_keccak512_close(&h_ctx.keccak1, hashA); //3
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //2
sph_keccak512_close( &ctx.keccak, hashA ); //3
if ( hashA[0] & mask ) //4
{
sph_blake512 (&h_ctx.blake1, hashA, 64); //
sph_blake512_close(&h_ctx.blake1, hashB); //5
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
}
else
{
sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
sph_bmw512_close(&h_ctx.bmw2, hashB); //5
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashA, 64 ); //4
sph_bmw512_close( &ctx.bmw, hashB ); //5
}
update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
if ( hashB[0] & mask ) //7
{
sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
sph_keccak512_close(&h_ctx.keccak2, hashA); //8
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //
sph_keccak512_close( &ctx.keccak, hashA ); //8
}
else
{
sph_jh512 (&h_ctx.jh2, hashB, 64); //7
sph_jh512_close(&h_ctx.jh2, hashA); //8
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashB, 64 ); //7
sph_jh512_close( &ctx.jh, hashA ); //8
}
sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
sph_shavite512_close(&h_ctx.shavite1, hashB); //4
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashA, 64 ); //3
sph_shavite512_close( &ctx.shavite, hashB ); //4
#if defined(__aarch64__)
sph_simd512 (&h_ctx.simd1, hashB, 64); //3
sph_simd512_close(&h_ctx.simd1, hashA); //4
#else
update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#endif
simd512_ctx( &ctx.simd, hashA, hashB, 64 );
if ( hashA[0] & mask ) //4
{
sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
else
{
sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
sph_haval256_5_close(&h_ctx.haval1, hashB); //5
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //4
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset(&hashB[8], 0, 32);
}
#if defined(__AES__)
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashA, 512, hashB, 64 );
#else
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashB, 64 ); //5
sph_echo512_close( &ctx.echo, hashA ); //6
#endif
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
sph_blake512_close(&h_ctx.blake2, hashB); //7
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
if ( hashB[0] & mask ) //7
{
sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
sph_shavite512_close(&h_ctx.shavite2, hashA); //8
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashB, 64 ); //
sph_shavite512_close( &ctx.shavite, hashA ); //8
}
else
{
update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
}
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2 ////
fugue512_Final( &h_ctx.fugue1, hashA ); //3
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 ////
sph_fugue512_close(&h_ctx.fugue1, hashA); //3
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //2 ////
sph_fugue512_close( &ctx.fugue, hashA ); //3
#endif
if ( hashA[0] & mask ) //4
{
#if defined(__AES__)
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashB, 512, hashA, 64 );
#else
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashA, 64 ); //
sph_echo512_close( &ctx.echo, hashB ); //5
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&h_ctx.simd2, hashA, 64); //6
sph_simd512_close(&h_ctx.simd2, hashB); //7
#else
update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, 64 );
sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
sph_shabal512_close(&h_ctx.shabal1, hashA); //6
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hashB, 64 ); //5
sph_shabal512_close( &ctx.shabal, hashA ); //6
sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
sph_whirlpool_close( &ctx.whirlpool, hashB ); //7
if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
fugue512_Final( &h_ctx.fugue2, hashA ); //8
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
sph_fugue512_close(&h_ctx.fugue2, hashA); //8
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //
sph_fugue512_close( &ctx.fugue, hashA ); //8
#endif
}
else
{
sph_sha512( &h_ctx.sha1, hashB, 64 );
sph_sha512_close( &h_ctx.sha1, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
}
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
(const char*)hashA, 512 );
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
#else
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashA, 64 ); //3
sph_groestl512_close( &ctx.groestl, hashB ); //4
#endif
sph_sha512( &h_ctx.sha2, hashB, 64 );
sph_sha512_close( &h_ctx.sha2, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
if ( hashA[0] & mask ) //4
{
sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
sph_haval256_5_close(&h_ctx.haval2, hashB); //5
memset(&hashB[8], 0, 32);
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset( &hashB[8], 0, 32 );
}
else
{
sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
sph_bmw512_close(&h_ctx.bmw3, hashA); //6
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashB, 64 ); //5
sph_bmw512_close( &ctx.bmw, hashA ); //6
memcpy(state, hashA, 32);
memcpy( state, hashA, 32 );
}
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
@@ -356,7 +277,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
hmq_bmw512_midstate( endiandata );
// hmq_bmw512_midstate( endiandata );
// if (opt_debug)
// {

View File

@@ -23,13 +23,12 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
*sptr = '\0';
}
static __thread x16r_context_overlay hex_ctx;
int hex_hash( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &hex_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
@@ -52,7 +51,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
@@ -87,7 +86,7 @@ int hex_hash( void* output, const void* input, int thrid )
case LUFFA:
if ( i == 0 )
{
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
}
else
{
@@ -97,7 +96,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
{
cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -108,26 +107,15 @@ int hex_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
simd512_ctx( &ctx.simd, hash, in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,
(const BitSequence *)in, size );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
@@ -216,32 +204,32 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
switch ( algo )
{
case JH:
sph_jh512_init( &hex_ctx.jh );
sph_jh512( &hex_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &hex_ctx.skein );
sph_skein512( &hex_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case LUFFA:
init_luffa( &hex_ctx.luffa, 512 );
update_luffa( &hex_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &hex_ctx.cube, 512, 16, 32 );
cubehashUpdate( &hex_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &hex_ctx.hamsi );
sph_hamsi512( &hex_ctx.hamsi, edata, 64 );
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 64 );
break;
case SHABAL:
sph_shabal512_init( &hex_ctx.shabal );
sph_shabal512( &hex_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &hex_ctx.whirlpool );
sph_whirlpool( &hex_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}

View File

@@ -14,9 +14,9 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#endif
//#if defined(__aarch64__)
// #include "algo/simd/sph_simd.h"
//#endif
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -24,10 +24,14 @@
#include "algo/yespower/yespower.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/echo/sph_echo.h"
#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "algo/echo/sph_echo.h"
#include "algo/groestl/sph_groestl.h"
#if defined(__AES__)
#include "algo/fugue/fugue-aesni.h"
#else

View File

@@ -971,4 +971,405 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_2WAY)
// Pre-hash the constant leading bytes of the 80-byte block header for the
// FIRST algo in the current x16r hash order, saving the midstate in the
// thread-local x16r_ctx. The per-nonce scan loop then only has to process
// the remaining tail bytes (which include the nonce) for step 0.
// vdata: output, 2x64-bit interleaved header data (2 lanes).
// pdata: input, one 80-byte little-endian header (byte-swapped here).
// NOTE(review): the prehash length per algo (64/72/76) must match the tail
// length consumed in x16r_2x64_hash_generic for the same algo — verify both
// sides when changing either.
void x16r_2x64_prehash( void *vdata, void *pdata )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
// 2-way parallel: interleave first, prehash 64 bytes.
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16r_ctx.jh );
jh512_2x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
// Keccak-512 block size is 72 bytes, so 72 can be prehashed.
v128_bswap32_intrlv80_2x64( vdata, pdata );
keccak512_2x64_init( &x16r_ctx.keccak );
keccak512_2x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
// 1-way implementation: prehash on the linear header, then
// interleave the same data into both lanes for the scan loop.
v128_bswap32_80( edata, pdata );
init_luffa( &x16r_ctx.luffa, 512 );
update_luffa( &x16r_ctx.luffa, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case CUBEHASH:
{
v128_bswap32_80( edata, pdata );
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
// 2-way Hamsi available: prehash 72 bytes in interleaved form.
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16r_ctx.hamsi );
hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16r_ctx.hamsi );
sph_hamsi512( &x16r_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
// Fugue can absorb 76 bytes, leaving only the 4-byte nonce.
v128_bswap32_80( edata, pdata );
#if defined(__AES__)
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16r_ctx.fugue );
sph_fugue512( &x16r_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16r_ctx.shabal );
sph_shabal512( &x16r_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
case WHIRLPOOL:
v128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16r_ctx.whirlpool );
sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
// No midstate optimization for this algo: just byte-swap and
// interleave the header for the 2-way scan loop.
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
}
// Run the full 16-step x16r hash chain on two interleaved lanes.
// input: 2x64-bit interleaved data for two 80-byte headers (640 bytes).
// output: two concatenated 64-byte digests (lane 0 at offset 0, lane 1 at 64).
// Returns 0 if a work restart was requested between steps, 1 on completion.
// Step 0 (i == 0) resumes from the midstate saved by x16r_2x64_prehash via
// the thread-local x16r_ctx; later steps reinitialize their contexts.
int x16r_2x64_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16r_2x64_context_overlay ctx;
// Copy the prehashed midstate; also re-copied below whenever a 1-way
// context is consumed by lane 0 and needed again for lane 1.
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;   // 80 bytes for step 0, 64 for all later steps (set below)
// De-interleave into per-lane buffers for the 1-way (sph/scalar) algos.
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = x16r_hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
// 1-way Groestl: each lane hashed independently (full init inside).
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
if ( i == 0 )
// Resume midstate: 64 bytes prehashed, feed the last 16 bytes.
// Offset 64*2 because the lanes are 64-bit interleaved.
jh512_2x64_update( &ctx.jh, input + (64*2), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
if ( i == 0 )
// 72 bytes prehashed, feed the last 8 bytes.
keccak512_2x64_update( &ctx.keccak, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, size );
}
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
if ( i == 0 )
{
// Finish from midstate: feed the last 16 bytes of each lane.
// The 1-way context is consumed, so restore it before lane 1.
update_and_final_luffa( &ctx.luffa, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
update_and_final_luffa( &ctx.luffa, hash1, in1 + 64, 16 );
}
else
{
luffa_full( &ctx.luffa, hash0, 512, hash0, size );
luffa_full( &ctx.luffa, hash1, 512, hash1, size );
}
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
// 72 bytes prehashed 2-way, feed the last 8 bytes.
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
#if defined(__AES__)
if ( i == 0 )
{
// 76 bytes prehashed, feed only the 4-byte nonce.
// Note only the fugue member is restored here, not the whole ctx.
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
if ( i == 0 )
{
sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
// NOTE(review): this second restore of ctx looks redundant — both
// lane digests are already produced and step 1 reinitializes its
// own context. Harmless but worth confirming/removing.
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
}
else
{
sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, size );
}
break;
case SHA_512:
// NOTE(review): init is called again in the else branch below,
// making this first init redundant on that path (harmless).
sha512_2x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_2x64_update( &ctx.sha512, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, size );
}
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
// Abort between steps if new work arrived.
if ( work_restart[thrid].restart ) return 0;
size = 64;
}
memcpy( output, hash0, 64 );
memcpy( output+64, hash1, 64 );
return 1;
}
// Thin wrapper over the generic 2-lane chain: compute both 64-byte lane
// digests, then copy the first 32 bytes of each into the caller's buffer
// (lane 0 -> output[0..31], lane 1 -> output[32..63]).
// Returns 0 if the underlying hash aborted on a work restart, else 1.
int x16r_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t lane_digests[2*64] __attribute__ ((aligned (64)));
uint8_t *out = (uint8_t*) output;

if ( !x16r_2x64_hash_generic( lane_digests, input, thrid ) )
   return 0;

memcpy( out,      &lane_digests[ 0], 32 );
memcpy( out + 32, &lane_digests[64], 32 );
return 1;
}
// Scan nonces two at a time (2x64 SIMD lanes) for an x16r share.
// Recomputes the hash order and prehash midstate whenever ntime changes.
// On a valid hash, submits the solution with the matching per-lane nonce.
// Returns 0; progress is reported via pdata[19] and *hashes_done.
int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128_t *noncev = (v128_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);

if ( bench ) ptarget[7] = 0x0cff;

// The hash order is derived from the previous block hash (words 1-2).
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );

static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
   x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
   s_ntime = ntime;
   if ( opt_debug && !thr_id )
       applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}

x16r_2x64_prehash( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
   // Bug fix: the original had a stray ';' after this 'if', turning the
   // success check into a no-op so stale hash data from an aborted
   // (restarted) hash could still be tested and submitted.
   if ( x16r_2x64_hash( hash, vdata, thr_id ) )
   {
      for ( int i = 0; i < 2; i++ )
         if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
         {
            pdata[19] = bswap_32( n+i );
            submit_solution( work, hash+(i<<3), mythr );
         }
   }
   // Advance both lane nonces by 2 (each lane holds nonce in its high word).
   *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
   n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -13,10 +13,13 @@ __thread x16r_8way_context_overlay x16r_ctx;
__thread x16r_4way_context_overlay x16r_ctx;
#elif defined (X16R_2WAY)
__thread x16r_2x64_context_overlay x16r_ctx;
#endif
__thread x16r_context_overlay x16_ctx;
__thread x16r_context_overlay x16r_ref_ctx;
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
{
@@ -58,11 +61,15 @@ bool register_x16r_algo( algo_gate_t* gate )
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined (X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -76,11 +83,15 @@ bool register_x16rv2_algo( algo_gate_t* gate )
#elif defined (X16RV2_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#elif defined (X16RV2_2WAY)
gate->scanhash = (void*)&scanhash_x16rv2_2x64;
gate->hash = (void*)&x16rv2_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -94,11 +105,15 @@ bool register_x16s_algo( algo_gate_t* gate )
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined (X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -108,7 +123,6 @@ bool register_x16s_algo( algo_gate_t* gate )
//
// X16RT
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
{
int32_t maskedTime = timeStamp & 0xffffff80;
@@ -221,34 +235,42 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined (X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 256.0;
return true;
};
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined (X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0;
return true;
@@ -262,7 +284,7 @@ bool register_hex_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&x16r_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
return true;
@@ -274,20 +296,25 @@ bool register_hex_algo( algo_gate_t* gate )
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined (X21S_8WAY)
gate->scanhash = (void*)&scanhash_x21s_8way;
gate->hash = (void*)&x21s_8way_hash;
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
#elif defined (X16R_4WAY)
#elif defined (X21S_4WAY)
gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
#elif defined (X21S_2WAY)
gate->scanhash = (void*)&scanhash_x21s_2x64;
gate->hash = (void*)&x21s_2x64_hash;
gate->miner_thread_init = (void*)&x21s_2x64_thread_init;
#else
gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;

View File

@@ -7,13 +7,15 @@
#include <unistd.h>
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -21,13 +23,13 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha512-hash.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#endif
#if defined (__AVX2__)
//#if defined (__AVX2__)
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
@@ -39,7 +41,7 @@
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#endif
//#endif
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
@@ -48,28 +50,41 @@
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
// X16R, X16S
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16R_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#define X16RV2_8WAY 1
#define X16RT_8WAY 1
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RV2_4WAY 1
#define X16RT_4WAY 1
#define X21S_4WAY 1
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RV2_2WAY 1
#endif
// X16RT, VEIL
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16RT_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RT_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RT_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X21S_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X21S_2WAY 1
#endif
enum x16r_Algo {
BLAKE = 0,
BMW,
@@ -167,7 +182,6 @@ union _x16r_4way_context_overlay
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
hashState_luffa luffa1;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -187,34 +201,84 @@ int scanhash_x16r_4way( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_4way_context_overlay x16r_ctx;
#elif defined(X16R_2WAY)
// Union overlay of every hash context used by the 2x64 x16r chain.
// Only one algo's context is live at a time within a chain step, so a
// union keeps the per-thread state (and the per-hash copy in
// x16r_2x64_hash_generic) to the size of the largest member.
// Conditional members select the accelerated implementation when the
// required ISA extension is available, falling back to the sph_* versions.
union _x16r_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
} __attribute__ ((aligned (64)));
typedef union _x16r_2x64_context_overlay x16r_2x64_context_overlay;
void x16r_2x64_prehash( void *, void * );
int x16r_2x64_hash_generic( void *, const void *, int );
int x16r_2x64_hash( void *, const void *, int );
int scanhash_x16r_2x64( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_2x64_context_overlay x16r_ctx;
#endif
// need a reference, add hooks for SSE2.
// needed for hex
union _x16r_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
hashState_sd simd;
sph_echo512_context echo;
#endif
sph_hamsi512_context hamsi;
#if defined(__AES__)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
@@ -222,7 +286,7 @@ union _x16r_context_overlay
typedef union _x16r_context_overlay x16r_context_overlay;
extern __thread x16r_context_overlay x16_ctx;
extern __thread x16r_context_overlay x16r_ref_ctx;
void x16r_prehash( void *, void * );
int x16r_hash_generic( void *, const void *, int );
@@ -242,6 +306,12 @@ int x16rv2_4way_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RV2_2WAY)
int x16rv2_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
int x16rv2_hash( void *state, const void *input, int thr_id );
@@ -251,18 +321,24 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
#endif
// x16rt, veil
#if defined(X16R_8WAY)
#if defined(X16RT_8WAY)
//void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
#elif defined(X16RT_4WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RT_2WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
//void x16rt_hash( void *state, const void *input );
@@ -272,20 +348,27 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
#endif
// x21s
#if defined(X16R_8WAY)
#if defined(X21S_8WAY)
int x21s_8way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_8way_thread_init();
#elif defined(X16R_4WAY)
#elif defined(X21S_4WAY)
int x21s_4way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#elif defined(X21S_2WAY)
int x21s_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_2x64_thread_init();
#else
int x21s_hash( void *state, const void *input, int thr_id );

View File

@@ -18,32 +18,36 @@ void x16r_prehash( void *edata, void *pdata )
switch ( algo )
{
case JH:
sph_jh512_init( &x16_ctx.jh );
sph_jh512( &x16_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &x16_ctx.skein );
sph_skein512( &x16_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case KECCAK:
sph_keccak512_init( &x16r_ref_ctx.keccak );
sph_keccak512( &x16r_ref_ctx.keccak, edata, 72 );
break;
case LUFFA:
init_luffa( &x16_ctx.luffa, 512 );
update_luffa( &x16_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &x16_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &x16_ctx.hamsi );
sph_hamsi512( &x16_ctx.hamsi, edata, 64 );
break;
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 72 );
break;
case SHABAL:
sph_shabal512_init( &x16_ctx.shabal );
sph_shabal512( &x16_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &x16_ctx.whirlpool );
sph_whirlpool( &x16_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}
}
@@ -52,7 +56,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &x16_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
@@ -70,36 +74,41 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
sph_bmw512( &ctx.bmw, in, size );
sph_bmw512_close( &ctx.bmw, hash );
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512_close( &ctx.groestl, hash );
#endif
break;
case JH:
if ( i == 0 )
sph_jh512(&ctx.jh, in+64, 16 );
sph_jh512( &ctx.jh, in+64, 16 );
else
{
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512( &ctx.jh, in, size );
}
sph_jh512_close(&ctx.jh, hash );
sph_jh512_close( &ctx.jh, hash );
break;
case KECCAK:
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
if ( i == 0 )
sph_keccak512( &ctx.keccak, in+72, 8 );
else
{
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
}
sph_keccak512_close( &ctx.keccak, hash );
break;
case SKEIN:
if ( i == 0 )
sph_skein512(&ctx.skein, in+64, 16 );
sph_skein512( &ctx.skein, in+64, 16 );
else
{
sph_skein512_init( &ctx.skein );
@@ -109,13 +118,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case LUFFA:
if ( i == 0 )
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
else
luffa_full( &ctx.luffa, hash, 512, in, size );
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
cubehash_full( &ctx.cube, hash, 512, in, size );
break;
@@ -123,19 +132,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
sph_simd512( &ctx.simd, hash, size );
sph_simd512_close( &ctx.simd, hash );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence*)hash, 512,
(const BitSequence*)in, size );
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
@@ -144,7 +147,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
sph_hamsi512( &ctx.hamsi, in+64, 16 );
sph_hamsi512( &ctx.hamsi, in+72, 8 );
else
{
sph_hamsi512_init( &ctx.hamsi );
@@ -153,12 +156,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
sph_fugue512_full( &ctx.fugue, hash, in, size );
break;
case SHABAL:
if ( i == 0 )
sph_shabal512( &ctx.shabal, in+64, 16 );

View File

@@ -3,7 +3,7 @@
#include <stdlib.h>
#include <string.h>
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -57,7 +57,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -110,4 +110,55 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RT_2WAY)
// Scan nonces for x16rt (veil) using 2-way 64-bit SIMD lanes (SSE2/NEON).
// The hash-function order is derived from a time hash of ntime masked to a
// 128-second window, so it is recomputed only when the masked ntime changes.
// Returns 0; solutions are reported via submit_solution().
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) timeHash[4*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;   // 2 nonces tested per pass
uint32_t n = first_nonce;
const int thr_id = mythr->id;
// Nonce words live in the interleaved data at 128-bit vector offset 9.
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;   // easy target for benchmarking
// Hash order is per-thread cached; recompute on a new 128 s ntime window.
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
// NOTE(review): timeHash is an array passed to %08x; presumably only
// its first word is intended — confirm against other x16rt scanners.
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
// Midstate prehash for the first function in the order, then seed the
// two lane nonces (n and n+1) into the interleaved data.
x16r_2x64_prehash( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
// Bump both lane nonces by 2 (one 32-bit nonce per 64-bit lane).
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( ( n < last_nonce ) && !(*restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -1,6 +1,6 @@
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RT_8WAY) && !defined(X16RT_4WAY)
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )

View File

@@ -395,7 +395,7 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -409,14 +409,43 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
hash7, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
fugue512_full( &ctx.fugue, hash4, in4, size );
fugue512_full( &ctx.fugue, hash5, in5, size );
fugue512_full( &ctx.fugue, hash6, in6, size );
fugue512_full( &ctx.fugue, hash7, in7, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in4 + 76, 4 );
fugue512_final( &ctx.fugue, hash4 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in5 + 76, 4 );
fugue512_final( &ctx.fugue, hash5 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in6 + 76, 4 );
fugue512_final( &ctx.fugue, hash6 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in7 + 76, 4 );
fugue512_final( &ctx.fugue, hash7 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
fugue512_full( &ctx.fugue, hash4, hash4, size );
fugue512_full( &ctx.fugue, hash5, hash5, size );
fugue512_full( &ctx.fugue, hash6, hash6, size );
fugue512_full( &ctx.fugue, hash7, hash7, size );
}
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -588,7 +617,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
@@ -626,7 +655,14 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
@@ -824,8 +860,8 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, size );
skein512_4way_close( &ctx.skein, vhash );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case LUFFA:
@@ -945,7 +981,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -956,10 +992,27 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
}
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
@@ -1077,7 +1130,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
@@ -1101,7 +1154,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -1112,7 +1165,13 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata32, pdata );
@@ -1151,4 +1210,453 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RV2_2WAY)
// Union of all hash contexts used by x16rv2 2-way: only one function runs
// at a time, so overlaying them keeps the per-thread state compact.
// AES/NEON-accelerated contexts are selected where the ISA supports them,
// otherwise the portable sph reference contexts are used.
union _x16rv2_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
// Tiger is the extra stage x16rv2 inserts before Keccak/Luffa/SHA512.
sph_tiger_context tiger;
} __attribute__ ((aligned (64)));
typedef union _x16rv2_2x64_context_overlay x16rv2_2x64_context_overlay;
// Per-thread midstate contexts prepared by scanhash_x16rv2_2x64 and copied
// into a local overlay at the start of every hash.
static __thread x16rv2_2x64_context_overlay x16rv2_ctx;
// Zero-extend the 24 byte Tiger digest (6 words) to a full 64 byte block.
static inline void padtiger512( uint32_t* hash )
{
int w = 6;
while ( w < 16 )
hash[w++] = 0;
}
// X16Rv2 hash for 2 interleaved 64-bit lanes.
// x16rv2 modifies x16r by running Tiger and zero-padding its 24 byte digest
// to 64 bytes before the Keccak, Luffa and SHA512 stages (visible below).
// 'input' is the 2x64-interleaved 80 byte block data; 'output' receives the
// two 32 byte results back to back. Returns 0 if work was restarted
// mid-hash, 1 on completion.
int x16rv2_2x64_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16rv2_2x64_context_overlay ctx;
// Start from the thread-local midstate contexts prepared by scanhash.
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;
// De-interleave the 80 byte block into per-lane buffers for the
// single-lane functions; vectorized functions re-interleave as needed.
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = x16r_hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
// i == 0: continue from the 64 byte midstate prehash; the
// remaining 16 bytes start at interleaved byte offset 64*2.
if ( i == 0 )
jh512_2x64_update( &ctx.jh, input + (64<<1), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
// x16rv2: Tiger first (prehashed over the first 64 bytes when
// i == 0), pad to 64 bytes, then Keccak.
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
// Zero pad the 24 byte Tiger digest to 64 bytes (inner i shadows
// the round counter; the inner loop is self-contained).
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, 64 );
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
// x16rv2: Tiger, pad, then Luffa.
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
// Hamsi prehash covers 72 bytes (its block size), so only the
// final 8 bytes remain when i == 0.
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
// Fugue prehash covers 76 bytes; only 4 remain when i == 0.
#if defined(__AES__)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
break;
case SHA_512:
// x16rv2: Tiger, pad, then SHA512.
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, 64 );
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
if ( work_restart[thrid].restart ) return 0;
// After the first round every stage hashes the previous 64 byte digest.
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
return 1;
}
// Scan nonces for x16rv2 using 2-way 64-bit SIMD lanes (SSE2/NEON).
// Derives the 16-function order from the big-endian prev-hash words, then
// midstate-prehashes the first function before the nonce loop.
// Returns 0; solutions are reported via submit_solution().
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t edata[20];      // byte-swapped single-lane copy of the block
uint32_t bedata1[2];     // big-endian prev-hash words that pick the order
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;   // 2 nonces tested per pass
uint32_t n = first_nonce;
const int thr_id = mythr->id;
// Nonce words live in the interleaved data at 128-bit vector offset 9.
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0fff;   // easy target for benchmarking
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
// Hash order is per-thread cached and recomputed when ntime changes.
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
// The resulting context is stored in the thread-local x16rv2_ctx and
// re-used for every nonce by x16rv2_2x64_hash.
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16rv2_ctx.jh );
jh512_2x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
case SHA_512:
// x16rv2 inserts Tiger before these stages; prehash Tiger instead.
v128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case HAMSI:
// Hamsi's block size allows prehashing 72 of the 80 bytes.
#if defined(__SSE4_2__) || defined(__ARM_NEON)
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16rv2_ctx.hamsi );
hamsi512_2x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16rv2_ctx.hamsi );
sph_hamsi512( &x16rv2_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
// Fugue can absorb everything except the 4 byte nonce.
v128_bswap32_80( edata, pdata );
#if defined(__AES__)
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16rv2_ctx.fugue );
sph_fugue512( &x16rv2_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16rv2_ctx.shabal );
sph_shabal512( &x16rv2_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
// No midstate possible; just byte-swap and interleave the block.
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
// Seed lane nonces n and n+1, then walk the nonce space 2 at a time.
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16rv2_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -6,21 +6,15 @@
*/
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RV2_8WAY) && !defined(X16RV2_4WAY) && !defined(X16RV2_2WAY)
#include "algo/tiger/sph_tiger.h"
union _x16rv2_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -29,11 +23,7 @@ union _x16rv2_context_overlay
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -72,15 +62,9 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
@@ -117,25 +101,14 @@ int x16rv2_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512(&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
sph_hamsi512_init( &ctx.hamsi );
@@ -143,11 +116,7 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
case SHABAL:
sph_shabal512_init( &ctx.shabal );

View File

@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include "algo/haval/haval-hash-4way.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
@@ -351,4 +352,119 @@ bool x21s_4way_thread_init()
return x21s_4way_matrix;
}
#elif defined (X21S_2WAY)
// Per-thread Lyra2 work matrix, allocated once by x21s_2x64_thread_init().
static __thread uint64_t* x21s_2x64_matrix;
// Contexts for the 3 sph stages x21s appends after the x16r core; only one
// runs at a time so they share storage.
union _x21s_2x64_context_overlay
{
sph_haval256_5_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
} __attribute__ ((aligned (64)));
typedef union _x21s_2x64_context_overlay x21s_2x64_context_overlay;
// x21s hash for 2 lanes: x16r core followed by Haval256-5, Tiger,
// Lyra2REv2, GOST512 (Streebog) and SHA256. 'output' receives the two
// 32 byte results back to back. Returns 0 if the x16r core was aborted
// by a work restart, 1 on completion.
int x21s_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t shash[64*2] __attribute__ ((aligned (64)));
x21s_2x64_context_overlay ctx;
// The two 64 byte x16r digests are stored back to back in shash.
uint32_t *hash0 = (uint32_t*) shash;
uint32_t *hash1 = (uint32_t*)( shash+64 );
if ( !x16r_2x64_hash_generic( shash, input, thrid ) )
return 0;
// Haval256-5 over each 64 byte digest (produces 32 bytes in place).
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash0, 64 );
sph_haval256_5_close( &ctx.haval, hash0 );
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash1, 64 );
sph_haval256_5_close( &ctx.haval, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash1 );
// Lyra2REv2 with 1 time cost, 4 rows, 4 cols, in-place on 32 bytes.
LYRA2REV2( x21s_2x64_matrix, (void*) hash0, 32, (const void*) hash0, 32,
(const void*) hash0, 32, 1, 4, 4 );
LYRA2REV2( x21s_2x64_matrix, (void*) hash1, 32, (const void*) hash1, 32,
(const void*) hash1, 32, 1, 4, 4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
// Final SHA256 produces the 32 byte result for each lane.
sha256_full( output, hash0, 64 );
sha256_full( output+32, hash1, 64 );
return 1;
}
// Scan nonces for x21s using 2-way 64-bit SIMD lanes (SSE2/NEON).
// Order selection and prehash are shared with x16r; only the hash call
// differs. Returns 0; solutions are reported via submit_solution().
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;   // 2 nonces tested per pass
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
// Nonce words live in the interleaved data at 128-bit vector offset 9.
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;   // easy target for benchmarking
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
// Hash order is per-thread cached and recomputed when ntime changes.
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
}
// Midstate prehash for the first function, then seed lane nonces.
x16r_2x64_prehash( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x21s_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
// Bump both lane nonces by 2 (one 32-bit nonce per 64-bit lane).
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
// Allocate the per-thread Lyra2 work matrix for x21s (4 rows x 4 cols).
// Returns true on success, false if the aligned allocation failed.
bool x21s_2x64_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// NOTE(review): 'size' is declared int but computed from int64_t values;
// fits for these small dimensions, but confirm against other thread_init
// variants which may use int64_t.
const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
x21s_2x64_matrix = mm_malloc( size, 64 );
return x21s_2x64_matrix;
}
#endif

View File

@@ -15,7 +15,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X21S_8WAY) && !defined(X21S_4WAY)
static __thread uint64_t* x21s_matrix;

View File

@@ -1133,14 +1133,12 @@ int scanhash_x17_2x64( struct work *work, uint32_t max_nonce,
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,0);
pdata[19] = bswap_32( n );
// pdata[19] = n;
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,1);
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}

View File

@@ -5,24 +5,23 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -41,12 +40,15 @@ union _x22i_context_overlay
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -54,11 +56,7 @@ union _x22i_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -84,9 +82,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
groestl512_full( &ctx.groestl, hash, hash, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash, 64 );
@@ -109,26 +105,16 @@ int x22i_hash( void *output, const void *input, int thrid )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init(&ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, hash, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash, 64 );
@@ -192,8 +178,8 @@ int x22i_hash( void *output, const void *input, int thrid )
int scanhash_x22i( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];

View File

@@ -5,24 +5,23 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -44,12 +43,15 @@ union _x25x_context_overlay
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -57,11 +59,7 @@ union _x25x_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -89,9 +87,7 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_bmw512_close(&ctx.bmw, &hash[1]);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)&hash[2],
(const char*)&hash[1], 512 );
groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, &hash[1], 64 );
@@ -112,28 +108,18 @@ int x25x_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
init_luffa( &ctx.luffa, 512 );
luffa_full( &ctx.luffa, &hash[6], 512, &hash[5], 64 );
luffa_full( &ctx.luffa, (void*)&hash[6], 512, (const void*)&hash[5], 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, &hash[7], &hash[6], 64 );
cubehash_full( &ctx.cube, (void*)&hash[7], 512, (const void*)&hash[6], 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64);
sph_shavite512_close(&ctx.shavite, &hash[8]);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) &hash[8], 64);
sph_simd512_close(&ctx.simd, &hash[9] );
#else
update_final_sd( &ctx.simd, (BitSequence *)&hash[9],
(const BitSequence *)&hash[8], 512 );
#endif
simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)&hash[10],
(const BitSequence*)&hash[9], 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, &hash[9], 64 );
@@ -227,8 +213,8 @@ int x25x_hash( void *output, const void *input, int thrid )
int scanhash_x25x( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -245,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
do
{
edata[19] = n;
if ( x25x_hash( hash64, edata, thr_id ) )
if ( x25x_hash( hash64, edata, thr_id ) );
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.10.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.12.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.10'
PACKAGE_STRING='cpuminer-opt 23.10'
PACKAGE_VERSION='23.12'
PACKAGE_STRING='cpuminer-opt 23.12'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.10 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.12 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.10:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.12:";;
esac
cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.10
cpuminer-opt configure 23.12
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.10, which was
It was created by cpuminer-opt $as_me 23.12, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.10'
VERSION='23.12'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.10, which was
This file was extended by cpuminer-opt $as_me 23.12, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 23.10
cpuminer-opt config.status 23.12
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [23.10])
AC_INIT([cpuminer-opt], [23.12])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

4355
configure~

File diff suppressed because it is too large Load Diff