v3.9.6

2026-07-14 10:56:50 +00:00 · 2019-07-17 17:54:38 -04:00
parent e2d5762ef2
commit 6f49ba09b7
34 changed files with 1930 additions and 382 deletions
--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -36,35 +36,31 @@ void argon2d_crds_hash( void *output, const void *input )
 int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t _ALIGN(64) endiandata[20];
-        uint32_t _ALIGN(64) hash[8];
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-        int thr_id = mythr->id;  // thr_id arg is deprecated
+   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) hash[8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t nonce = first_nonce;

-        const uint32_t first_nonce = pdata[19];
-        const uint32_t Htarg = ptarget[7];
+   swab32_array( endiandata, pdata, 20 );

-        uint32_t nonce = first_nonce;
+   do {
+      be32enc(&endiandata[19], nonce);
+      argon2d_crds_hash( hash, endiandata );
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
+      {
+          pdata[19] = nonce;
+          submit_solution( work, hash, mythr );
+      }
+      nonce++;
+   } while (nonce < max_nonce && !work_restart[thr_id].restart);

-        swab32_array( endiandata, pdata, 20 );
-
-        do {
-                be32enc(&endiandata[19], nonce);
-                argon2d_crds_hash( hash, endiandata );
-                if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
-                {
-                        pdata[19] = nonce;
-                        *hashes_done = pdata[19] - first_nonce;
-                        work_set_target_ratio(work, hash);
-                        return 1;
-                }
-                nonce++;
-        } while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-        pdata[19] = nonce;
-        *hashes_done = pdata[19] - first_nonce + 1;
-        return 0;
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
 }

 bool register_argon2d_crds_algo( algo_gate_t* gate )
@@ -107,35 +103,32 @@ void argon2d_dyn_hash( void *output, const void *input )
 int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t _ALIGN(64) endiandata[20];
-        uint32_t _ALIGN(64) hash[8];
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-        int thr_id = mythr->id;  // thr_id arg is deprecated
+   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) hash[8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t nonce = first_nonce;

-        const uint32_t first_nonce = pdata[19];
-        const uint32_t Htarg = ptarget[7];
+   swab32_array( endiandata, pdata, 20 );

-        uint32_t nonce = first_nonce;
+   do
+   {
+      be32enc(&endiandata[19], nonce);
+      argon2d_dyn_hash( hash, endiandata );
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
+      {
+          pdata[19] = nonce;
+          submit_solution( work, hash, mythr );
+      }
+      nonce++;
+  } while (nonce < max_nonce && !work_restart[thr_id].restart);

-        swab32_array( endiandata, pdata, 20 );
-
-        do {
-                be32enc(&endiandata[19], nonce);
-                argon2d_dyn_hash( hash, endiandata );
-                if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
-                {
-                        pdata[19] = nonce;
-                        *hashes_done = pdata[19] - first_nonce;
-                        work_set_target_ratio(work, hash);
-                        return 1;
-                }
-                nonce++;
-        } while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-        pdata[19] = nonce;
-        *hashes_done = pdata[19] - first_nonce + 1;
-        return 0;
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
 }

 bool register_argon2d_dyn_algo( algo_gate_t* gate )
@@ -171,11 +164,10 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
      be32enc( &endiandata[19], n );
      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
                 (char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
-      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) )
+      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
      {
-         *hashes_done = n - first_nonce + 1;
         pdata[19] = n;
-         return true;
+         submit_solution( work, vhash, mythr );
      }
      n++;

--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -0,0 +1,59 @@
+#include "bmw512-gate.h"
+
+#ifdef BMW512_4WAY
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+//#include "sph_keccak.h"
+#include "bmw-hash-4way.h"
+
+void bmw512hash_4way(void *state, const void *input)   // one-shot BMW-512 over `input` with a fresh 4-way context
+{
+    bmw512_4way_context ctx;
+    bmw512_4way_init( &ctx );
+    bmw512_4way( &ctx, input, 80 );   // length argument is fixed at 80 (presumably bytes per lane; confirm)
+    bmw512_4way_close( &ctx, state );   // digest goes straight into the caller's buffer, still in the 4-way layout
+}
+
+int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,   // nonce scan, 4 nonces per pass; hits are submitted in-loop
+                          uint64_t *hashes_done, struct thr_info *mythr )   // *hashes_done is written once, after the loop
+{
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t hash[16*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[25]);   // 3*8+1; hash7[lane<<1] is hash[25 + 2*lane], the word tested by the pre-filter below
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];   // nonce assigned to lane 0 of the current pass
+   const uint32_t first_nonce = pdata[19];
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned; 256-bit slot 9 of vdata, rewritten with fresh nonces each pass
+//   const uint32_t Htarg = ptarget[7];
+    int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );   // presumably byte-swaps and 4x64-interleaves the 80-byte header; confirm
+   do {
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );   // lanes 0..3 get n..n+3
+	
+      bmw512hash_4way( hash, vdata );
+
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )   // cheap pre-filter: upper 24 bits must be zero before fulltest
+      {
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) )   // NOTE(review): no !opt_benchmark guard here, unlike scanhash_x13bcd_4way; confirm intended
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 4;
+
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);   // NOTE(review): max_nonce-4 is unsigned and wraps when max_nonce < 4
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;   // always 0: solutions are reported via submit_lane_solution, not the return value
+}
+
+#endif
--- a/algo/bmw/bmw512-gate.c
+++ b/algo/bmw/bmw512-gate.c
@@ -0,0 +1,20 @@
+#include "bmw512-gate.h"
+
+int64_t bmw512_get_max64() { return 0x7ffffLL; }   // fixed value installed as gate->get_max64 in register_bmw512_algo
+
+bool register_bmw512_algo( algo_gate_t* gate )   // fills the gate's function pointers for bmw512; always returns true
+{
+  gate->optimizations = AVX2_OPT;
+  gate->set_target      = (void*)&alt_set_target;
+  gate->get_max64       = (void*)&bmw512_get_max64;
+#if defined (BMW512_4WAY)
+  gate->scanhash  = (void*)&scanhash_bmw512_4way;   // 4-way pair when BMW512_4WAY is defined
+  gate->hash      = (void*)&bmw512hash_4way;
+#else
+  gate->scanhash        = (void*)&scanhash_bmw512;   // scalar fallback pair
+  gate->hash            = (void*)&bmw512hash;
+#endif
+  return true;
+};   // NOTE(review): stray ';' after the function body
+
+
--- a/algo/bmw/bmw512-gate.h
+++ b/algo/bmw/bmw512-gate.h
@@ -0,0 +1,23 @@
+#ifndef BMW512_GATE_H__
+#define BMW512_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__)
+  #define BMW512_4WAY 1
+#endif
+
+#if defined(BMW512_4WAY)
+
+void bmw512hash_4way( void *state, const void *input );
+int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
+void bmw512hash( void *state, const void *input );
+int scanhash_bmw512( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
--- a/algo/bmw/bmw512.c
+++ b/algo/bmw/bmw512.c
@@ -0,0 +1,53 @@
+#include "algo-gate-api.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "sph_bmw.h"
+
+void bmw512hash(void *state, const void *input)   // scalar BMW-512 of `input`; only the first 32 bytes of the digest reach `state`
+{
+    sph_bmw512_context ctx;
+    uint32_t hash[32];	// scratch for the full digest (128 bytes reserved)
+   
+    sph_bmw512_init( &ctx );
+    sph_bmw512( &ctx,input, 80 );   // length argument is fixed at 80
+    sph_bmw512_close( &ctx, hash );
+
+    memcpy( state, hash, 32 );   // truncate: caller receives 32 bytes
+}
+
+int scanhash_bmw512( struct work *work, uint32_t max_nonce,   // scalar nonce scan; returns true on the first hit, 0 otherwise
+                     uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+	uint32_t n = pdata[19] - 1;   // one below the start so the ++n in the loop tests pdata[19] first
+	const uint32_t first_nonce = pdata[19];
+	//const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+	uint32_t _ALIGN(32) hash64[8];
+	uint32_t endiandata[32];   // only words 0..19 are written in this function
+
+   for (int i=0; i < 19; i++) 
+           be32enc(&endiandata[i], pdata[i]);
+
+	do {
+	
+		pdata[19] = ++n;
+		be32enc(&endiandata[19], n); 
+		bmw512hash(hash64, endiandata);
+        if (((hash64[7]&0xFFFFFF00)==0) &&    // cheap pre-filter before the full target compare
+				fulltest(hash64, ptarget)) {
+            *hashes_done = n - first_nonce + 1;
+			return true;   // NOTE(review): returns on hit without a submit call, unlike scanhash_bmw512_4way; confirm the caller handles it
+		}
+	} while (n < max_nonce && !work_restart[thr_id].restart);
+	
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
+
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -7,6 +7,7 @@

 // 2x128

+/*
 // The result of hashing 10 rounds of initial data which consists of params
 // zero padded.
 static const uint64_t IV256[] =
@@ -24,13 +25,14 @@ static const uint64_t IV512[] =
 0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };
+*/

 static void transform_2way( cube_2way_context *sp )
 {
    int r;
    const int rounds = sp->rounds;

-    __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+    __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;

    x0 = _mm256_load_si256( (__m256i*)sp->h     );
    x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
@@ -47,18 +49,12 @@ static void transform_2way( cube_2way_context *sp )
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
        x7 = _mm256_add_epi32( x3, x7 );
-        y0 = x2;
-        y1 = x3;
-        y2 = x0;
-        y3 = x1;
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0,  7 ),
-                               _mm256_srli_epi32( y0, 25 ) );
-        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1,  7 ),
-                               _mm256_srli_epi32( y1, 25 ) );
-        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2,  7 ),
-                               _mm256_srli_epi32( y2, 25 ) );
-        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3,  7 ),
-                               _mm256_srli_epi32( y3, 25 ) );
+        y0 = x0;
+        y1 = x1;
+        x0 = mm256_rol_32( x2, 7 );
+        x1 = mm256_rol_32( x3, 7 );
+        x2 = mm256_rol_32( y0, 7 );
+        x3 = mm256_rol_32( y1, 7 );
        x0 = _mm256_xor_si256( x0, x4 );
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
@@ -71,18 +67,12 @@ static void transform_2way( cube_2way_context *sp )
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
        x7 = _mm256_add_epi32( x3, x7 );
-        y0 = x1;
-        y1 = x0;
-        y2 = x3;
-        y3 = x2;
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
-                               _mm256_srli_epi32( y0, 21 ) );
-        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
-                               _mm256_srli_epi32( y1, 21 ) );
-        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 11 ),
-                               _mm256_srli_epi32( y2, 21 ) );
-        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 11 ),
-                               _mm256_srli_epi32( y3, 21 ) );
+        y0 = x0;
+        y1 = x2;
+        x0 = mm256_rol_32( x1, 11 );
+        x1 = mm256_rol_32( y0, 11 );
+        x2 = mm256_rol_32( x3, 11 );
+        x3 = mm256_rol_32( y1, 11 );
        x0 = _mm256_xor_si256( x0, x4 );
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
@@ -107,23 +97,40 @@ static void transform_2way( cube_2way_context *sp )
 int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
                    int blockbytes )
 {
-    const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
+    __m128i* h = (__m128i*)sp->h;
    sp->hashlen   = hashbitlen/128;
    sp->blocksize = blockbytes/16;
    sp->rounds    = rounds;
    sp->pos       = 0;

-    __m256i* h = (__m256i*)sp->h;
-
-    h[0] = _mm256_set_epi64x( iv[ 1], iv[ 0], iv[ 1], iv[ 0] );
-    h[1] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 3], iv[ 2] );
-    h[2] = _mm256_set_epi64x( iv[ 5], iv[ 4], iv[ 5], iv[ 4] );
-    h[3] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 7], iv[ 6] );
-    h[4] = _mm256_set_epi64x( iv[ 9], iv[ 8], iv[ 9], iv[ 8] );
-    h[5] = _mm256_set_epi64x( iv[11], iv[10], iv[11], iv[10] );
-    h[6] = _mm256_set_epi64x( iv[13], iv[12], iv[13], iv[12] );
-    h[7] = _mm256_set_epi64x( iv[15], iv[14], iv[15], iv[14] );
+    if ( hashbitlen == 512 )
+    {

+       h[ 0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
+       h[ 2] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
+       h[ 4] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
+       h[ 6] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
+       h[ 8] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
+       h[10] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
+       h[12] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
+       h[14] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
+       h[1] = h[ 0];  h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
+       h[9] = h[ 8];  h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
+    }
+    else
+    {
+       h[ 0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
+       h[ 2] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
+       h[ 4] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
+       h[ 6] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
+       h[ 8] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
+       h[10] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
+       h[12] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
+       h[14] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
+       h[1] = h[ 0];  h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
+       h[9] = h[ 8];  h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
+    }
+    
    return 0;
 }

@@ -165,7 +172,7 @@ int cube_2way_close( cube_2way_context *sp, void *output )

    for ( i = 0; i < 10; ++i )           transform_2way( sp );

-    for ( i = 0; i < sp->hashlen; i++ )  hash[i] = sp->h[i];
+    memcpy( hash, sp->h, sp->hashlen<<5 );
    return 0;
 }

@@ -198,7 +205,7 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,

    for ( i = 0; i < 10; ++i )            transform_2way( sp );

-    for ( i = 0; i < sp->hashlen; i++ )   hash[i] = sp->h[i];
+    memcpy( hash, sp->h, sp->hashlen<<5 );
    return 0;
 }

--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -16,24 +16,6 @@
 #include "simd-utils.h"
 #include <stdio.h>

-// The result of hashing 10 rounds of initial data which is params and 
-// mostly zeros.
-static const uint64_t IV256[] =
-{
-0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
-0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
-0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
-0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
-};
-
-static const uint64_t IV512[] =
-{
-0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
-0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
-0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
-0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
-};
-
 static void transform( cubehashParam *sp )
 {
    int r;
@@ -53,26 +35,22 @@ static void transform( cubehashParam *sp )
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
        y0 = x0;
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( x1, 7 ),
-                               _mm256_srli_epi32( x1, 25 ) );
-        x1 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
-                               _mm256_srli_epi32( y0, 25 ) );
+        x0 = mm256_rol_32( x1, 7 );
+        x1 = mm256_rol_32( y0, 7 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = _mm256_shuffle_epi32( x2, 0x4e );
-        x3 = _mm256_shuffle_epi32( x3, 0x4e );
+        x2 = mm256_swap64_128( x2 );
+        x3 = mm256_swap64_128( x3 );
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
-        y0 = _mm256_permute4x64_epi64( x0, 0x4e );
-        y1 = _mm256_permute4x64_epi64( x1, 0x4e );
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
-                               _mm256_srli_epi32( y0, 21 ) );
-        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ), 
-                               _mm256_srli_epi32( y1, 21 ) );
+        y0 = mm256_swap_128( x0 );
+        y1 = mm256_swap_128( x1 );
+        x0 = mm256_rol_32( y0, 11 );
+        x1 = mm256_rol_32( y1, 11 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = _mm256_shuffle_epi32( x2, 0xb1 );
-        x3 = _mm256_shuffle_epi32( x3, 0xb1 );
+        x2 = mm256_swap32_64( x2 );
+        x3 = mm256_swap32_64( x3 );
    }

    _mm256_store_si256( (__m256i*)sp->x,     x0 );
@@ -147,37 +125,58 @@ static void transform( cubehashParam *sp )
 #endif
 }  // transform

+/*
+// The result of hashing 10 rounds of initial data which is params and
+// mostly zeros.
+static const uint64_t IV256[] =
+{
+0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
+0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
+0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
+0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
+};
+
+static const uint64_t IV512[] =
+{
+0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
+0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
+0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
+0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
+};
+*/
+
 int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
 {
-    const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
+    __m128i *x = (__m128i*)sp->x;
    sp->hashlen   = hashbitlen/128;
    sp->blocksize = blockbytes/16;
    sp->rounds    = rounds;
    sp->pos       = 0;
-    
-#if defined(__AVX2__)

-    __m256i* x = (__m256i*)sp->x;
+    if ( hashbitlen == 512 )
+    {

-    x[0] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 1], iv[ 0] );
-    x[1] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 5], iv[ 4] );
-    x[2] = _mm256_set_epi64x( iv[11], iv[10], iv[ 9], iv[ 8] );
-    x[3] = _mm256_set_epi64x( iv[15], iv[14], iv[13], iv[12] );
+       x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
+       x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
+       x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
+       x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
+       x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
+       x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
+       x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
+       x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
+    }
+    else
+    {
+       x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
+       x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
+       x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
+       x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
+       x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
+       x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
+       x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
+       x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
+    }   

-#else
-
-    __m128i* x = (__m128i*)sp->x;
-
-     x[0] = _mm_set_epi64x( iv[ 1], iv[ 0] );
-     x[1] = _mm_set_epi64x( iv[ 3], iv[ 2] );
-     x[2] = _mm_set_epi64x( iv[ 5], iv[ 4] );
-     x[3] = _mm_set_epi64x( iv[ 7], iv[ 6] );
-     x[4] = _mm_set_epi64x( iv[ 9], iv[ 8] );
-     x[5] = _mm_set_epi64x( iv[11], iv[10] );
-     x[6] = _mm_set_epi64x( iv[13], iv[12] );
-     x[7] = _mm_set_epi64x( iv[15], iv[14] );
-
-#endif
    return SUCCESS;
 }

--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -323,7 +323,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
     mpz_clears(magipi, magisw, product, bns0, bns1, NULL);

    *hashes_done = n - first_nonce + 1;
-    return rc;
+    return 0;
 }

 bool register_m7m_algo( algo_gate_t *gate )
--- a/algo/x13/x13bcd-4way.c
+++ b/algo/x13/x13bcd-4way.c
@@ -0,0 +1,283 @@
+#include "x13sm3-gate.h"
+
+#if defined(X13SM3_4WAY)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+//#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/simd-hash-2way.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/sm3/sm3-hash-4way.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+#include "algo/fugue/sph_fugue.h"
+
+typedef struct {   // one context per hash stage used by x13bcd_4way_hash
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_groestl       groestl;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+//    luffa_2way_context      luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    simd_2way_context       simd;
+    hashState_echo          echo;
+    sm3_4way_ctx_t          sm3;
+    hamsi512_4way_context   hamsi;
+    sph_fugue512_context    fugue;
+} x13bcd_4way_ctx_holder;
+
+x13bcd_4way_ctx_holder x13bcd_4way_ctx __attribute__ ((aligned (64)));
+static __thread blake512_4way_context x13bcd_ctx_mid;
+
+void init_x13bcd_4way_ctx()   // initialises every member of the global x13bcd_4way_ctx template
+{
+     blake512_4way_init( &x13bcd_4way_ctx.blake );
+     bmw512_4way_init( &x13bcd_4way_ctx.bmw );
+     init_groestl( &x13bcd_4way_ctx.groestl, 64 );
+     skein512_4way_init( &x13bcd_4way_ctx.skein );
+     jh512_4way_init( &x13bcd_4way_ctx.jh );
+     keccak512_4way_init( &x13bcd_4way_ctx.keccak );
+//     luffa_2way_init( &x13bcd_4way_ctx.luffa, 512 );
+     cubehashInit( &x13bcd_4way_ctx.cube, 512, 16, 32 );
+     sph_shavite512_init( &x13bcd_4way_ctx.shavite );
+     simd_2way_init( &x13bcd_4way_ctx.simd, 512 );
+     init_echo( &x13bcd_4way_ctx.echo, 512 );
+     sm3_4way_init( &x13bcd_4way_ctx.sm3 );
+     hamsi512_4way_init( &x13bcd_4way_ctx.hamsi );
+     sph_fugue512_init( &x13bcd_4way_ctx.fugue );
+};   // NOTE(review): stray ';' after the function body
+
+void x13bcd_4way_hash( void *state, const void *input )   // chains the stages below over 4 lanes; writes 4 x 32 bytes to `state`
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     x13bcd_4way_ctx_holder ctx;
+     memcpy( &ctx, &x13bcd_4way_ctx, sizeof(x13bcd_4way_ctx) );   // start every stage from the global template contexts
+
+     // Blake: resume from the thread-local midstate, then absorb 16 more bytes
+     memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) );
+     blake512_4way( &ctx.blake, input + (64<<2), 16 );   // NOTE(review): arithmetic on const void* relies on a compiler extension
+
+//     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_close( &ctx.blake, vhash );
+
+     // Bmw
+     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_close( &ctx.bmw, vhash );
+
+     // Serial: split vhash into the four per-lane buffers
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // Groestl: one lane at a time, context re-initialised between lanes
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+
+     // Parallel 4way
+     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     // Skein
+     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_close( &ctx.skein, vhash );
+
+     // JH
+     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_close( &ctx.jh, vhash );
+
+     // Keccak
+     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_close( &ctx.keccak, vhash );
+
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );   // out of the 4x64 layout ...
+
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );   // ... and into the 4x32 layout passed to SM3
+
+     // SM3 parallel 32 bit
+     uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
+     memset( sm3_vhash, 0, sizeof sm3_vhash );   // zeroed before the SM3 close writes into it
+     uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));   // NOTE(review): sm3_hash0..3 are zeroed but never read in this function
+     memset( sm3_hash0, 0, sizeof sm3_hash0 );
+     uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
+     memset( sm3_hash1, 0, sizeof sm3_hash1 );
+     uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
+     memset( sm3_hash2, 0, sizeof sm3_hash2 );
+     uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
+     memset( sm3_hash3, 0, sizeof sm3_hash3 );
+
+     sm3_4way( &ctx.sm3, vhash, 64 );
+     sm3_4way_close( &ctx.sm3, sm3_vhash );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );   // per-lane buffers now carry the SM3 stage output
+
+/*     
+     // Luffa
+     intrlv_2x128( vhash, hash0, hash1, 512 );
+     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     dintrlv_2x128( hash0, hash1, vhash, 512 );
+     intrlv_2x128( vhash, hash2, hash3, 512 );
+     luffa_2way_init( &ctx.luffa, 512 );
+     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     dintrlv_2x128( hash2, hash3, vhash, 512 );
+*/
+     
+     // Cubehash: per lane, context restored from the global template between lanes
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+     memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
+     memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
+
+     // Shavite: per lane, context restored from the global template between lanes
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+
+     // Simd: two lanes per call, context re-initialised before the second pair
+     intrlv_2x128( vhash, hash0, hash1, 512 );
+     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_2x128( hash0, hash1, vhash, 512 );
+     intrlv_2x128( vhash, hash2, hash3, 512 );
+     simd_2way_init( &ctx.simd, 512 );
+     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_2x128( hash2, hash3, vhash, 512 );
+
+     // Echo: per lane, context restored from the global template between lanes
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+
+/*
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     // SM3 parallel 32 bit
+     uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
+     memset( sm3_vhash, 0, sizeof sm3_vhash );
+     uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
+     memset( sm3_hash0, 0, sizeof sm3_hash0 );
+     uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
+     memset( sm3_hash1, 0, sizeof sm3_hash1 );
+     uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
+     memset( sm3_hash2, 0, sizeof sm3_hash2 );
+     uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
+     memset( sm3_hash3, 0, sizeof sm3_hash3 );
+
+     sm3_4way( &ctx.sm3, vhash, 64 );
+     sm3_4way_close( &ctx.sm3, sm3_vhash );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
+*/
+
+     // Hamsi parallel 4x32x2
+     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_close( &ctx.hamsi, vhash );
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // Fugue serial: per lane, context restored from the global template between lanes
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+
+     memcpy( state,    hash0, 32 );   // first 32 bytes of each lane's result, packed back to back
+     memcpy( state+32, hash1, 32 );   // NOTE(review): arithmetic on void* relies on a compiler extension
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+}
+
+int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,   // nonce scan, 4 nonces per pass; hits are submitted in-loop
+                       uint64_t *hashes_done, struct thr_info *mythr )   // *hashes_done is written once, after the loop
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));   // four consecutive 8-word results, one per lane
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];   // nonce assigned to lane 0 of the current pass
+     const uint32_t first_nonce = pdata[19];
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned; 256-bit slot 9 of vdata, rewritten with fresh nonces each pass
+     int thr_id = mythr->id;  // thr_id arg is deprecated
+     const uint32_t Htarg = ptarget[7];
+     uint64_t htmax[] = {          0,        0xF,       0xFF,   // tier table: first m with Htarg <= htmax[m] picks masks[m]
+                               0xFFF,     0xFFFF, 0x10000000  };
+     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,   // bits of hash word 7 that must be zero to pass the pre-filter
+                          0xFFFFF000, 0xFFFF0000,          0  };
+
+     mm256_bswap32_intrlv80_4x64( vdata, pdata );   // presumably byte-swaps and 4x64-interleaves the 80-byte header; confirm
+
+     blake512_4way_init( &x13bcd_ctx_mid );   // midstate: Blake over the first 64 bytes, reused by x13bcd_4way_hash
+     blake512_4way( &x13bcd_ctx_mid, vdata, 64 );
+
+     for ( int m=0; m < 6; m++ )   // NOTE(review): if Htarg > 0x10000000 no tier matches and no nonce is scanned
+       if ( Htarg <= htmax[m] )
+       {
+         uint32_t mask = masks[m];
+         do
+         {
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );   // lanes 0..3 get n..n+3
+
+            x13bcd_4way_hash( hash, vdata );
+            pdata[19] = n;
+
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) )   // cheap pre-filter on word 7 of lane i
+            if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )   // nothing is submitted when opt_benchmark is set
+            {
+               pdata[19] = n+i;
+              submit_lane_solution( work, hash+(i<<3), mythr, i );
+            }
+            n += 4;
+         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+         break;   // only the first matching tier is scanned
+       }
+
+     *hashes_done = n - first_nonce + 1;
+     return 0;   // always 0: solutions are reported via submit_lane_solution, not the return value
+}
+
+#endif
--- a/algo/x13/x13bcd.c
+++ b/algo/x13/x13bcd.c
@@ -0,0 +1,258 @@
+#include "x13sm3-gate.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "algo/groestl/sph_groestl.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/sph_luffa.h"
+#include "algo/cubehash/sph_cubehash.h"
+#include "algo/simd/sph_simd.h"
+#include "algo/echo/sph_echo.h"
+#include "algo/hamsi/sph_hamsi.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/sm3/sph_sm3.h"
+
+//#include "algo/luffa/luffa_for_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/nist.h"
+#include "algo/blake/sse2/blake.c"
+#include "algo/bmw/sse2/bmw.c"
+#include "algo/keccak/sse2/keccak.c"
+#include "algo/skein/sse2/skein.c"
+#include "algo/jh/sse2/jh_sse2_opt64.h"
+
+#ifndef NO_AES_NI
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+  #include "algo/echo/aes_ni/hash_api.h"
+#endif
+
+typedef struct {   // hash contexts for the x13bcd stages that keep one
+#ifdef NO_AES_NI
+        sph_groestl512_context  groestl;   // sph variants when NO_AES_NI is defined
+        sph_echo512_context     echo;
+#else
+        hashState_echo          echo;      // otherwise the hashState_* variants
+        hashState_groestl       groestl;
+#endif
+//        hashState_luffa         luffa;
+        cubehashParam           cube;
+        sph_shavite512_context  shavite;
+        hashState_sd            simd;
+        sm3_ctx_t               sm3;
+        sph_hamsi512_context    hamsi;
+        sph_fugue512_context    fugue;
+} x13bcd_ctx_holder;
+
+x13bcd_ctx_holder x13bcd_ctx;   // filled by init_x13bcd_ctx(), copied by x13bcd_hash()
+
+void init_x13bcd_ctx()
+{
+   // Initialise every member of the global template that x13bcd_hash copies.
+   x13bcd_ctx_holder *tmpl = &x13bcd_ctx;
+#ifdef NO_AES_NI
+   sph_groestl512_init( &tmpl->groestl );
+   sph_echo512_init( &tmpl->echo );
+#else
+   init_echo( &tmpl->echo, 512 );
+   init_groestl( &tmpl->groestl, 64 );
+#endif
+   cubehashInit( &tmpl->cube, 512, 16, 32 );
+   sph_shavite512_init( &tmpl->shavite );
+   init_sd( &tmpl->simd, 512 );
+   sm3_init( &tmpl->sm3 );
+   sph_hamsi512_init( &tmpl->hamsi );
+   sph_fugue512_init( &tmpl->fugue );
+
+void x13bcd_hash(void *output, const void *input)
+{   // Scalar x13bcd chain; the first 32 bytes of the final digest go to output.
+	unsigned char hash[128] __attribute__ ((aligned (32)));
+
+        x13bcd_ctx_holder ctx;
+        memcpy(&ctx, &x13bcd_ctx, sizeof(x13bcd_ctx));   // private copy of the initialised template
+
+        unsigned char hashbuf[128];   // NOTE(review): hashbuf/hashptr/hashctA/hashctB are not
+        size_t hashptr;               // referenced directly below; presumably they are used by
+        sph_u64 hashctA;              // the stage macros (BLK_*, BMW_*, ...) - confirm against
+        sph_u64 hashctB;              // the included sse2 sources.
+
+        //---blake1---
+        
+        DECL_BLK;
+        BLK_I;
+        BLK_W;
+        BLK_C;
+
+        //---bmw2---
+
+        DECL_BMW;
+        BMW_I;
+        BMW_U;
+
+        #define M(x)    sph_dec64le_aligned(data + 8 * (x))
+        #define H(x)    (h[x])
+        #define dH(x)   (dh[x])
+
+        BMW_C;
+
+        #undef M
+        #undef H
+        #undef dH
+
+        //---groestl----
+
+#ifdef NO_AES_NI
+        sph_groestl512 (&ctx.groestl, hash, 64);
+        sph_groestl512_close(&ctx.groestl, hash);
+#else
+        update_and_final_groestl( &ctx.groestl, (char*)hash,
+                                  (const char*)hash, 512 );
+#endif
+
+        //---skein4---
+
+        DECL_SKN;
+        SKN_I;
+        SKN_U;
+        SKN_C;
+
+        //---jh5------
+
+        DECL_JH;
+        JH_H;
+
+        //---keccak6---
+
+        DECL_KEC;
+        KEC_I;
+        KEC_U;
+        KEC_C;
+
+        uint32_t sm3_hash[32] __attribute__ ((aligned (32)));   // sm3 runs here, ahead of cubehash
+        memset(sm3_hash, 0, sizeof sm3_hash);   // zeroed: cubehash reads 64 bytes of it below
+
+        sph_sm3(&ctx.sm3, hash, 64);
+        sph_sm3_close(&ctx.sm3, sm3_hash);
+
+        cubehashUpdateDigest( &ctx.cube, (byte*) hash,
+                              (const byte*)sm3_hash, 64 );
+
+/*
+        //--- luffa7
+        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
+                                (const BitSequence*)hash, 64 );
+
+        // 8 Cube
+        cubehashUpdateDigest( &ctx.cube, (byte*) hash,
+                              (const byte*)hash, 64 );
+*/
+
+        // 9 Shavite
+        sph_shavite512( &ctx.shavite, hash, 64);
+        sph_shavite512_close( &ctx.shavite, hash);
+
+        // 10 Simd
+        update_final_sd( &ctx.simd, (BitSequence *)hash,
+                         (const BitSequence *)hash, 512 );
+
+        //11---echo---
+#ifdef NO_AES_NI
+        sph_echo512(&ctx.echo, hash, 64);
+        sph_echo512_close(&ctx.echo, hash);
+#else
+        update_final_echo ( &ctx.echo, (BitSequence *)hash,
+                            (const BitSequence *)hash, 512 );
+#endif
+
+        /*
+        uint32_t sm3_hash[32] __attribute__ ((aligned (32)));
+        memset(sm3_hash, 0, sizeof sm3_hash);
+
+        sph_sm3(&ctx.sm3, hash, 64);
+        sph_sm3_close(&ctx.sm3, sm3_hash);
+
+        sph_hamsi512(&ctx.hamsi, sm3_hash, 64);
+*/
+
+        sph_hamsi512(&ctx.hamsi, hash, 64);   // hamsi hashes the echo output directly
+        sph_hamsi512_close(&ctx.hamsi, hash);
+
+        sph_fugue512(&ctx.fugue, hash, 64);   // final stage
+        sph_fugue512_close(&ctx.fugue, hash);
+
+        asm volatile ("emms");
+	memcpy(output, hash, 32);   // only the first 32 bytes are returned
+}
+
+// Scalar nonce scan for x13bcd: hashes successive nonces from pdata[19] and
+// returns true on the first hash that passes fulltest, 0 when none is found.
+int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19] - 1;   // pre-decremented: the loop increments first
+   const uint32_t first_nonce = pdata[19];
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t Htarg = ptarget[7];
+   uint32_t mask = 0;   // no prefilter unless a tighter mask fits Htarg
+
+   uint64_t htmax[] = {
+      0,
+      0xF,
+      0xFF,
+      0xFFF,
+      0xFFFF,
+      0x10000000
+   };
+   uint32_t masks[] = {
+      0xFFFFFFFF,
+      0xFFFFFFF0,
+      0xFFFFFF00,
+      0xFFFFF000,
+      0xFFFF0000,
+      0
+   };
+
+   // we need bigendian data...
+   swab32_array( endiandata, pdata, 20 );
+
+#ifdef DEBUG_ALGO
+   if (Htarg != 0)
+      printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
+   // Cheap prefilter on hash word 7. Targets above the last htmax entry
+   // keep mask 0 so they are still scanned, relying on fulltest alone.
+   for ( int m = 0; m < 6; m++ )
+      if ( Htarg <= htmax[m] ) { mask = masks[m]; break; }
+   do {
+      pdata[19] = ++n;
+      be32enc(&endiandata[19], n);
+      x13bcd_hash(hash64, endiandata);
+#ifndef DEBUG_ALGO
+      if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
+         *hashes_done = n - first_nonce + 1;
+         return true;
+      }
+#else
+      if (!(n % 0x1000) && !thr_id) printf(".");
+      if (!(hash64[7] & mask)) {
+         printf("[%d]",thr_id);
+         if (fulltest(hash64, ptarget)) {
+            work_set_target_ratio( work, hash64 );
+            *hashes_done = n - first_nonce + 1;
+            return true;
+         }
+      }
+#endif
+   } while (n < max_nonce && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   pdata[19] = n;
+   return 0;
+}
+
--- a/algo/x13/x13sm3-gate.c
+++ b/algo/x13/x13sm3-gate.c
@@ -16,3 +16,19 @@ bool register_x13sm3_algo( algo_gate_t* gate )
  return true;
 };

+bool register_x13bcd_algo( algo_gate_t* gate )
+{   // wire x13bcd into the gate; 4-way entry points when X13SM3_4WAY is set
+#if defined (X13SM3_4WAY)
+  init_x13bcd_4way_ctx();
+  gate->hash      = (void*)&x13bcd_4way_hash;
+  gate->scanhash  = (void*)&scanhash_x13bcd_4way;
+#else
+  init_x13bcd_ctx();
+  gate->hash      = (void*)&x13bcd_hash;
+  gate->scanhash  = (void*)&scanhash_x13bcd;
+#endif
+  gate->get_max64 = (void*)&get_max64_0x3ffff;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  return true;
+}
+
--- a/algo/x13/x13sm3-gate.h
+++ b/algo/x13/x13sm3-gate.h
@@ -10,23 +10,31 @@

 bool register_x13sm3_algo( algo_gate_t* gate );

+bool register_x13bcd_algo( algo_gate_t* gate );
+
 #if defined(X13SM3_4WAY)

 void x13sm3_4way_hash( void *state, const void *input );
-
 int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x13sm3_4way_ctx();

+void x13bcd_4way_hash( void *state, const void *input );
+int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+void init_x13bcd_4way_ctx();
+
 #endif

 void x13sm3_hash( void *state, const void *input );
-
 int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x13sm3_ctx();

+void x13bcd_hash( void *state, const void *input );
+int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr );
+void init_x13bcd_ctx();
+
 #endif

--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -62,3 +62,149 @@ bool register_x16s_algo( algo_gate_t* gate )
  return true;
 };

+////////////////
+//
+//   X16RT
+
+
+void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
+{   // timeHash receives sha256d of timeStamp with its low 7 bits cleared
+    const uint32_t maskedTime = timeStamp & 0xffffff80;   // kept unsigned
+    sha256d( (unsigned char*)timeHash, (const unsigned char*)( &maskedTime ),
+             sizeof( maskedTime ) );
+}
+
+// One upper-case hex digit per round, taken nibble by nibble from timeHash
+// starting at byte 7 (high nibble first) and moving towards byte 0.
+void x16rt_getAlgoString( const uint32_t *timeHash, char *output)
+{
+   static const char hex_digits[] = "0123456789ABCDEF";
+   const uint8_t *bytes = (const uint8_t*)timeHash;
+   uint8_t pos;
+
+   for ( pos = 0; pos < X16R_HASH_FUNC_COUNT; pos++ )
+   {
+      // 16 ascii hex chars, reversed
+      const uint8_t src = bytes[ (uint8_t)( ( 15 - pos ) >> 1 ) ];
+      const uint8_t digit = ( pos & 1 ) ? ( src & 0xF ) : ( src >> 4 );
+      output[ pos ] = hex_digits[ digit ];
+   }
+   output[ pos ] = '\0';
+}
+
+// Build the block header for a Veil stratum job: version, prevhash, ntime
+// and nbits come from sctx->job, and data[9..16] receives the sha256d of the
+// hex-assembled merkle roots, denomination records and a zeroed pofn hash.
+// Also advances the job's extranonce2 counter.
+void x16rt_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
+{
+   uchar merkle_tree[64] = { 0 };
+   size_t t;
+
+   algo_gate.gen_merkle_root( merkle_tree, sctx );
+   // Increment extranonce2
+   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
+
+   // Assemble block header
+   int i;
+
+   memset( g_work->data, 0, sizeof(g_work->data) );
+   g_work->data[0] = le32dec( sctx->job.version );
+
+   if ( have_stratum )
+      for ( i = 0; i < 8; i++ )
+         g_work->data[ 1+i ] = le32dec( (uint32_t*)sctx->job.prevhash + i );
+   else
+      for (i = 0; i < 8; i++)
+         g_work->data[ 8-i ] = le32dec( (uint32_t*)sctx->job.prevhash + i );
+
+   g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime );
+   g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits );
+   g_work->data[20] = 0x80000000;
+   g_work->data[31] = 0x00000280;
+
+   for ( i = 0; i < 8; i++ )
+      g_work->merkleroothash[7 - i] = be32dec((uint32_t *)merkle_tree + i);
+   for ( i = 0; i < 8; i++ )
+      g_work->witmerkleroothash[7 - i] = be32dec((uint32_t *)merkle_tree + i);
+   for ( i = 0; i < 8; i++ )
+      g_work->denom10[i] =    le32dec((uint32_t *)sctx->job.denom10 + i);
+   for ( i = 0; i < 8; i++ )
+      g_work->denom100[i] =   le32dec((uint32_t *)sctx->job.denom100 + i);
+   for ( i = 0; i < 8; i++ )
+      g_work->denom1000[i] =  le32dec((uint32_t *)sctx->job.denom1000 + i);
+   for ( i = 0; i < 8; i++ )
+      g_work->denom10000[i] = le32dec((uint32_t *)sctx->job.denom10000 + i);
+
+   uint32_t pofnhash[8];
+   memset(pofnhash, 0x00, 32);
+
+   char denom10_str      [ 2 * sizeof( g_work->denom10 )           + 1 ];
+   char denom100_str     [ 2 * sizeof( g_work->denom100 )          + 1 ];
+   char denom1000_str    [ 2 * sizeof( g_work->denom1000 )         + 1 ];
+   char denom10000_str   [ 2 * sizeof( g_work->denom10000 )        + 1 ];
+   char merkleroot_str   [ 2 * sizeof( g_work->merkleroothash )    + 1 ];
+   char witmerkleroot_str[ 2 * sizeof( g_work->witmerkleroothash ) + 1 ];
+   char pofn_str         [ 2 * sizeof( pofnhash )                  + 1 ];
+
+   cbin2hex( denom10_str,       (char*) g_work->denom10,           32 );
+   cbin2hex( denom100_str,      (char*) g_work->denom100,          32 );
+   cbin2hex( denom1000_str,     (char*) g_work->denom1000,         32 );
+   cbin2hex( denom10000_str,    (char*) g_work->denom10000,        32 );
+   cbin2hex( merkleroot_str,    (char*) g_work->merkleroothash,    32 );
+   cbin2hex( witmerkleroot_str, (char*) g_work->witmerkleroothash, 32 );
+   cbin2hex( pofn_str,          (char*) pofnhash,                  32 );
+
+   if ( true )
+   {
+       char data[ sizeof( merkleroot_str ) + sizeof( witmerkleroot_str )
+                  + sizeof( pofn_str ) + 4 * sizeof( denom10_str )
+                  + 2 + 4 * 16 ];   // each sizeof counts a NUL: room to spare
+       // Build the block header veildatahash in hex
+       snprintf( data, sizeof data, "%s%s%s%s%s%s%s%s%s%s%s%s",
+                       merkleroot_str, witmerkleroot_str, "04",
+                       "0a00000000000000", denom10_str,
+                       "6400000000000000", denom100_str,
+                       "e803000000000000", denom1000_str,
+                       "1027000000000000", denom10000_str, pofn_str );
+       // Convert the hex to binary
+       uint32_t test[100];
+       hex2bin( (unsigned char*)(&test), data, 257);
+       // Compute the sha256d of the binary
+       uint32_t _ALIGN(64) hash[8];
+       sha256d( (unsigned char*)hash, (unsigned char*)&(test), 257);
+       // assign the veildatahash in the blockheader
+       for ( i = 0; i < 8; i++ )
+           g_work->data[16 - i] = le32dec(hash + i);
+    }
+}
+
+bool register_x16rt_algo( algo_gate_t* gate )
+{   // wire x16rt into the gate; 4-way entry points when X16R_4WAY is set
+  gate->set_target = (void*)&alt_set_target;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+#if defined (X16R_4WAY)
+  gate->hash      = (void*)&x16rt_4way_hash;
+  gate->scanhash  = (void*)&scanhash_x16rt_4way;
+#else
+  gate->hash      = (void*)&x16rt_hash;
+  gate->scanhash  = (void*)&scanhash_x16rt;
+#endif
+  return true;
+}
+
+bool register_x16rt_veil_algo( algo_gate_t* gate )
+{   // same gate setup as x16rt plus the Veil block header builder
+  gate->build_extraheader = (void*)&x16rt_build_extraheader;
+  gate->set_target = (void*)&alt_set_target;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+#if defined (X16R_4WAY)
+  gate->hash      = (void*)&x16rt_4way_hash;
+  gate->scanhash  = (void*)&scanhash_x16rt_4way;
+#else
+  gate->hash      = (void*)&x16rt_hash;
+  gate->scanhash  = (void*)&scanhash_x16rt;
+#endif
+  return true;
+}
+
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -4,6 +4,7 @@
 #include "algo-gate-api.h"
 #include "simd-utils.h"
 #include <stdint.h>
+#include <unistd.h>

 #if defined(__AVX2__) && defined(__AES__)
  #define X16R_4WAY
@@ -30,11 +31,15 @@ enum x16r_Algo {
 };

 void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
-void x16r_getAlgoString( const uint8_t* prevblock, char *output );
-void x16s_getAlgoString( const uint8_t* prevblock, char *output );
+void x16r_getAlgoString( const uint8_t *prevblock, char *output );
+void x16s_getAlgoString( const uint8_t *prevblock, char *output );
+void x16rt_getAlgoString( const uint32_t *timeHash, char *output );
+
+void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash );

 bool register_x16r_algo( algo_gate_t* gate );
 bool register_x16s_algo( algo_gate_t* gate );
+bool register_x16rt_algo( algo_gate_t* gate );

 #if defined(X16R_4WAY)

@@ -42,11 +47,18 @@ void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

+void x16rt_4way_hash( void *state, const void *input );
+int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
 #endif

 void x16r_hash( void *state, const void *input );
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

+void x16rt_hash( void *state, const void *input );
+int scanhash_x16rt( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );
 #endif

--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -0,0 +1,353 @@
+#include "x16r-gate.h"
+
+#if defined (X16R_4WAY)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/simd-hash-2way.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/shabal-hash-4way.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include "algo/sha/sha2-hash-4way.h"
+
+static __thread uint32_t s_ntime = UINT32_MAX;   // ntime hashOrder was last built for (per thread)
+static __thread bool s_implemented = false;      // set once scanhash has built a hash order
+static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };   // one hex digit per round + NUL
+
+union _x16rt_4way_context_overlay   // members share storage; each round initialises the one it uses
+{
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_echo          echo;
+    hashState_groestl       groestl;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    luffa_2way_context      luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    simd_2way_context       simd;
+    hamsi512_4way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_4way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_4way_context     sha512;
+};
+typedef union _x16rt_4way_context_overlay x16rt_4way_context_overlay;
+
+void x16rt_4way_hash( void* output, const void* input )
+{   // Runs the 16 rounds named by hashOrder on four lanes; 32 bytes per lane go to output.
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
+   x16rt_4way_context_overlay ctx;
+   void *in0 = (void*) hash0;   // in0-3 alias hash0-3: every round hashes in place
+   void *in1 = (void*) hash1;
+   void *in2 = (void*) hash2;
+   void *in3 = (void*) hash3;
+   int size = 80;   // bytes hashed by the first round; set to 64 afterwards
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
+ 
+/*
+   void *in = (void*) input;
+   uint32_t *in32 = (uint32_t*) hash0;
+   uint32_t ntime = in32[17];
+   if ( s_ntime == UINT32_MAX )
+   {
+      uint32_t _ALIGN(64) timeHash[8];
+      x16rt_getTimeHash(ntime, &timeHash);
+      x16rt_getAlgoString(&timeHash[0], hashOrder);
+   }
+*/
+
+   // The first round (i == 0) consumes the 80 byte header: blake, bmw,
+   // skein, jh and keccak read it straight from the interleaved input,
+   // every other function uses the deinterleaved copies made above.
+   // Each round leaves its result deinterleaved in hash0-3, and in0-3
+   // alias hash0-3, so later rounds hash the previous output in place;
+   // size drops from 80 to 64 after the first round.
+   // hashOrder holds one hex digit per round selecting the function; it
+   // is filled in by scanhash_x16rt_4way.
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';   // hex digit -> 0..15
+
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_4way_init( &ctx.blake );
+            if ( i == 0 )
+               blake512_4way( &ctx.blake, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );   // size<<3: byte count times 8
+               blake512_4way( &ctx.blake, vhash, size );
+            }
+            blake512_4way_close( &ctx.blake, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case BMW:
+            bmw512_4way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_4way( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               bmw512_4way( &ctx.bmw, vhash, size );
+            }
+            bmw512_4way_close( &ctx.bmw, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case GROESTL:   // one lane at a time, context re-initialised per lane
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+         break;
+         case SKEIN:
+            skein512_4way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_4way( &ctx.skein, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               skein512_4way( &ctx.skein, vhash, size );
+            }
+            skein512_4way_close( &ctx.skein, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case JH:
+            jh512_4way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_4way( &ctx.jh, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               jh512_4way( &ctx.jh, vhash, size );
+            }
+            jh512_4way_close( &ctx.jh, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case KECCAK:
+            keccak512_4way_init( &ctx.keccak );
+            if ( i == 0 )
+               keccak512_4way( &ctx.keccak, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               keccak512_4way( &ctx.keccak, vhash, size );
+            }
+            keccak512_4way_close( &ctx.keccak, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case LUFFA:   // two lanes per pass: lanes 0/1, then lanes 2/3
+            intrlv_2x128( vhash, in0, in1, size<<3 );
+            luffa_2way_init( &ctx.luffa, 512 );
+            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
+            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            intrlv_2x128( vhash, in2, in3, size<<3 );
+            luffa_2way_init( &ctx.luffa, 512 );
+            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
+            dintrlv_2x128( hash2, hash3, vhash, 512 );
+         break;
+         case CUBEHASH:
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
+                                  (const byte*)in0, size );
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
+                                  (const byte*)in1, size );
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
+                                  (const byte*)in2, size );
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
+                                  (const byte*)in3, size );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+         break;
+         case SIMD:   // two lanes per pass; note the length argument is size<<3 here
+            intrlv_2x128( vhash, in0, in1, size<<3 );
+            simd_2way_init( &ctx.simd, 512 );
+            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            intrlv_2x128( vhash, in2, in3, size<<3 );
+            simd_2way_init( &ctx.simd, 512 );
+            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_2x128( hash2, hash3, vhash, 512 );
+         break;
+         case ECHO:
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+         break;
+         case HAMSI:   // always re-interleaves, even in the first round
+             intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             hamsi512_4way_init( &ctx.hamsi );
+             hamsi512_4way( &ctx.hamsi, vhash, size );
+             hamsi512_4way_close( &ctx.hamsi, vhash );
+             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+         break;
+         case SHABAL:   // the only round using 4x32 interleaving
+             intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
+             shabal512_4way_init( &ctx.shabal );
+             shabal512_4way( &ctx.shabal, vhash, size );
+             shabal512_4way_close( &ctx.shabal, vhash );
+             dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+         break;
+         case SHA_512:
+             intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             sha512_4way_init( &ctx.sha512 );
+             sha512_4way( &ctx.sha512, vhash, size );
+             sha512_4way_close( &ctx.sha512, vhash );
+             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+      }
+      size = 64;   // every round after the first hashes a 64 byte result
+   }
+   memcpy( output,    hash0, 32 );   // lane k's first 32 bytes land at output + 32*k
+   memcpy( output+32, hash1, 32 );
+   memcpy( output+64, hash2, 32 );
+   memcpy( output+96, hash3, 32 );
+}
+
+// Scan nonces four at a time for x16rt; the hash order is rebuilt whenever
+// the header's ntime changes. Passing lanes are submitted; returns 0.
+int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t _ALIGN(64) timeHash[4*8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   uint32_t ntime = swab32( pdata[17] );
+   if ( s_ntime != ntime )   // new ntime: derive this thread's hash order again
+   {
+      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getAlgoString( &timeHash[0], hashOrder );
+      s_ntime = ntime;
+      s_implemented = true;
+      if ( opt_debug && !thr_id )
+          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
+                               hashOrder, ntime, timeHash[0] );   // %08x takes a word, not the array
+   }
+   if ( !s_implemented )
+   {
+      applog( LOG_WARNING, "s not implemented");
+      sleep(1);
+      return 0;
+   }
+
+   if ( opt_benchmark ) ptarget[7] = 0x0cff;
+   uint64_t *edata = (uint64_t*)endiandata;
+   intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );   // same header in all 4 lanes
+
+   do
+   {
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+
+      x16rt_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 4; i++ )  if ( (hash+(i<<3))[7] <= Htarg )
+      if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = n+i;
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 4;
+   } while ( (  n < max_nonce ) && !(*restart) );
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -0,0 +1,239 @@
+#include "x16r-gate.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/bmw/sph_bmw.h"
+#include "algo/groestl/sph_groestl.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/luffa_for_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/nist.h"
+#include "algo/echo/sph_echo.h"
+#include "algo/hamsi/sph_hamsi.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/sph_shabal.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include <openssl/sha.h>
+#if defined(__AES__)
+  #include "algo/echo/aes_ni/hash_api.h"
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+#endif
+
+static __thread uint32_t s_ntime = UINT32_MAX;   // ntime the current hashOrder was generated for
+static __thread bool s_implemented = false;      // set to true once hashOrder has been generated
+static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };   // per-thread algorithm selectors, one char per round
+
+union _x16rt_context_overlay   // one hashing context per algorithm, all sharing the same storage
+{
+#if defined(__AES__)
+        hashState_echo          echo;      // used with init_echo / update_final_echo in x16rt_hash
+        hashState_groestl       groestl;   // used with init_groestl / update_and_final_groestl
+#else
+        sph_groestl512_context   groestl;  // sph_* variants used when __AES__ is not defined
+        sph_echo512_context      echo;
+#endif
+        sph_blake512_context    blake;
+        sph_bmw512_context      bmw;
+        sph_skein512_context    skein;
+        sph_jh512_context       jh;
+        sph_keccak512_context   keccak;
+        hashState_luffa         luffa;
+        cubehashParam           cube;
+        sph_shavite512_context  shavite;
+        hashState_sd            simd;
+        sph_hamsi512_context    hamsi;
+        sph_fugue512_context    fugue;
+        sph_shabal512_context   shabal;
+        sph_whirlpool_context   whirlpool;
+        SHA512_CTX              sha512;
+};
+typedef union _x16rt_context_overlay x16rt_context_overlay;   // x16rt_hash declares one of these as a local
+
+void x16rt_hash( void* output, const void* input )
+{
+   uint32_t _ALIGN(128) hash[16];
+   x16rt_context_overlay ctx;
+   int size = 80;   // byte length of the first round's input
+   void *in = (void*) input;
+
+/*
+   Chained X16RT hash of one 80-byte input.
+   Runs 16 rounds; round i uses the algorithm selected by hashOrder[i],
+   where '0'..'9' map to 0..9 and 'A' and above map to 10 upward.
+   The first round reads 80 bytes from input; each later round reads the
+   64 bytes of hash[] left by the round before it.  The first 32 bytes
+   of the final hash[] are copied to output.
+   hashOrder is not computed here: this function only reads it, and in
+   this file it is filled in by scanhash_x16rt before hashing starts.
+   A round whose selector matches no case leaves hash[] unchanged.
+*/
+   
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            sph_blake512_init( &ctx.blake );
+            sph_blake512( &ctx.blake, in, size );
+            sph_blake512_close( &ctx.blake, hash );
+         break;
+         case BMW:
+            sph_bmw512_init( &ctx.bmw );
+            sph_bmw512(&ctx.bmw, in, size);
+            sph_bmw512_close(&ctx.bmw, hash);
+         break;
+         case GROESTL:
+#if defined(__AES__)
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash,
+                                      (const char*)in, size<<3 );   // length given as size<<3, presumably bits - TODO confirm
+#else
+            sph_groestl512_init( &ctx.groestl );
+            sph_groestl512( &ctx.groestl, in, size );
+            sph_groestl512_close(&ctx.groestl, hash);
+#endif
+         break;
+         case SKEIN:
+            sph_skein512_init( &ctx.skein );
+            sph_skein512( &ctx.skein, in, size );
+            sph_skein512_close( &ctx.skein, hash );
+         break;
+         case JH:
+            sph_jh512_init( &ctx.jh );
+            sph_jh512(&ctx.jh, in, size );
+            sph_jh512_close(&ctx.jh, hash );
+         break;
+         case KECCAK:
+            sph_keccak512_init( &ctx.keccak );
+            sph_keccak512( &ctx.keccak, in, size );
+            sph_keccak512_close( &ctx.keccak, hash );
+         break;
+         case LUFFA:
+            init_luffa( &ctx.luffa, 512 );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
+                                    (const BitSequence*)in, size );
+         break;
+         case CUBEHASH:
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
+                                  (const byte*)in, size );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in, size );
+            sph_shavite512_close( &ctx.shavite, hash );
+         break;
+         case SIMD:
+             init_sd( &ctx.simd, 512 );
+             update_final_sd( &ctx.simd, (BitSequence *)hash,
+                              (const BitSequence*)in, size<<3 );
+         break;
+         case ECHO:
+#if defined(__AES__)
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash,
+                                (const BitSequence*)in, size<<3 );
+#else
+             sph_echo512_init( &ctx.echo );
+             sph_echo512( &ctx.echo, in, size );
+             sph_echo512_close( &ctx.echo, hash );
+#endif
+         break;
+         case HAMSI:
+             sph_hamsi512_init( &ctx.hamsi );
+             sph_hamsi512( &ctx.hamsi, in, size );
+             sph_hamsi512_close( &ctx.hamsi, hash );
+         break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in, size );
+             sph_fugue512_close( &ctx.fugue, hash );
+         break;
+         case SHABAL:
+             sph_shabal512_init( &ctx.shabal );
+             sph_shabal512( &ctx.shabal, in, size );
+             sph_shabal512_close( &ctx.shabal, hash );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash );
+         break;
+         case SHA_512:
+             SHA512_Init( &ctx.sha512 );
+             SHA512_Update( &ctx.sha512, in, size );
+             SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
+         break;
+      }
+      in = (void*) hash;   // the next round consumes this round's output
+      size = 64;           // every round after the first hashes 64 bytes
+   }
+   memcpy(output, hash, 32);   // only the first 32 bytes of the last round are returned
+}
+
+int scanhash_x16rt( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr )
+{  // Hashes nonces from work->data[19] upward, submitting hits; always returns 0.
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t _ALIGN(64) timeHash[8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   // Byte-swap the 20-word block header, one 128-bit vector (4 words) at a time.
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   // Regenerate this thread's hashOrder whenever the header's ntime word changes.
+   uint32_t ntime = swab32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getAlgoString( &timeHash[0], hashOrder );
+      s_ntime = ntime;
+      s_implemented = true;
+      if ( opt_debug && !thr_id )
+          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
+                               hashOrder, ntime, timeHash[0] );   // first word; the array itself is not a valid %08x argument
+   }
+   if ( !s_implemented )
+   {
+      applog( LOG_WARNING, "s not implemented");
+      sleep(1);
+      return 0;
+   }
+   // Benchmark mode installs a fixed target; the !opt_benchmark test below blocks submission.
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+   // Test one nonce per iteration until max_nonce is reached or a restart is flagged.
+   do
+   {
+      be32enc( &endiandata[19], nonce );
+      x16rt_hash( hash32, endiandata );
+
+      if ( hash32[7] <= Htarg )   // cheap check on the top word before the full comparison
+      if (fulltest( hash32, ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = nonce;
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;   // NOTE(review): nonce is already one past the last tested value, so +1 looks like an overcount - confirm
+   return 0;
+}
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
@@ -69,7 +69,7 @@ void sonoa_4way_hash( void *state, const void *input )
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -80,7 +80,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -134,13 +134,13 @@ void sonoa_4way_hash( void *state, const void *input )

 // 2

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -151,7 +151,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -203,7 +203,7 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
@@ -215,7 +215,7 @@ void sonoa_4way_hash( void *state, const void *input )
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -226,7 +226,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -278,13 +278,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -300,13 +300,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );

 // 4
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -317,7 +317,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -369,13 +369,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -390,7 +390,7 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
@@ -402,7 +402,7 @@ void sonoa_4way_hash( void *state, const void *input )
     hamsi512_4way( &ctx.hamsi, vhashB, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -438,7 +438,7 @@ void sonoa_4way_hash( void *state, const void *input )
     shabal512_4way( &ctx.shabal, vhashB, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );

     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -449,7 +449,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -501,13 +501,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -522,13 +522,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );

     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -545,13 +545,13 @@ void sonoa_4way_hash( void *state, const void *input )

 // 6

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -562,7 +562,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -614,13 +614,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -635,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );

     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -656,13 +656,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, 64 );
     sha512_4way_close( &ctx.sha512, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -679,13 +679,13 @@ void sonoa_4way_hash( void *state, const void *input )

 // 7

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -696,7 +696,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -748,13 +748,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -769,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );

     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -790,7 +790,7 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );

-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, 64 );
@@ -806,7 +806,7 @@ void sonoa_4way_hash( void *state, const void *input )
 int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
 	            uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
     uint32_t *hash7 = &(hash[7<<2]);
@@ -816,7 +816,7 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
     const uint32_t first_nonce = pdata[19];
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -68,7 +68,7 @@ void x17_4way_hash( void *state, const void *input )
     bmw512_4way_close( &ctx.bmw, vhash );

     // Serialize
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     // 3 Groestl
     init_groestl( &ctx.groestl, 64 );
@@ -81,7 +81,7 @@ void x17_4way_hash( void *state, const void *input )
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

     // Parallellize
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     // 4 Skein parallel 4 way 64 bit 
     skein512_4way_init( &ctx.skein );
@@ -142,13 +142,13 @@ void x17_4way_hash( void *state, const void *input )
                       (const BitSequence *) hash3, 512 );

     // 12 Hamsi parallel 4 way 64 bit
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );

-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

     // 13 Fugue serial
     sph_fugue512_init( &ctx.fugue );
@@ -165,13 +165,13 @@ void x17_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );

     // 14 Shabal, parallel 4 way 32 bit
-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
       
     // 15 Whirlpool serial
     sph_whirlpool_init( &ctx.whirlpool );
@@ -188,7 +188,7 @@ void x17_4way_hash( void *state, const void *input )
     sph_whirlpool_close( &ctx.whirlpool, hash3 );

     // 16 SHA512 parallel 64 bit 
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, 64 );
@@ -205,7 +205,7 @@ void x17_4way_hash( void *state, const void *input )
 int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
     uint32_t *hash7 = &(hash[7<<2]);
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -332,7 +332,7 @@ void xevan_4way_hash( void *output, const void *input )
 int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[7<<2]);
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -399,15 +399,15 @@ int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
                be32enc(&endiandata[k], pdata[k]);

        do {
-                be32enc(&endiandata[19], n);
-                yescrypt_hash((char*) endiandata, (char*) vhash, 80);
-                if (vhash[7] < Htarg && fulltest(vhash, ptarget)) {
-                        work_set_target_ratio( work, vhash );
-                        *hashes_done = n - first_nonce + 1;
-                        pdata[19] = n;
-                        return true;
-                }
-                n++;
+           be32enc(&endiandata[19], n);
+           yescrypt_hash((char*) endiandata, (char*) vhash, 80);
+           if (vhash[7] < Htarg && fulltest(vhash, ptarget ) 
+               && !opt_benchmark )
+           {
+               pdata[19] = n;
+               submit_solution( work, vhash, mythr );
+           }
+           n++;
        } while (n < max_nonce && !work_restart[thr_id].restart);

        *hashes_done = n - first_nonce + 1;
--- a/algo/yespower/yespower.c
+++ b/algo/yespower/yespower.c
@@ -53,15 +53,15 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
        for (int k = 0; k < 19; k++)
                be32enc(&endiandata[k], pdata[k]);
        do {
-                be32enc(&endiandata[19], n);
-                yespower_hash((char*) endiandata, (char*) vhash, 80);
-                if (vhash[7] < Htarg && fulltest(vhash, ptarget)) {
-                        work_set_target_ratio( work, vhash );
-                        *hashes_done = n - first_nonce + 1;
-                        pdata[19] = n;
-                        return true;
-                }
-                n++;
+           be32enc(&endiandata[19], n);
+           yespower_hash((char*) endiandata, (char*) vhash, 80);
+           if ( vhash[7] < Htarg && fulltest( vhash, ptarget )
+              && !opt_benchmark )
+           {
+               pdata[19] = n;
+               submit_solution( work, vhash, mythr );
+            }
+            n++;
        } while (n < max_nonce && !work_restart[thr_id].restart);

        *hashes_done = n - first_nonce + 1;