v3.21.3 Unreleased

2025-09-17 23:44:27 +00:00 · 2023-03-13 03:20:13 -04:00
parent b339450898
commit c6bc9d67fb
49 changed files with 1126 additions and 1111 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -264,6 +264,8 @@ cpuminer_SOURCES = \
  algo/x16/x16r-4way.c \
  algo/x16/x16rv2.c \
  algo/x16/x16rv2-4way.c \
+  algo/x16/x16rt.c \
+  algo/x16/x16rt-4way.c \
  algo/x16/hex.c \
  algo/x16/x21s-4way.c \
  algo/x16/x21s.c \
--- a/10
+++ b/10
@@ -65,7 +65,11 @@ If not what makes it happen or not happen?
 Change Log
 ----------

-v3.22.3
+v3.21.3.1 UNRELEASED
+
+Revert to 3.21.2
+
+v3.21.3 CANCELLED

 #392 #379 #389 Fixed misaligned address segfault solo mining.
 #392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
@@ -74,10 +78,10 @@ v3.22.3
     Windows binaries no longer support CPU groups,
     Windows binaries support CPUs with up to 64 threads.
 Midstate prehash is now centralized, done only once instead of by every thread
-for selected algos. 
+for selected algos.
 Small optimizations to serialized vectoring.

-v3.22.2
+v3.21.2 

 Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
 Fixed a couple of compiler warnings with gcc-12.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -253,7 +253,6 @@ void init_algo_gate( algo_gate_t* gate )
   gate->miner_thread_init       = (void*)&return_true;
   gate->scanhash                = (void*)&scanhash_generic;
   gate->hash                    = (void*)&null_hash;
-   gate->prehash                 = (void*)&return_true;
   gate->get_new_work            = (void*)&std_get_new_work;
   gate->work_decode             = (void*)&std_le_work_decode;
   gate->decode_extra_data       = (void*)&do_nothing;
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -119,7 +119,7 @@ typedef struct
 // to be registered with the gate. 
 int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );

-int ( *hash )     ( void*, const void*, const int );
+int ( *hash )     ( void*, const void*, int );

 //optional, safe to use default in most cases

@@ -127,9 +127,6 @@ int ( *hash )     ( void*, const void*, const int );
 // other initialization specific to miner threads.
 bool ( *miner_thread_init )     ( int );

-// Perform prehash after receiving new work
-int ( *prehash )                ( struct work* );
-
 // Get thread local copy of blockheader with unique nonce.
 void ( *get_new_work )          ( struct work*, struct work*, int, uint32_t* );

--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -1,6 +1,5 @@
 #include "blake2s-gate.h"
 #include "blake2s-hash-4way.h"
-//#include "sph-blake2s.h"
 #include <string.h>
 #include <stdint.h>

@@ -8,43 +7,6 @@

 static __thread blake2s_16way_state blake2s_16w_ctx;

-/*
-static blake2s_16way_state blake2s_16w_ctx;
-static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64)));
-*/
-/*
-int blake2s_16way_prehash( struct work *work )
-{
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, work->data );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
-   intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-   intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata, 640 );
-   blake2s_16w_ctx.t[0] = 64;
-   return 1;
-}
-*/
-/*
-int blake2s_16way_prehash( struct work *work )
-{
-   mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data );
-   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
-   blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 );
-   return 1;
-}
-*/
-
 void blake2s_16way_hash( void *output, const void *input )
 {
   blake2s_16way_state ctx;
@@ -68,40 +30,10 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   int thr_id = mythr->id;  

-/*   
-//   pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 );
-//     casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 );
-//     casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 );
-//     casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 );
-       
-//   pthread_rwlock_unlock( &g_work_lock );
-*/
-/*
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, pdata );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
-   intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-   intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata, 640 );
-   blake2s_16w_ctx.t[0] = 64;
-*/
-   
   mm512_bswap32_intrlv80_16x32( vdata, pdata );
   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
   blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );

-
   do {
      *noncev = mm512_bswap_32( _mm512_set_epi32(
 	                  n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
@@ -131,36 +63,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,

 static __thread blake2s_8way_state blake2s_8w_ctx;

-/*
-static blake2s_8way_state blake2s_8w_ctx;
-static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32)));
-
-int blake2s_8way_prehash( struct work *work )
-{
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, work->data );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
-
-   for ( int i = 0; i < 8; i++ )
-      casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
-
-   casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] );
-   casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] );
-   casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] );
-
-//   intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-//                                  ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-//   intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata,
-//                                    edata, edata, edata, edata, 640 );
-   blake2s_8w_ctx.t[0] = 64;
-}
-*/
-
 void blake2s_8way_hash( void *output, const void *input )
 {
   blake2s_8way_state ctx;
@@ -184,41 +86,10 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   int thr_id = mythr->id; 

-/*   
-//   pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 );
-//   pthread_rwlock_unlock( &g_work_lock );
-*/
-/*
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, pdata );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
-   for ( int i = 0; i < 8; i++ )
-      casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
-
-   casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] );
-   casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] );
-   casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] );
-
-
-//  intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-//                                  ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-//   intrlv_8x32( vdata, edata, edata, edata, edata,
-//                                    edata, edata, edata, edata, 640 );
-
-   blake2s_8w_ctx.t[0] = 64;
-*/
-   
   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
   blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );

-
   do {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                                  n+3, n+2, n+1, n ) );
@@ -246,25 +117,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
 #elif defined(BLAKE2S_4WAY)

 static __thread blake2s_4way_state blake2s_4w_ctx;
-/*
-static blake2s_4way_state blake2s_4w_ctx;
-static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32)));

-int blake2s_4way_prehash( struct work *work )
-{
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, work->data );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
-   intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-   intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 );
-   blake2s_4w_ctx.t[0] = 64;
-}
-*/
 void blake2s_4way_hash( void *output, const void *input )
 {
   blake2s_4way_state ctx;
@@ -287,15 +140,11 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
   int thr_id = mythr->id; 
-/*
-   pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( vdata, blake2s_4way_vdata, sizeof vdata );
-   pthread_rwlock_unlock( &g_work_lock );
-*/
+
   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
-   
+
   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -5,15 +5,13 @@ bool register_blake2s_algo( algo_gate_t* gate )
 #if defined(BLAKE2S_16WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_16way;
  gate->hash      = (void*)&blake2s_16way_hash;
-//  gate->prehash   = (void*)&blake2s_16way_prehash;
 #elif defined(BLAKE2S_8WAY)
+//#if defined(BLAKE2S_8WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_8way;
  gate->hash      = (void*)&blake2s_8way_hash;
-//  gate->prehash   = (void*)&blake2s_8way_prehash;
 #elif defined(BLAKE2S_4WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_4way;
  gate->hash      = (void*)&blake2s_4way_hash;
-//  gate->prehash   = (void*)&blake2s_4way_prehash;
 #else
  gate->scanhash  = (void*)&scanhash_blake2s;
  gate->hash      = (void*)&blake2s_hash;
--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -23,22 +23,18 @@ bool register_blake2s_algo( algo_gate_t* gate );
 void blake2s_16way_hash( void *state, const void *input );
 int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int blake2s_16way_prehash( struct work * );

 #elif defined (BLAKE2S_8WAY)

 void blake2s_8way_hash( void *state, const void *input );
 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int blake2s_8way_prehash( struct work * );

 #elif defined (BLAKE2S_4WAY)

 void blake2s_4way_hash( void *state, const void *input );
 int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int blake2s_4way_prehash( struct work * );
-
 #else

 void blake2s_hash( void *state, const void *input );
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )

 #define G4W( sigma0, sigma1, a, b, c, d ) \
 do { \
-   const uint8_t s0 = sigma0; \
-   const uint8_t s1 = sigma1; \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
   d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
   c = _mm_add_epi32( c, d ); \
@@ -120,7 +120,7 @@ do { \

 #define ROUND4W(r)  \
 do { \
-   const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
   G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
   G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
   G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -317,8 +317,8 @@ do { \

 #define G8W( sigma0, sigma1, a, b, c, d) \
 do { \
-   const uint8_t s0 = sigma0; \
-   const uint8_t s1 = sigma1; \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi32( c, d ); \
@@ -331,7 +331,7 @@ do { \

 #define ROUND8W(r)  \
 do { \
-   const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
   G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
   G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
   G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -529,8 +529,8 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )

 #define G16W( sigma0, sigma1, a, b, c, d) \
 do { \
-   const uint8_t s0 = sigma0; \
-   const uint8_t s1 = sigma1; \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
   c = _mm512_add_epi32( c, d ); \
@@ -543,7 +543,7 @@ do { \

 #define ROUND16W(r)  \
 do { \
-   const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
   G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
   G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
   G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -20,7 +20,6 @@

 #include <stddef.h>
 #include <stdint.h>
-//#include "sph-blake2s.h"

 #if defined(_MSC_VER)
 #include <inttypes.h>
@@ -34,7 +33,7 @@
 #if defined(__cplusplus)
 extern "C" {
 #endif
-/*
+
 enum blake2s_constant
 {
   BLAKE2S_BLOCKBYTES = 64,
@@ -43,13 +42,6 @@ enum blake2s_constant
   BLAKE2S_SALTBYTES  = 8,
   BLAKE2S_PERSONALBYTES = 8
 };
-*/
-
-#define BLAKE2S_BLOCKBYTES  64
-#define BLAKE2S_OUTBYTES    32
-#define BLAKE2S_KEYBYTES    32
-#define BLAKE2S_SALTBYTES   8
-#define BLAKE2S_PERSONALBYTES  8

 #pragma pack(push, 1)
 typedef struct __blake2s_nway_param
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -8,6 +8,8 @@
 #include "sph-blake2s.h"

 static __thread blake2s_state blake2s_ctx;
+//static __thread blake2s_state s_ctx;
+#define MIDLEN 76

 void blake2s_hash( void *output, const void *input )
 {
@@ -17,27 +19,37 @@ void blake2s_hash( void *output, const void *input )
   memcpy( &ctx, &blake2s_ctx, sizeof ctx );
   blake2s_update( &ctx, input+64, 16 );
 
+//	blake2s_init(&ctx, BLAKE2S_OUTBYTES);
+//	blake2s_update(&ctx, input, 80);
 	blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );

 	memcpy(output, hash, 32);
 }
-
+/*
+static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
+{
+	s_ctx.buflen = MIDLEN;
+	memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
+	blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
+	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
+}
+*/
 int scanhash_blake2s( struct work *work,
 	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;

 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
-   int thr_id = mythr->id;  
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];

 	uint32_t n = first_nonce;

-   swab32_array( endiandata, pdata, 20 );
+        swab32_array( endiandata, pdata, 20 );

 	// midstate
 	blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
@@ -46,12 +58,11 @@ int scanhash_blake2s( struct work *work,
 	do {
 		be32enc(&endiandata[19], n);
 		blake2s_hash( hash64, endiandata );
-      if (hash64[7] <= Htarg )
-      if ( fulltest(hash64, ptarget) && !opt_benchmark )
-      {
-         pdata[19] = n;
-         submit_solution( work, hash64, mythr );
-      }
+		if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
+			pdata[19] = n;
+			return true;
+		}
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -17,7 +17,6 @@

 #include "algo/sha/sph_types.h"
 #include "sph-blake2s.h"
-#include "simd-utils.h"

 static const uint32_t blake2s_IV[8] =
 {
@@ -226,71 +225,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
 	v[13] = S->t[1] ^ blake2s_IV[5];
 	v[14] = S->f[0] ^ blake2s_IV[6];
 	v[15] = S->f[1] ^ blake2s_IV[7];
-
-#if 0    
-//#if defined(__SSE2__) // always true
-
-The only application for this is to do a prehash for the blake2s algorithm.
-SSE2 also supports 4 way parallel hashing so that is preferred in most cases.
-Testing has found that using this serial SIMD code for prehash is slower than
-doing a parallel hash. A parallel hash has more instructions and uses more
-data. The serial hash uses fewer instructions and data and only needs to
-interleave the final hash into parallel streams. This has shown negligible
-improvement on other algos, notably blake256 which is almost identical.
-Considering the low frequency of prehash no statistically valid change
-was expected. It was simply better on paper.
-
-Furthermore, simply defining this macro has an additional negative effect on
-blake2s as a whole. There are no references to this macro, blake2s-4way does
-not include it in any header files, it's just another unused macro which should
-have no effect beyond the preprocessor. But just being visible to the compiler
-changes things in a dramatic way.
-
-These 2 things combined reduced the hash rate for blake2s by more than 5% when
-using serial SIMD for the blake2s prehash over 16way parallel prehash.
-16way parallel hashing was used in the high frequency nonce loop in both cases.
-Comsidering the prehash represents 50% of the algorithm and is done once vs
-the high frequency second half that is done mega, maybe giga, times more it's
-hard to imagine that big of an effect in either direction.
-
-#define ROUND( r ) \
-{ \
-   __m128i *V = (__m128i*)v; \
-   const uint8_t *sigma = blake2s_sigma[r]; \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                       _mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \
-                                      m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \
-   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                        _mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \
-                                       m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \
-   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
-   V[3] = mm128_shufll_32( V[3] ); \
-   V[2] = mm128_swap_64( V[2] ); \
-   V[1] = mm128_shuflr_32( V[1] ); \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                        _mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \
-                                       m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \
-   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                        _mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \
-                                       m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \
-   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
-   V[3] = mm128_shuflr_32( V[3] ); \
-   V[2] = mm128_swap_64( V[2] ); \
-   V[1] = mm128_shufll_32( V[1] ); \
-}
-
-#else
-
 #define G(r,i,a,b,c,d) \
 	do { \
 		a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -313,10 +247,7 @@ hard to imagine that big of an effect in either direction.
 		G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
 		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
 	} while(0)
-
-#endif
-
-   ROUND( 0 );
+	ROUND( 0 );
 	ROUND( 1 );
 	ROUND( 2 );
 	ROUND( 3 );
--- a/algo/blake/sph-blake2s.h
+++ b/algo/blake/sph-blake2s.h
@@ -91,7 +91,6 @@ static inline void secure_zero_memory(void *v, size_t n)
 extern "C" {
 #endif

-/*   
 	enum blake2s_constant
 	{
 		BLAKE2S_BLOCKBYTES = 64,
@@ -100,13 +99,6 @@ extern "C" {
 		BLAKE2S_SALTBYTES  = 8,
 		BLAKE2S_PERSONALBYTES = 8
 	};
-*/
-
-#define BLAKE2S_BLOCKBYTES  64
-#define BLAKE2S_OUTBYTES    32
-#define BLAKE2S_KEYBYTES    32
-#define BLAKE2S_SALTBYTES   8
-#define BLAKE2S_PERSONALBYTES  8

 #pragma pack(push, 1)
 	typedef struct __blake2s_param
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -103,16 +103,16 @@
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
-   V2 = mm128_alignr_64( V[3], V[2], 1 ); \
-   V3 = mm128_alignr_64( V[2], V[3], 1 ); \
-   V6 = mm128_alignr_64( V[6], V[7], 1 ); \
-   V7 = mm128_alignr_64( V[7], V[6], 1 ); \
+   V2 = mm128_alignr_64( V[3], V[2] ); \
+   V3 = mm128_alignr_64( V[2], V[3] ); \
+   V6 = mm128_alignr_64( V[6], V[7] ); \
+   V7 = mm128_alignr_64( V[7], V[6] ); \
   BLAKE2B_G( V[0], V2, V[5], V6,  8,  9, 10, 11 ); \
   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
-   V[2] = mm128_alignr_64( V2, V3, 1 ); \
-   V[3] = mm128_alignr_64( V3, V2, 1 ); \
-   V[6] = mm128_alignr_64( V7, V6, 1 ); \
-   V[7] = mm128_alignr_64( V6, V7, 1 ); \
+   V[2] = mm128_alignr_64( V2, V3 ); \
+   V[3] = mm128_alignr_64( V3, V2 ); \
+   V[6] = mm128_alignr_64( V7, V6 ); \
+   V[7] = mm128_alignr_64( V6, V7 ); \
 }

 #else
--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
      be32enc(&endiandata[19], nonce);
      myriad_hash(hash, endiandata);

-      if (hash[7] <= Htarg )
-      if ( fulltest(hash, ptarget) && !opt_benchmark )
+      if (hash[7] <= Htarg && fulltest(hash, ptarget))
      {
         pdata[19] = nonce;
-         submit_solution( work, hash, mythr );
+         *hashes_done = pdata[19] - first_nonce;
+         return 1;
      }
      nonce++;

--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -19,34 +19,26 @@
 */

 #include <string.h>
+#include <emmintrin.h>
 #include "simd-utils.h"
 #include "luffa_for_sse2.h"

-#if defined(__AVX512VL__)
-
-#define MULT2( a0, a1 ) \
-{ \
-  __m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
-  a0 = _mm_alignr_epi32( a1, b, 1 ); \
-  a1 = _mm_alignr_epi32( b, a1, 1 ); \
-}
-
-#elif defined(__SSE4_1__)
+#if defined(__SSE4_1__)

 #define MULT2( a0, a1 ) do \
 { \
-  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
-  a0 = _mm_alignr_epi8( a1, b, 4 ); \
-  a1 = _mm_alignr_epi8( b, a1, 4 ); \
+  __m128i b =  _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
+  a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
+  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) );  \
 } while(0)

 #else

 #define MULT2( a0, a1 ) do \
 { \
-  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
+  __m128i b =  _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
  a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
-  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
+  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) );  \
 } while(0)

 #endif
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -24,45 +24,6 @@ typedef union {
 #endif
 } allium_16way_ctx_holder;

-static uint32_t allium_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
-static __m512i allium_16way_block0_hash[8] __attribute__ ((aligned (64)));
-static __m512i allium_16way_block_buf[16] __attribute__ ((aligned (64)));
-
-int allium_16way_prehash( struct work *work )
-{
-   uint32_t phash[8] __attribute__ ((aligned (32))) =
-   {
-      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-   };
-   uint32_t *pdata = work->data;
-
-   // Prehash first block.
-   blake256_transform_le( phash, pdata, 512, 0 );
-
-   // Interleave hash for second block prehash.
-   allium_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] );
-   allium_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] );
-   allium_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] );
-   allium_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] );
-   allium_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] );
-   allium_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] );
-   allium_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] );
-   allium_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] );
-
-   // Build vectored second block, interleave 12 of last 16 bytes of data,
-   // excluding the nonce.
-   allium_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
-   allium_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
-   allium_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
-
-   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( allium_16way_midstate_vars,
-                         allium_16way_block0_hash, allium_16way_block_buf );
-
-   return 1;
-}
-
 static void allium_16way_hash( void *state, const void *midstate_vars, 
                               const void *midhash, const void *block )
 {
@@ -239,6 +200,11 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
   uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
   __m512i block0_hash[8] __attribute__ ((aligned (64)));
   __m512i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) = 
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -250,19 +216,31 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   pthread_rwlock_rdlock( &g_work_lock );
+   // Prehash first block.
+   blake256_transform_le( phash, pdata, 512, 0 );

-   memcpy( midstate_vars, allium_16way_midstate_vars, sizeof midstate_vars );
-   memcpy( block0_hash,   allium_16way_block0_hash,   sizeof block0_hash );
-   memcpy( block_buf,     allium_16way_block_buf,     sizeof block_buf );
+   // Interleave hash for second block prehash.
+   block0_hash[0] = _mm512_set1_epi32( phash[0] );
+   block0_hash[1] = _mm512_set1_epi32( phash[1] );
+   block0_hash[2] = _mm512_set1_epi32( phash[2] );
+   block0_hash[3] = _mm512_set1_epi32( phash[3] );
+   block0_hash[4] = _mm512_set1_epi32( phash[4] );
+   block0_hash[5] = _mm512_set1_epi32( phash[5] );
+   block0_hash[6] = _mm512_set1_epi32( phash[6] );
+   block0_hash[7] = _mm512_set1_epi32( phash[7] );

-   pthread_rwlock_unlock( &g_work_lock );
-
-   // fill in the nonces
-   block_buf[3] =
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
+   block_buf[ 3] =
             _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
-   
+
+   // Partially prehash second block without touching nonces in block_buf[3].
+   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+
   do {
     allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );

@@ -293,44 +271,6 @@ typedef union {
 #endif
 } allium_8way_ctx_holder;

-static uint32_t allium_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
-static __m256i allium_8way_block0_hash[8] __attribute__ ((aligned (64)));
-static __m256i allium_8way_block_buf[16] __attribute__ ((aligned (64)));
-
-int allium_8way_prehash ( struct work *work )
-{
-   uint32_t phash[8] __attribute__ ((aligned (32))) =
-   {
-      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-   };
-   uint32_t *pdata = work->data;
-
-   // Prehash first block
-   blake256_transform_le( phash, pdata, 512, 0 );
-
-   allium_8way_block0_hash[0] = _mm256_set1_epi32( phash[0] );
-   allium_8way_block0_hash[1] = _mm256_set1_epi32( phash[1] );
-   allium_8way_block0_hash[2] = _mm256_set1_epi32( phash[2] );
-   allium_8way_block0_hash[3] = _mm256_set1_epi32( phash[3] );
-   allium_8way_block0_hash[4] = _mm256_set1_epi32( phash[4] );
-   allium_8way_block0_hash[5] = _mm256_set1_epi32( phash[5] );
-   allium_8way_block0_hash[6] = _mm256_set1_epi32( phash[6] );
-   allium_8way_block0_hash[7] = _mm256_set1_epi32( phash[7] );
-
-   // Build vectored second block, interleave 12 of the last 16 bytes,
-   // excepting the nonces.
-   allium_8way_block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
-   allium_8way_block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
-   allium_8way_block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
-
-   // Partialy prehash second block without touching nonces
-   blake256_8way_round0_prehash_le( allium_8way_midstate_vars,
-                             allium_8way_block0_hash, allium_8way_block_buf );
-
-   return 1;
-}
-
 static void allium_8way_hash( void *hash, const void *midstate_vars,
                               const void *midhash, const void *block )
 {
@@ -446,6 +386,11 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
   uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
   __m256i block0_hash[8] __attribute__ ((aligned (64)));
   __m256i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
@@ -455,17 +400,29 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;
   const __m256i eight = m256_const1_32( 8 );

-   pthread_rwlock_rdlock( &g_work_lock );
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );

-   memcpy( midstate_vars, allium_8way_midstate_vars, sizeof midstate_vars );
-   memcpy( block0_hash,   allium_8way_block0_hash,   sizeof block0_hash );
-   memcpy( block_buf,     allium_8way_block_buf,     sizeof block_buf );
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );

-   pthread_rwlock_unlock( &g_work_lock );
-   
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
   block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
                                     n+ 3, n+ 2, n+ 1, n );
-   
+
+   // Partially prehash second block without touching nonces
+   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+
   do {
     allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );

@@ -481,7 +438,6 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
     n += 8;
     block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
   } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
-
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -131,12 +131,10 @@ bool register_lyra2z_algo( algo_gate_t* gate )
 {
 #if defined(LYRA2Z_16WAY)
  gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
-  gate->prehash    = (void*)&lyra2z_16way_prehash;
  gate->scanhash   = (void*)&scanhash_lyra2z_16way;
 //  gate->hash       = (void*)&lyra2z_16way_hash;
 #elif defined(LYRA2Z_8WAY)
  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
-  gate->prehash    = (void*)&lyra2z_8way_prehash;
  gate->scanhash   = (void*)&scanhash_lyra2z_8way;
 //  gate->hash       = (void*)&lyra2z_8way_hash;
 #elif defined(LYRA2Z_4WAY)
@@ -177,10 +175,8 @@ bool register_lyra2h_algo( algo_gate_t* gate )
 bool register_allium_algo( algo_gate_t* gate )
 {
 #if defined (ALLIUM_16WAY)
-  gate->prehash   = (void*)&allium_16way_prehash;
  gate->scanhash  = (void*)&scanhash_allium_16way;
 #elif defined (ALLIUM_8WAY)
-  gate->prehash   = (void*)&allium_8way_prehash;
  gate->scanhash  = (void*)&scanhash_allium_8way;
 #else
  gate->miner_thread_init = (void*)&init_allium_ctx;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,6 +5,7 @@
 #include <stdint.h>
 #include "lyra2.h"

+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LYRA2REV3_16WAY 1
 #elif defined(__AVX2__)
@@ -101,7 +102,6 @@ bool init_lyra2rev2_ctx();
 //void lyra2z_16way_hash( void *state, const void *input );
 int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-int lyra2z_16way_prehash ( struct work *work );
 bool lyra2z_16way_thread_init();

 #elif defined(LYRA2Z_8WAY)
@@ -110,7 +110,6 @@ bool lyra2z_16way_thread_init();
 int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_8way_thread_init();
-int lyra2z_8way_prehash ( struct work *work );

 #elif defined(LYRA2Z_4WAY)

@@ -166,13 +165,11 @@ bool register_allium_algo( algo_gate_t* gate );

 int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-int allium_16way_prehash ( struct work *work );

 #elif defined(ALLIUM_8WAY)

 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-int allium_8way_prehash ( struct work *work );

 #else

--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -14,44 +14,6 @@ bool lyra2z_16way_thread_init()
 return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static uint32_t lyra2z_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
-static __m512i lyra2z_16way_block0_hash[8] __attribute__ ((aligned (64)));
-static __m512i lyra2z_16way_block_buf[16] __attribute__ ((aligned (64)));
-
-int lyra2z_16way_prehash ( struct work *work )
-{
-   uint32_t phash[8] __attribute__ ((aligned (32))) =
-   {
-      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-   };
-   uint32_t *pdata = work->data;
-
-   // Prehash first block
-   blake256_transform_le( phash, pdata, 512, 0 );
-
-   lyra2z_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] );
-   lyra2z_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] );
-   lyra2z_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] );
-   lyra2z_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] );
-   lyra2z_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] );
-   lyra2z_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] );
-   lyra2z_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] );
-   lyra2z_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] );
-
-   // Build vectored second block, interleave 12 of last 16 bytes of data
-   // excepting the nonce.
-   lyra2z_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
-   lyra2z_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
-   lyra2z_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
-
-   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( lyra2z_16way_midstate_vars, 
-                       lyra2z_16way_block0_hash, lyra2z_16way_block_buf );
-
-   return 1;
-}
-
 static void lyra2z_16way_hash( void *state, const void *midstate_vars,
                        const void *midhash, const void *block )
 {
@@ -129,6 +91,11 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
   uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
   __m512i block0_hash[8] __attribute__ ((aligned (64)));
   __m512i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (64))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -140,18 +107,30 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   pthread_rwlock_rdlock( &g_work_lock );
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );

-   memcpy( midstate_vars, lyra2z_16way_midstate_vars, sizeof midstate_vars );
-   memcpy( block0_hash,   lyra2z_16way_block0_hash,   sizeof block0_hash );
-   memcpy( block_buf,     lyra2z_16way_block_buf,     sizeof block_buf );
+   block0_hash[0] = _mm512_set1_epi32( phash[0] );
+   block0_hash[1] = _mm512_set1_epi32( phash[1] );
+   block0_hash[2] = _mm512_set1_epi32( phash[2] );
+   block0_hash[3] = _mm512_set1_epi32( phash[3] );
+   block0_hash[4] = _mm512_set1_epi32( phash[4] );
+   block0_hash[5] = _mm512_set1_epi32( phash[5] );
+   block0_hash[6] = _mm512_set1_epi32( phash[6] );
+   block0_hash[7] = _mm512_set1_epi32( phash[7] );

-   pthread_rwlock_unlock( &g_work_lock );
-   
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
   block_buf[ 3] =
             _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

+   // Partially prehash second block without touching nonces in block_buf[3].
+   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+
   do {
     lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );

@@ -178,44 +157,6 @@ bool lyra2z_8way_thread_init()
 return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static uint32_t lyra2z_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
-static __m256i lyra2z_8way_block0_hash[8] __attribute__ ((aligned (64)));
-static __m256i lyra2z_8way_block_buf[16] __attribute__ ((aligned (64)));
-
-int lyra2z_8way_prehash ( struct work *work )
-{
-   uint32_t phash[8] __attribute__ ((aligned (32))) =
-   {
-      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-   };
-   uint32_t *pdata = work->data;
-
-   // Prehash first block
-   blake256_transform_le( phash, pdata, 512, 0 );
-
-   lyra2z_8way_block0_hash[0] = _mm256_set1_epi32( phash[0] );
-   lyra2z_8way_block0_hash[1] = _mm256_set1_epi32( phash[1] );
-   lyra2z_8way_block0_hash[2] = _mm256_set1_epi32( phash[2] );
-   lyra2z_8way_block0_hash[3] = _mm256_set1_epi32( phash[3] );
-   lyra2z_8way_block0_hash[4] = _mm256_set1_epi32( phash[4] );
-   lyra2z_8way_block0_hash[5] = _mm256_set1_epi32( phash[5] );
-   lyra2z_8way_block0_hash[6] = _mm256_set1_epi32( phash[6] );
-   lyra2z_8way_block0_hash[7] = _mm256_set1_epi32( phash[7] );
-
-   // Build vectored second block, interleave last 16 bytes of data using
-   // unique nonces.
-   lyra2z_8way_block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
-   lyra2z_8way_block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
-   lyra2z_8way_block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
-
-   // Partialy prehash second block without touching nonces
-   blake256_8way_round0_prehash_le( lyra2z_8way_midstate_vars,
-                           lyra2z_8way_block0_hash, lyra2z_8way_block_buf );
-
-   return 1;
-}
-
 static void lyra2z_8way_hash( void *state, const void *midstate_vars,
                       const void *midhash, const void *block )
 {
@@ -260,6 +201,11 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
   uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
   __m256i block0_hash[8] __attribute__ ((aligned (64)));
   __m256i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
@@ -269,14 +215,23 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;
   const __m256i eight = m256_const1_32( 8 );

-   pthread_rwlock_rdlock( &g_work_lock );
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );

-   memcpy( midstate_vars, lyra2z_8way_midstate_vars, sizeof midstate_vars );
-   memcpy( block0_hash,   lyra2z_8way_block0_hash,   sizeof block0_hash );
-   memcpy( block_buf,     lyra2z_8way_block_buf,     sizeof block_buf );
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );

-   pthread_rwlock_unlock( &g_work_lock );
-   
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
   block_buf[ 3] =
            _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -146,25 +146,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );

 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
-{ \
-   __m128i t; \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   t = mm128_alignr_64( s7, s6, 1 ); \
-   s6 = mm128_alignr_64( s6, s7, 1 ); \
-   s7 = t; \
-   t = mm128_alignr_64( s2, s3, 1 ); \
-   s2 =  mm128_alignr_64( s3, s2, 1 ); \
-   s3 = t; \
+   mm128_vrol256_64( s6, s7 ); \
+   mm128_vror256_64( s2, s3 ); \
   G_2X64( s0, s2, s5, s6 ); \
   G_2X64( s1, s3, s4, s7 ); \
-   t = mm128_alignr_64( s6, s7, 1 ); \
-   s6 = mm128_alignr_64( s7, s6, 1 ); \
-   s7 = t; \
-   t = mm128_alignr_64( s3, s2, 1 ); \
-   s2 =  mm128_alignr_64( s2, s3, 1 ); \
-   s3 = t; \
-} 
+   mm128_vror256_64( s6, s7 ); \
+   mm128_vrol256_64( s2, s3 );

 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -7,16 +7,8 @@

 #if defined (SKEIN_8WAY)

-static skein512_8way_context skein512_8way_ctx
+static __thread skein512_8way_context skein512_8way_ctx
                                            __attribute__ ((aligned (64)));
-static uint32_t skein_8way_vdata[20*8] __attribute__ ((aligned (64)));
-
-int skein_8way_prehash( struct work *work )
-{
-    mm512_bswap32_intrlv80_8x64( skein_8way_vdata, work->data );
-    skein512_8way_prehash64( &skein512_8way_ctx, skein_8way_vdata );
-    return 1;
-}

 void skeinhash_8way( void *state, const void *input )
 {
@@ -37,27 +29,25 @@ void skeinhash_8way( void *state, const void *input )
 int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[20*8] __attribute__ ((aligned (128)));
-   uint32_t hash[8*8] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash_d7 = &(hash[7*8]);
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t targ_d7 = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 8;
-   uint32_t n = first_nonce;
-   __m512i  *noncev = (__m512i*)vdata + 9; 
-   const int thr_id = mythr->id; 
-   const bool bench = opt_benchmark;
-    
-    pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( vdata, skein_8way_vdata, sizeof vdata );
-    pthread_rwlock_unlock( &g_work_lock );
+    uint32_t vdata[20*8] __attribute__ ((aligned (128)));
+    uint32_t hash[8*8] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    uint32_t *hash_d7 = &(hash[7*8]);
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t targ_d7 = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 8;
+    uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9; 
+    const int thr_id = mythr->id; 
+    const bool bench = opt_benchmark;

+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   *noncev = mm512_intrlv_blend_32(
                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
+   skein512_8way_prehash64( &skein512_8way_ctx, vdata );
   do
   {
       skeinhash_8way( hash, vdata );
@@ -84,16 +74,8 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,

 #elif defined (SKEIN_4WAY)

-static skein512_4way_context skein512_4way_ctx
+static __thread skein512_4way_context skein512_4way_ctx
                                            __attribute__ ((aligned (64)));
-static uint32_t skein_4way_vdata[20*4] __attribute__ ((aligned (64)));
-
-int skein_4way_prehash( struct work *work )
-{
-    mm256_bswap32_intrlv80_4x64( skein_4way_vdata, work->data );
-    skein512_4way_prehash64( &skein512_4way_ctx, skein_4way_vdata );
-    return 1;
-}

 void skeinhash_4way( void *state, const void *input )
 {
@@ -136,24 +118,23 @@ void skeinhash_4way( void *state, const void *input )
 int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t hash[8*4] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   uint32_t *hash_d7 = &(hash[7<<2]);
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t targ_d7 = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 4;
-   uint32_t n = first_nonce;
-   __m256i  *noncev = (__m256i*)vdata + 9; 
-   const int thr_id = mythr->id; 
-   const bool bench = opt_benchmark;
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t hash[8*4] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+    uint32_t *hash_d7 = &(hash[7<<2]);
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t targ_d7 = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 4;
+    uint32_t n = first_nonce;
+    __m256i  *noncev = (__m256i*)vdata + 9; 
+    const int thr_id = mythr->id; 
+    const bool bench = opt_benchmark;
+
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   skein512_4way_prehash64( &skein512_4way_ctx, vdata );

-   pthread_rwlock_rdlock( &g_work_lock );
-      memcpy( vdata, skein_4way_vdata, sizeof vdata );
-   pthread_rwlock_unlock( &g_work_lock );
-    
   *noncev = mm256_intrlv_blend_32(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -7,12 +7,10 @@ bool register_skein_algo( algo_gate_t* gate )
 #if defined (SKEIN_8WAY)
    gate->optimizations = AVX2_OPT | AVX512_OPT;
    gate->scanhash  = (void*)&scanhash_skein_8way;
-    gate->prehash   = (void*)&skein_8way_prehash;
    gate->hash      = (void*)&skeinhash_8way;
 #elif defined (SKEIN_4WAY)
    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
    gate->scanhash  = (void*)&scanhash_skein_4way;
-    gate->prehash   = (void*)&skein_4way_prehash;
    gate->hash      = (void*)&skeinhash_4way;
 #else
    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
@@ -27,12 +25,10 @@ bool register_skein2_algo( algo_gate_t* gate )
  gate->optimizations = AVX2_OPT | AVX512_OPT;
 #if defined (SKEIN_8WAY)
  gate->scanhash  = (void*)&scanhash_skein2_8way;
-//  gate->hash      = (void*)&skein2hash_8way;
-  gate->prehash   = (void*)&skein2_8way_prehash;
+  gate->hash      = (void*)&skein2hash_8way;
 #elif defined (SKEIN_4WAY)
  gate->scanhash  = (void*)&scanhash_skein2_4way;
-//  gate->hash      = (void*)&skein2hash_4way;
-  gate->prehash   = (void*)&skein2_4way_prehash;
+  gate->hash      = (void*)&skein2hash_4way;
 #else
  gate->scanhash  = (void*)&scanhash_skein2;
  gate->hash      = (void*)&skein2hash;
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -14,24 +14,20 @@
 void skeinhash_8way( void *output, const void *input );
 int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int skein_8way_prehash( struct work * );

 void skein2hash_8way( void *output, const void *input );
 int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
                          uint64_t* hashes_done, struct thr_info *mythr );
-int skein2_8way_prehash( struct work * );

 #elif defined(SKEIN_4WAY)

 void skeinhash_4way( void *output, const void *input );
 int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int skein_4way_prehash( struct work * );

 void skein2hash_4way( void *output, const void *input );
 int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
                          uint64_t* hashes_done, struct thr_info *mythr );
-int skein2_4way_prehash( struct work * );

 #else

--- a/algo/skein/skein.c
+++ b/algo/skein/skein.c
@@ -31,19 +31,18 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
-   int thr_id = mythr->id;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   swab32_array( endiandata, pdata, 20 );

 	do {
 		be32enc(&endiandata[19], n); 
 		skeinhash(hash64, endiandata);
-      if (hash64[7] <= Htarg )
-      if ( fulltest(hash64, ptarget) && !opt_benchmark )
-      {
-         pdata[19] = n;
-         submit_solution( work, hash64, mythr );
-      }
+		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
+			pdata[19] = n;
+			return true;
+		}
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -5,17 +5,9 @@

 #if defined(SKEIN_8WAY)

-static skein512_8way_context skein512_8way_ctx __attribute__ ((aligned (64)));
-static uint32_t skein2_8way_vdata[20*8] __attribute__ ((aligned (64)));
+ static __thread skein512_8way_context skein512_8way_ctx
+                                             __attribute__ ((aligned (64)));

-int skein2_8way_prehash( struct work *work )
-{
-    mm512_bswap32_intrlv80_8x64( skein2_8way_vdata, work->data );
-    skein512_8way_prehash64( &skein512_8way_ctx, skein2_8way_vdata );
-    return 1;
-}
-
-/* not used
 void skein2hash_8way( void *output, const void *input )
 {
   uint64_t hash[16*8] __attribute__ ((aligned (128)));
@@ -25,7 +17,6 @@ void skein2hash_8way( void *output, const void *input )
   skein512_8way_final16( &ctx, hash, input + (64*8) );
   skein512_8way_full( &ctx, output, hash, 64 );
 }
-*/

 int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
@@ -45,14 +36,11 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
    const bool bench = opt_benchmark;
    skein512_8way_context ctx;

-    pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( vdata, skein2_8way_vdata, sizeof vdata );
-       memcpy( &ctx, &skein512_8way_ctx, sizeof ctx );
-    pthread_rwlock_unlock( &g_work_lock );
-
+    mm512_bswap32_intrlv80_8x64( vdata, pdata );
    *noncev = mm512_intrlv_blend_32(
                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
+    skein512_8way_prehash64( &ctx, vdata );
    do
    {
       skein512_8way_final16( &ctx, hash, vdata + (16*8) );
@@ -79,18 +67,10 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
 }

 #elif defined(SKEIN_4WAY)
-                                           
-static skein512_4way_context skein512_4way_ctx __attribute__ ((aligned (64)));
-static uint32_t skein2_4way_vdata[20*4] __attribute__ ((aligned (64)));
-                                           
-int skein2_4way_prehash( struct work *work )
-{
-    mm256_bswap32_intrlv80_4x64( skein2_4way_vdata, work->data );
-    skein512_4way_prehash64( &skein512_4way_ctx, skein2_4way_vdata );
-    return 1;
-}   

-/* not used
+static __thread skein512_4way_context skein512_4way_ctx
+                                           __attribute__ ((aligned (64)));
+
 void skein2hash_4way( void *output, const void *input )
 {
   skein512_4way_context ctx;
@@ -100,7 +80,6 @@ void skein2hash_4way( void *output, const void *input )
   skein512_4way_final16( &ctx, hash, input + (64*4) );
   skein512_4way_full( &ctx, output, hash, 64 );
 }
-*/

 int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
@@ -120,11 +99,8 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
    const bool bench = opt_benchmark;
    skein512_4way_context ctx;

-    pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( vdata, skein2_4way_vdata, sizeof vdata );
-       memcpy( &ctx, &skein512_4way_ctx, sizeof ctx );
-    pthread_rwlock_unlock( &g_work_lock );
-
+    mm256_bswap32_intrlv80_4x64( vdata, pdata );
+    skein512_4way_prehash64( &ctx, vdata );
    *noncev = mm256_intrlv_blend_32(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
    do 
--- a/algo/skein/skein2.c
+++ b/algo/skein/skein2.c
@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
 	sph_skein512_close(&ctx_skein, hash);

 	memcpy(output, hash, 32);
+
 }

 int scanhash_skein2( struct work *work,	uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
 	uint32_t hash64[8] __attribute__ ((aligned (64)));
 	uint32_t endiandata[20] __attribute__ ((aligned (64)));
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
-   int thr_id = mythr->id; 
+   int thr_id = mythr->id;  // thr_id arg is deprecated

-   swab32_array( endiandata, pdata, 20 );
+        swab32_array( endiandata, pdata, 20 );

 	do {
 		be32enc(&endiandata[19], n);
 		skein2hash(hash64, endiandata);
-      if (hash64[7] <= Htarg )
-      if ( fulltest(hash64, ptarget) && !opt_benchmark )
-      {
-         pdata[19] = n;
-         submit_solution( work, hash64, mythr );
-      }
+		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
+			pdata[19] = n;
+			return true;
+		}
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
--- a/algo/x16/hex.c
+++ b/algo/x16/hex.c
@@ -25,7 +25,7 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)

 static __thread x16r_context_overlay hex_ctx;

-int hex_hash( void* output, const void* input, const int thrid )
+int hex_hash( void* output, const void* input, int thrid )
 {
   uint32_t _ALIGN(128) hash[16];
   x16r_context_overlay ctx;
--- a/algo/x16/minotaur.c
+++ b/algo/x16/minotaur.c
@@ -72,7 +72,7 @@ struct TortureGarden

 // Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index
 static int get_hash( void *output, const void *input, TortureGarden *garden,
-	                  unsigned int algo, const int thr_id )
+	                  unsigned int algo, int thr_id )
 {    
 	unsigned char hash[64] __attribute__ ((aligned (64)));
   int rc = 1;
@@ -233,7 +233,7 @@ bool initialize_torture_garden()
 }

 // Produce a 32-byte hash from 80-byte input data
-int minotaur_hash( void *output, const void *input, const int thr_id )
+int minotaur_hash( void *output, const void *input, int thr_id )
 {    
    unsigned char hash[64] __attribute__ ((aligned (64)));
    int rc = 1;
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -19,7 +19,7 @@
 // Perform midstate prehash of hash functions with block size <= 72 bytes,
 // 76 bytes for hash functions that operate on 32 bit data.

-void x16r_8way_do_prehash( void *vdata, const void *pdata )
+void x16r_8way_prehash( void *vdata, void *pdata )
 {
   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
@@ -106,18 +106,11 @@ void x16r_8way_do_prehash( void *vdata, const void *pdata )
   }
 }

-int x16r_8way_prehash( struct work *work )
-{
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16r_8way_do_prehash( x16r_8way_vdata, work->data );
-   return 1;
-}
-
 // Perform the full x16r hash and returns 512 bit intermediate hash.
 // Called by wrapper hash function to optionally continue hashing and
 // convert to final hash.

-int x16r_8way_hash_generic( void* output, const void* input, const int thrid )
+int x16r_8way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*8] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (16)));
@@ -478,7 +471,7 @@ int x16r_8way_hash_generic( void* output, const void* input, const int thrid )

 // x16-r,-s,-rt wrapper called directly by scanhash to repackage 512 bit
 // hash to 256 bit final hash.
-int x16r_8way_hash( void* output, const void* input, const int thrid )
+int x16r_8way_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*8] __attribute__ ((aligned (128)));
   if ( !x16r_8way_hash_generic( hash, input, thrid ) )
@@ -502,6 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -514,16 +508,27 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,

   if ( bench )   ptarget[7] = 0x0cff;

-   pthread_rwlock_rdlock( &g_work_lock );
-      memcpy( vdata, x16r_8way_vdata, sizeof vdata );
-   pthread_rwlock_unlock( &g_work_lock );
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );

+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+
+      if ( opt_debug && !thr_id )
+          applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
+   }
+
+   x16r_8way_prehash( vdata, pdata );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
-      if( algo_gate.hash( hash, vdata, thr_id ) );
+      if( x16r_8way_hash( hash, vdata, thr_id ) );
      for ( int i = 0; i < 8; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
@@ -541,7 +546,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,

 #elif defined (X16R_4WAY)

-void x16r_4way_do_prehash( void *vdata, const void *pdata )
+void x16r_4way_prehash( void *vdata, void *pdata )
 {
   uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
@@ -622,14 +627,7 @@ void x16r_4way_do_prehash( void *vdata, const void *pdata )
   }
 }

-int x16r_4way_prehash( struct work *work )
-{
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16r_4way_do_prehash( x16r_4way_vdata, work->data );
-   return 1;
-}
-
-int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
+int x16r_4way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*4] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -637,14 +635,13 @@ int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
   uint32_t hash2[20] __attribute__ ((aligned (32)));
   uint32_t hash3[20] __attribute__ ((aligned (32)));
   x16r_4way_context_overlay ctx;
+   memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   void *in2 = (void*) hash2;
   void *in3 = (void*) hash3;
   int size = 80;

-   memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
-
   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );

   for ( int i = 0; i < 16; i++ )
@@ -908,7 +905,7 @@ int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
   return 1;
 }

-int x16r_4way_hash( void* output, const void* input, const int thrid )
+int x16r_4way_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*4] __attribute__ ((aligned (64)));
   if ( !x16r_4way_hash_generic( hash, input, thrid ) )
@@ -927,6 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -939,15 +937,25 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0cff;

-   pthread_rwlock_rdlock( &g_work_lock );
-      memcpy( vdata, x16r_4way_vdata, sizeof vdata );
-   pthread_rwlock_unlock( &g_work_lock );
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );

+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
+   }
+
+   x16r_4way_prehash( vdata, pdata );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
-      if ( algo_gate.hash( hash, vdata, thr_id ) );
+      if ( x16r_4way_hash( hash, vdata, thr_id ) )   // result gates lane checks
      for ( int i = 0; i < 4; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -1,44 +1,26 @@
 #include "x16r-gate.h"
 #include "algo/sha/sha256d.h"

-char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = {0};
+__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };

-void (*x16r_gate_get_hash_order) ( const struct work *, char * ) = NULL;
+void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;

 #if defined (X16R_8WAY)

-x16r_8way_context_overlay x16r_ctx;
-uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
+__thread x16r_8way_context_overlay x16r_ctx;

 #elif defined (X16R_4WAY)

-x16r_4way_context_overlay x16r_ctx;
-uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
-
+__thread x16r_4way_context_overlay x16r_ctx;

 #endif

-#if defined (X16RV2_8WAY)
+__thread x16r_context_overlay x16_ctx;

-x16rv2_8way_context_overlay x16rv2_ctx;

-#elif defined (X16RV2_4WAY)
-
-x16rv2_4way_context_overlay x16rv2_ctx;
-
-#endif
-
-x16r_context_overlay x16_ctx;
-uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
-
-void x16r_get_hash_order( const struct work *work, char *hash_order )
+void x16r_getAlgoString( const uint8_t* prevblock, char *output )
 {
-   char *sptr = hash_order;
-   const uint32_t *pdata = work->data;
-   uint8_t prevblock[16];
-   ((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
-   ((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
-
+   char *sptr = output;
   for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
   {
      uint8_t b = (15 - j) >> 1; // 16 first ascii hex chars (lsb in uint256)
@@ -50,51 +32,38 @@ void x16r_get_hash_order( const struct work *work, char *hash_order )
      sptr++;
   }
   *sptr = '\0';
-
-   if ( !opt_quiet )
-      applog( LOG_INFO, "Hash order %s", x16r_hash_order );
 }
-   
-void x16s_get_hash_order( const struct work *work, char *hash_order )
+
+void x16s_getAlgoString( const uint8_t* prevblock, char *output )
 {
-   const uint32_t *pdata = work->data;
-   uint8_t prevblock[16];
-   ((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
-   ((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
-   strcpy( hash_order, "0123456789ABCDEF" );
+   strcpy( output, "0123456789ABCDEF" );
   for ( int i = 0; i < 16; i++ )
   {
      uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
      uint8_t algoDigit = (i & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
      int offset = algoDigit;
      // insert the nth character at the front
-      char oldVal = hash_order[ offset ];
+      char oldVal = output[offset];
      for( int j = offset; j-- > 0; )
-         hash_order[ j+1 ] = hash_order[ j ];
-      hash_order[ 0 ] = oldVal;
+         output[j+1] = output[j];
+      output[0] = oldVal;
   }
-
-   if ( !opt_quiet )
-      applog( LOG_INFO, "Hash order %s", x16r_hash_order );
 }

 bool register_x16r_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16r_8way_prehash;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16r_4way_prehash;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16r_prehash;
  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
+  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
 };
@@ -102,20 +71,17 @@ bool register_x16r_algo( algo_gate_t* gate )
 bool register_x16rv2_algo( algo_gate_t* gate )
 {
 #if defined (X16RV2_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16rv2_8way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rv2_8way;
  gate->hash      = (void*)&x16rv2_8way_hash;
 #elif defined (X16RV2_4WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16rv2_4way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
  gate->hash      = (void*)&x16rv2_4way_hash;
 #else
-  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16rv2_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rv2;
  gate->hash      = (void*)&x16rv2_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
+  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
 };
@@ -124,19 +90,16 @@ bool register_x16s_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16r_8way_prehash;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16r_4way_prehash;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16r_prehash;
  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
+  x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
 };
@@ -145,33 +108,30 @@ bool register_x16s_algo( algo_gate_t* gate )
 //
 //   X16RT

-void x16rt_get_hash_order( const struct work * work, char * hash_order )
-{   
-   uint32_t _ALIGN(64) timehash[8*8];
-   const uint32_t ntime = bswap_32( work->data[17] );
-   const int32_t masked_ntime = ntime & 0xffffff80;
-   uint8_t* data = (uint8_t*)timehash;
-   char *sptr = hash_order;

-   sha256d( (unsigned char*)timehash, (const unsigned char*)( &masked_ntime ),
-             sizeof( masked_ntime ) );
+// Passes timeStamp, masked with 0xffffff80, through sha256d() into timeHash.
+void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
+{
+    const uint32_t maskedTime = timeStamp & 0xffffff80;  // keep it unsigned
+    sha256d( timeHash, (const unsigned char*)&maskedTime, sizeof maskedTime );
+}

-   for ( uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
-   {
+void x16rt_getAlgoString( const uint32_t *timeHash, char *output)
+{
+   char *sptr = output;
+   uint8_t* data = (uint8_t*)timeHash;
+
+   for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) {
      uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
      uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;

-      if ( algoDigit >= 10 )
-         sprintf( sptr, "%c", 'A' + (algoDigit - 10) );
+      if (algoDigit >= 10)
+         sprintf(sptr, "%c", 'A' + (algoDigit - 10));
      else
-         sprintf( sptr, "%u", (uint32_t) algoDigit );
+         sprintf(sptr, "%u", (uint32_t) algoDigit);
      sptr++;
   }
   *sptr = '\0';
-
-   if ( !opt_quiet )
-      applog( LOG_INFO, "Hash order %s, ntime %08x, time hash %08x",
-                         hash_order, ntime, timehash );
 }

 void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -262,19 +222,15 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 bool register_x16rt_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16r_8way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16r_4way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
-  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16r_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  opt_target_factor = 256.0;
  return true;
@@ -283,20 +239,16 @@ bool register_x16rt_algo( algo_gate_t* gate )
 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16r_8way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16r_4way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
-  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16r_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
  gate->build_extraheader = (void*)&veil_build_extraheader;
  opt_target_factor = 256.0;
  return true;
@@ -323,23 +275,20 @@ bool register_hex_algo( algo_gate_t* gate )
 bool register_x21s_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
-  gate->scanhash          = (void*)&scanhash_x16r_8way;
-  gate->prehash           = (void*)&x16r_8way_prehash;
+  gate->scanhash          = (void*)&scanhash_x21s_8way;
  gate->hash              = (void*)&x21s_8way_hash;
  gate->miner_thread_init = (void*)&x21s_8way_thread_init;
 #elif defined (X16R_4WAY)
-  gate->scanhash          = (void*)&scanhash_x16r_4way;
-  gate->prehash           = (void*)&x16r_4way_prehash;
+  gate->scanhash          = (void*)&scanhash_x21s_4way;
  gate->hash              = (void*)&x21s_4way_hash;
  gate->miner_thread_init = (void*)&x21s_4way_thread_init;
 #else
-  gate->scanhash          = (void*)&scanhash_x16r;
-  gate->prehash           = (void*)&x16r_prehash;
+  gate->scanhash          = (void*)&scanhash_x21s;
  gate->hash              = (void*)&x21s_hash;
  gate->miner_thread_init = (void*)&x21s_thread_init;
 #endif
  gate->optimizations  = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
+  x16_r_s_getAlgoString   = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
 };
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -21,7 +21,6 @@
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/sha/sph_sha2.h"
-#include "algo/tiger/sph_tiger.h"

 #if defined(__AES__)
 #include "algo/echo/aes_ni/hash_api.h"
@@ -58,11 +57,13 @@

  #define X16R_8WAY   1
  #define X16RV2_8WAY 1
+  #define X16RT_8WAY  1
  #define X21S_8WAY   1

 #elif defined(__AVX2__) && defined(__AES__)

  #define X16RV2_4WAY 1
+  #define X16RT_4WAY  1
  #define X21S_4WAY   1
  #define X16R_4WAY   1

@@ -88,29 +89,23 @@ enum x16r_Algo {
        X16R_HASH_FUNC_COUNT
 };

+extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];

-//extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
-extern char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
-
-
-extern void (*x16r_gate_get_hash_order) ( const struct work *, char * );
-
-// x16r, x16rv2
-void x16r_get_hash_order( const struct work *, char * );
-// x16s, x21s
-void x16s_get_hash_order( const struct work *, char * );
-// x16rt
-void x16rt_get_hash_order( const struct work *, char * );
+extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
+void x16r_getAlgoString( const uint8_t *prevblock, char *output );
+void x16s_getAlgoString( const uint8_t *prevblock, char *output );
+void x16rt_getAlgoString( const uint32_t *timeHash, char *output );

+void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash );

 bool register_x16r_algo( algo_gate_t* gate );
 bool register_x16rv2_algo( algo_gate_t* gate );
 bool register_x16s_algo( algo_gate_t* gate );
 bool register_x16rt_algo( algo_gate_t* gate );
-bool register_hex_algo( algo_gate_t* gate );
-bool register_x21s_algo( algo_gate_t* gate );
+bool register_hex_algo( algo_gate_t* gate );
+bool register_x21s_algo( algo_gate_t* gate );

-// x16r, x16s, x16rt
+// x16r, x16s
 #if defined(X16R_8WAY)

 union _x16r_8way_context_overlay
@@ -141,15 +136,15 @@ union _x16r_8way_context_overlay

 typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;

-extern x16r_8way_context_overlay x16r_ctx;
-extern uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
+extern __thread x16r_8way_context_overlay x16r_ctx;

-void x16r_8way_do_prehash( void *, const void * );
-int x16r_8way_prehash( struct work * );
-int x16r_8way_hash_generic( void *, const void *, const int );
-int x16r_8way_hash( void *, const void *, const int );
+void x16r_8way_prehash( void *, void * );
+int x16r_8way_hash_generic( void *, const void *, int );
+int x16r_8way_hash( void *, const void *, int );
 int scanhash_x16r_8way( struct work *, uint32_t ,
                        uint64_t *, struct thr_info * );
+extern __thread x16r_8way_context_overlay x16r_ctx;
+

 #elif defined(X16R_4WAY)

@@ -182,15 +177,14 @@ union _x16r_4way_context_overlay

 typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;

-extern x16r_4way_context_overlay x16r_ctx;
-extern uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
+extern __thread x16r_4way_context_overlay x16r_ctx;

-void x16r_4way_do_prehash( void *, const void * );
-int x16r_4way_prehash( struct work * );
-int x16r_4way_hash_generic( void *, const void *, const int );
-int x16r_4way_hash( void *, const void *, const int );
+void x16r_4way_prehash( void *, void * );
+int x16r_4way_hash_generic( void *, const void *, int );
+int x16r_4way_hash( void *, const void *, int );
 int scanhash_x16r_4way( struct work *, uint32_t,
                        uint64_t *, struct thr_info * );
+extern __thread x16r_4way_context_overlay x16r_ctx;

 #endif

@@ -223,113 +217,80 @@ union _x16r_context_overlay

 typedef union _x16r_context_overlay x16r_context_overlay;

-extern x16r_context_overlay x16_ctx;
-extern uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
+extern __thread x16r_context_overlay x16_ctx;

-void x16r_do_prehash( const void * );
-int x16r_prehash( const struct work * );
-int x16r_hash_generic( void *, const void *, const int );
-int x16r_hash( void *, const void *, const int );
+void x16r_prehash( void *, void * );
+int x16r_hash_generic( void *, const void *, int );
+int x16r_hash( void *, const void *, int );
 int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );

 // x16Rv2
 #if defined(X16RV2_8WAY)

-union _x16rv2_8way_context_overlay
-{
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
-    luffa_4way_context      luffa;
-    cubehashParam           cube;
-    simd_4way_context       simd;
-    hamsi512_8way_context   hamsi;
-    hashState_fugue         fugue;
-    shabal512_8way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
-    sph_tiger_context       tiger;
-#if defined(__VAES__)
-    groestl512_4way_context groestl;
-    shavite512_4way_context shavite;
-    echo_4way_context       echo;
-#else
-    hashState_groestl       groestl;
-    shavite512_context      shavite;
-    hashState_echo          echo;
-#endif
-} __attribute__ ((aligned (64)));
-
-typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
-extern x16rv2_8way_context_overlay x16rv2_ctx;
-
-int x16rv2_8way_prehash( struct work * );
-int x16rv2_8way_hash( void *state, const void *input, const int thrid );
-//int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
-//                          uint64_t *hashes_done, struct thr_info *mythr );
+int x16rv2_8way_hash( void *state, const void *input, int thrid );
+int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );

 #elif defined(X16RV2_4WAY)

-union _x16rv2_4way_context_overlay
-{
-    blake512_4way_context   blake;
-    bmw512_4way_context     bmw;
-#if defined(__VAES__)
-    groestl512_2way_context groestl;
-    shavite512_2way_context shavite;
-    echo_2way_context       echo;
+int x16rv2_4way_hash( void *state, const void *input, int thrid );
+int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
 #else
-    hashState_groestl       groestl;
-    shavite512_context      shavite;
-    hashState_echo          echo;
+
+int x16rv2_hash( void *state, const void *input, int thr_id );
+int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );
+
 #endif
-    skein512_4way_context   skein;
-    jh512_4way_context      jh;
-    keccak512_4way_context  keccak;
-    luffa_2way_context      luffa;
-    cubehashParam           cube;
-    simd_2way_context       simd;
-    hamsi512_4way_context   hamsi;
-    hashState_fugue         fugue;
-    shabal512_4way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_4way_context     sha512;
-    sph_tiger_context       tiger;
-};

-typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
-extern x16rv2_4way_context_overlay x16rv2_ctx;
+// x16rt, veil
+#if defined(X16R_8WAY)

-int x16rv2_4way_hash( void *state, const void *input, const int thrid );
-int x16rv2_4way_prehash( struct work * );
+//void x16rt_8way_hash( void *state, const void *input );
+int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X16R_4WAY)
+
+//void x16rt_4way_hash( void *state, const void *input );
+int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );

 #else

-int x16rv2_hash( void *state, const void *input, const int thr_id );
-int x16rv2_prehash( const struct work * );
+//void x16rt_hash( void *state, const void *input );
+int scanhash_x16rt( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );

 #endif

 // x21s
 #if defined(X16R_8WAY)

-int x21s_8way_hash( void *state, const void *input, const int thrid );
+int x21s_8way_hash( void *state, const void *input, int thrid );
+int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_8way_thread_init();

 #elif defined(X16R_4WAY)

-int x21s_4way_hash( void *state, const void *input, const int thrid );
+int x21s_4way_hash( void *state, const void *input, int thrid );
+int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_4way_thread_init();

 #else

-int x21s_hash( void *state, const void *input, const int thr_id );
+int x21s_hash( void *state, const void *input, int thr_id );
+int scanhash_x21s( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_thread_init();

 #endif

+//void hex_hash( void *state, const void *input );
 int scanhash_hex( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );

--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -10,7 +10,7 @@
 #include <stdlib.h>
 #include <string.h>

-void x16r_do_prehash( const void *edata )
+void x16r_prehash( void *edata, void *pdata )
 {
   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
@@ -48,7 +48,7 @@ void x16r_do_prehash( const void *edata )
   }
 }

-int x16r_hash_generic( void* output, const void* input, const int thrid )
+int x16r_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t _ALIGN(128) hash[16];
   x16r_context_overlay ctx;
@@ -192,15 +192,7 @@ int x16r_hash_generic( void* output, const void* input, const int thrid )
   return true;
 }

-int x16r_prehash( const struct work *work )
-{
-   mm128_bswap32_80( x16r_edata, work->data );
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16r_do_prehash( x16r_edata );  
-   return 1;
-}
-
-int x16r_hash( void* output, const void* input, const int thrid )
+int x16r_hash( void* output, const void* input, int thrid )
 {  
   uint8_t hash[64] __attribute__ ((aligned (64)));
   if ( !x16r_hash_generic( hash, input, thrid ) )
@@ -213,8 +205,8 @@ int x16r_hash( void* output, const void* input, const int thrid )
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t _ALIGN(32) hash32[8];
-   uint32_t _ALIGN(32) edata[20];
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -224,14 +216,24 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;
   if ( bench )  ptarget[7] = 0x0cff;

-   pthread_rwlock_rdlock( &g_work_lock );
-      memcpy( edata, x16r_edata, sizeof edata );
-   pthread_rwlock_unlock( &g_work_lock );
+   mm128_bswap32_80( edata, pdata );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;   // last ntime seen, per thread
+   const uint32_t ntime = swab32( pdata[17] );
+   if ( s_ntime != ntime )   // compare in the same byte order that is stored
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+           applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+
+   x16r_prehash( edata, pdata );

   do
   {
      edata[19] = nonce;
-      if ( algo_gate.hash( hash32, edata, thr_id ) )
+      if ( x16r_hash( hash32, edata, thr_id ) )
      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( nonce );
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -0,0 +1,113 @@
+#include "x16r-gate.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined (X16R_8WAY)
+// X16RT 8-way scan: tests 8 nonces per pass, sets *hashes_done, returns 0.
+int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[16*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) timeHash[8*8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+
+   if ( bench )   ptarget[7] = 0x0cff;
+   // Refresh the hash order when masked ntime changes; log first hash word.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
+   if ( s_ntime != masked_ntime )
+   {
+      x16rt_getTimeHash( masked_ntime, timeHash );
+      x16rt_getAlgoString( timeHash, x16r_hash_order );
+      s_ntime = masked_ntime;
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash[0] );
+   }
+   // Shared x16r prehash takes vdata/pdata; then blend nonces n..n+7 in.
+   x16r_8way_prehash( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
+                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
+   do
+   {
+      if ( x16r_8way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (X16R_4WAY)
+// X16RT 4-way scan: tests 4 nonces per pass, sets *hashes_done, returns 0.
+int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) timeHash[4*8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+
+   if ( bench )  ptarget[7] = 0x0cff;
+   // Refresh the hash order when masked ntime changes; log first hash word.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
+   if ( s_ntime != masked_ntime )
+   {
+      x16rt_getTimeHash( masked_ntime, timeHash );
+      x16rt_getAlgoString( timeHash, x16r_hash_order );
+      s_ntime = masked_ntime;
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash[0] );
+   }
+   // Shared x16r prehash takes vdata/pdata; then blend nonces n..n+3 in.
+   x16r_4way_prehash( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32(
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
+   do
+   {
+      if ( x16r_4way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
+      n += 4;
+   } while ( (  n < last_nonce ) && !(*restart) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#endif
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -0,0 +1,53 @@
+#include "x16r-gate.h"
+
+#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
+// X16RT scalar scan: tests one nonce per pass, sets *hashes_done, returns 0.
+int scanhash_x16rt( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) edata[20];
+   uint32_t _ALIGN(64) timeHash[8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const int thr_id = mythr->id;
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+   if ( bench )  ptarget[7] = 0x0cff;
+
+   mm128_bswap32_80( edata, pdata );
+   // Refresh the hash order when masked ntime changes; log first hash word.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
+   if ( s_ntime != masked_ntime )
+   {
+      x16rt_getTimeHash( masked_ntime, timeHash );
+      x16rt_getAlgoString( timeHash, x16r_hash_order );
+      s_ntime = masked_ntime;
+      if ( opt_debug && !thr_id )
+          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
+                        x16r_hash_order, swab32( pdata[17] ), timeHash[0] );
+   }
+
+   x16r_prehash( edata, pdata );
+
+   do
+   {
+      edata[19] = nonce;
+      if ( x16r_hash( hash32, edata, thr_id ) )
+      if ( valid_hash( hash32, ptarget ) && !bench )
+      {
+         pdata[19] = bswap_32( nonce );
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce;
+   return 0;
+}
+
+#endif  // !defined(X16R_8WAY) && !defined(X16R_4WAY)
+
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -12,73 +12,37 @@

 #if defined (X16RV2_8WAY)

-void x16rv2_8way_do_prehash( void *vdata, void *pdata )
+union _x16rv2_8way_context_overlay
 {
-   uint32_t vdata32[20*8] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;
+    cubehashParam           cube;
+    simd_4way_context       simd;
+    hamsi512_8way_context   hamsi;
+    hashState_fugue         fugue;
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_8way_context     sha512;
+    sph_tiger_context       tiger;
+#if defined(__VAES__)
+    groestl512_4way_context groestl;
+    shavite512_4way_context shavite;
+    echo_4way_context       echo;
+#else
+    hashState_groestl       groestl;
+    shavite512_context      shavite;
+    hashState_echo          echo;
+#endif
+} __attribute__ ((aligned (64)));

-   const char elem = x16r_hash_order[0];
-   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
+static __thread x16rv2_8way_context_overlay x16rv2_ctx;

-   switch ( algo )
-   {
-      case JH:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         jh512_8way_init( &x16rv2_ctx.jh );
-         jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
-      break;
-      case KECCAK:
-      case LUFFA:
-      case SHA_512:
-         mm128_bswap32_80( edata, pdata );
-         sph_tiger_init( &x16rv2_ctx.tiger );
-         sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      case SKEIN:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         skein512_8way_init( &x16rv2_ctx.skein );
-         skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
-      break;
-      case CUBEHASH:
-         mm128_bswap32_80( edata, pdata );
-         cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      case HAMSI:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         hamsi512_8way_init( &x16rv2_ctx.hamsi );
-         hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
-      break;
-      case SHABAL:
-         mm256_bswap32_intrlv80_8x32( vdata32, pdata );
-         shabal512_8way_init( &x16rv2_ctx.shabal );
-         shabal512_8way_update( &x16rv2_ctx.shabal, vdata32, 64 );
-         rintrlv_8x32_8x64( vdata, vdata32, 640 );
-      break;
-      case WHIRLPOOL:
-         mm128_bswap32_80( edata, pdata );
-         sph_whirlpool_init( &x16rv2_ctx.whirlpool );
-         sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      default:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-   }
-}
-
-int x16rv2_8way_prehash( struct work *work )
-{
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16rv2_8way_do_prehash( x16r_8way_vdata, work->data );
-   return 1;
-}
-
-int x16rv2_8way_hash( void* output, const void* input, const int thrid )
+int x16rv2_8way_hash( void* output, const void* input, int thrid )
 {
   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
   uint32_t hash0[24] __attribute__ ((aligned (32)));
@@ -593,28 +557,50 @@ int x16rv2_8way_hash( void* output, const void* input, const int thrid )
   return 1;
 }

-#elif defined (X16RV2_4WAY)
-
-// Pad the 24 bytes tiger hash to 64 bytes
-inline void padtiger512( uint32_t* hash )
+int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
 {
-  for ( int i = 6; i < 16; i++ ) hash[i] = 0;
-}
-
-void x16rv2_4way_do_prehash( void *vdata, void *pdata )
-{
-   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[16*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;

+   if ( bench ) ptarget[7] = 0x0cff;
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+
+   // Do midstate prehash on hash functions with block size <= 64 bytes.
   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
   switch ( algo )
   {
      case JH:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         jh512_4way_init( &x16rv2_ctx.jh );
-         jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+         jh512_8way_init( &x16rv2_ctx.jh );
+         jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
      break;
      case KECCAK:
      case LUFFA:
@@ -622,45 +608,100 @@ void x16rv2_4way_do_prehash( void *vdata, void *pdata )
         mm128_bswap32_80( edata, pdata );
         sph_tiger_init( &x16rv2_ctx.tiger );
         sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );
      break;
      case SKEIN:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         skein512_4way_prehash64( &x16r_ctx.skein, vdata );
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+         skein512_8way_init( &x16rv2_ctx.skein );
+         skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
      break;
      case CUBEHASH:
         mm128_bswap32_80( edata, pdata );
         cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
         cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );
      break;
      case HAMSI:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         hamsi512_4way_init( &x16rv2_ctx.hamsi );
-         hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+         hamsi512_8way_init( &x16rv2_ctx.hamsi );
+         hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
      break;
      case SHABAL:
-         mm128_bswap32_intrlv80_4x32( vdata32, pdata );
-         shabal512_4way_init( &x16rv2_ctx.shabal );
-         shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
-         rintrlv_4x32_4x64( vdata, vdata32, 640 );
+         mm256_bswap32_intrlv80_8x32( vdata2, pdata );
+         shabal512_8way_init( &x16rv2_ctx.shabal );
+         shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 );
+         rintrlv_8x32_8x64( vdata, vdata2, 640 );
      break;
      case WHIRLPOOL:
         mm128_bswap32_80( edata, pdata );
         sph_whirlpool_init( &x16rv2_ctx.whirlpool );
         sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );
      break;
      default:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
   }
-}   
+   
+   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
+                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
+   do
+   {
+      if ( x16rv2_8way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}

-int x16rv2_4way_prehash( struct work *work )
+#elif defined (X16RV2_4WAY)
+
+union _x16rv2_4way_context_overlay
 {
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16rv2_4way_do_prehash( x16r_4way_vdata, work->data );
-   return 1;
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+#if defined(__VAES__)
+    groestl512_2way_context groestl;
+    shavite512_2way_context shavite;
+    echo_2way_context       echo;
+#else
+    hashState_groestl       groestl;
+    shavite512_context      shavite;
+    hashState_echo          echo;
+#endif
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    luffa_2way_context      luffa;
+    cubehashParam           cube;
+    simd_2way_context       simd;
+    hamsi512_4way_context   hamsi;
+    hashState_fugue         fugue;
+    shabal512_4way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_4way_context     sha512;
+    sph_tiger_context       tiger;
+};
+typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
+
+static __thread x16rv2_4way_context_overlay x16rv2_ctx;
+
+// Pad the 24 bytes tiger hash to 64 bytes
+inline void padtiger512( uint32_t* hash )
+{
+  for ( int i = 6; i < 16; i++ ) hash[i] = 0;
 }

 int x16rv2_4way_hash( void* output, const void* input, int thrid )
@@ -1007,4 +1048,107 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
   return 1;
 }

+int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{  // 4-lane x16rv2 scan: hashes nonces n..n+3 per pass; always returns 0.
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
+   uint32_t edata[20] __attribute__ ((aligned (64)));  // aligned like the 8-way variant; filled by mm128_bswap32_80 (presumably 128-bit stores, TODO confirm)
+   uint32_t bedata1[2];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id; 
+    __m256i  *noncev = (__m256i*)vdata + 9;   // 256-bit row of vdata that receives the nonces
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+   // Benchmark mode overrides the top target word; nothing is submitted in that mode.
+   if ( bench )  ptarget[7] = 0x0fff;
+   // Byte-swapped pdata[1..2] is the input from which the hash order string is derived.
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+   // Recompute the hash order only when the byte-swapped pdata[17] differs from the cached one.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32(pdata[17]);
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )   // only thread 0 logs
+         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+   // The first character of the order string is a hex digit selecting the first algorithm.
+   // Do midstate prehash on hash functions with block size <= 64 bytes.
+   const char elem = x16r_hash_order[0];
+   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+   switch ( algo )
+   {
+      case JH:
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         jh512_4way_init( &x16rv2_ctx.jh );
+         jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
+      break;
+      case KECCAK:   // these three cases absorb the first 64 bytes into the tiger context
+      case LUFFA:
+      case SHA_512:
+         mm128_bswap32_80( edata, pdata );
+         sph_tiger_init( &x16rv2_ctx.tiger );
+         sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+      break;
+      case SKEIN:
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         skein512_4way_prehash64( &x16r_ctx.skein, vdata );   // NOTE(review): x16r_ctx here, x16rv2_ctx in every other case; confirm intended
+      break;
+      case CUBEHASH:
+         mm128_bswap32_80( edata, pdata );
+         cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
+         cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+      break;
+      case HAMSI:
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         hamsi512_4way_init( &x16rv2_ctx.hamsi );
+         hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
+      break;
+      case SHABAL:   // hashed from a 4x32 layout, then vdata is rebuilt as 4x64
+         mm128_bswap32_intrlv80_4x32( vdata32, pdata );
+         shabal512_4way_init( &x16rv2_ctx.shabal );
+         shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
+         rintrlv_4x32_4x64( vdata, vdata32, 640 );
+      break;
+      case WHIRLPOOL:
+         mm128_bswap32_80( edata, pdata );
+         sph_whirlpool_init( &x16rv2_ctx.whirlpool );
+         sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+      break;
+      default:   // no midstate: only build the interleaved input
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   }
+   // n..n+3 sit in the odd 32-bit elements; presumably the blend keeps vdata's even elements.
+   *noncev = mm256_intrlv_blend_32(
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
+   // Hash four nonces per pass until last_nonce is reached or a restart is flagged.
+   do
+   {
+      if ( x16rv2_4way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )   // 8 words per lane
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );   // +4 in the upper 32 bits of each 64-bit element
+      n += 4;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;   // hand the next unscanned nonce back to the caller
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 #endif
--- a/algo/x16/x16rv2.c
+++ b/algo/x16/x16rv2.c
@@ -43,16 +43,9 @@ inline void padtiger512(uint32_t* hash) {
   for (int i = (24/4); i < (64/4); i++) hash[i] = 0;
 }

-// no prehash
-int x16rv2_prehash( const struct work *work )
+int x16rv2_hash( void* output, const void* input, int thrid )
 {
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   return 1;
-}
-
-int x16rv2_hash( void* output, const void* input, const int thrid )
-{
-   uint32_t _ALIGN(32) hash[16];
+   uint32_t _ALIGN(128) hash[16];
   x16rv2_context_overlay ctx;
   void *in = (void*) input;
   int size = 80;
@@ -177,4 +170,52 @@ int x16rv2_hash( void* output, const void* input, const int thrid )
   return 1;
 }

+int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scalar x16rv2 scan: one nonce per pass from pdata[19] up to max_nonce; always returns 0.
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const int thr_id = mythr->id;  
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+   // Copy the 80-byte header into edata through mm128_bswap_32, five 128-bit chunks.
+   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   // Cache key: compare and store the same byte-swapped value, as the 4-way/8-way scanners do.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = swab32(pdata[17]);
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)",
+                                 x16r_hash_order, ntime );
+   }
+   // Benchmark mode overrides the top target word; nothing is submitted in that mode.
+   if ( bench )   ptarget[7] = 0x0cff;
+
+   do
+   {
+      edata[19] = nonce;   // nonce is written as-is into the already swapped header
+      if ( x16rv2_hash( hash32, edata, thr_id ) )
+      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( nonce );
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;   // hand the next unscanned nonce back to the caller
+   *hashes_done = pdata[19] - first_nonce;
+   return 0;
+}
+
 #endif
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -30,7 +30,7 @@ union _x21s_8way_context_overlay

 typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;

-int x21s_8way_hash( void* output, const void* input, const int thrid )
+int x21s_8way_hash( void* output, const void* input, int thrid )
 {
   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
   uint8_t shash[64*8] __attribute__ ((aligned (64)));
@@ -129,6 +129,66 @@ int x21s_8way_hash( void* output, const void* input, const int thrid )
   return 1;
 }

+int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{  // 8-lane x21s scan: hashes nonces n..n+7 per pass; always returns 0.
+   uint32_t hash[16*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &hash[7<<3];   // hash[56..63]; presumably word 7 of each lane in an 8x32 layout
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];   // NOTE(review): read before the benchmark override below
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 16;   // NOTE(review): step is 8 but margin is 16; confirm intended
+   const int thr_id = mythr->id;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; 512-bit row of vdata that receives the nonces
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+   // Benchmark mode overrides the top target word; nothing is submitted in that mode.
+   if ( bench )   ptarget[7] = 0x0cff;
+   // Byte-swapped pdata[1..2] is the input from which the hash order string is derived.
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+   // Recompute the hash order only when the byte-swapped pdata[17] differs from the cached one.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )   // only thread 0 logs
+              applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+   // vdata is filled by the shared x16r 8-way prehash before the nonces are inserted.
+   x16r_8way_prehash( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
+                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );   // n..n+7 in the odd 32-bit elements
+   do
+   {
+      if ( x21s_8way_hash( hash, vdata, thr_id ) )
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash7[lane] <= Htarg ) )   // cheap pre-filter before extracting the lane
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {
+             pdata[19] = bswap_32( n + lane );
+             submit_solution( work, lane_hash, mythr );
+         }
+      }
+      *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );   // +8 in the upper 32 bits of each 64-bit element
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;   // hand the next unscanned nonce back to the caller
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 bool x21s_8way_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
@@ -155,7 +215,7 @@ union _x21s_4way_context_overlay

 typedef union _x21s_4way_context_overlay x21s_4way_context_overlay;

-int x21s_4way_hash( void* output, const void* input, const int thrid )
+int x21s_4way_hash( void* output, const void* input, int thrid )
 {
   uint32_t vhash[16*4] __attribute__ ((aligned (64)));
   uint8_t  shash[64*4] __attribute__ ((aligned (64)));
@@ -231,6 +291,58 @@ int x21s_4way_hash( void* output, const void* input, const int thrid )
   return 1;
 }

+int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{  // 4-lane x21s scan: hashes nonces n..n+3 per pass; always returns 0.
+   uint32_t hash[16*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id; 
+   const bool bench = opt_benchmark;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned; 256-bit row of vdata that receives the nonces
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   // Benchmark mode overrides the top target word; nothing is submitted in that mode.
+   if ( bench )  ptarget[7] = 0x0cff;
+   // Byte-swapped pdata[1..2] is the input from which the hash order string is derived.
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+   // Recompute the hash order only when the byte-swapped pdata[17] differs from the cached one.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )   // only thread 0 logs
+              applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+   // vdata is filled by the shared x16r 4-way prehash before the nonces are inserted.
+   x16r_4way_prehash( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32(
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );   // n..n+3 in the odd 32-bit elements
+   do
+   {
+      if ( x21s_4way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )   // 8 words per lane
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );   // +4 in the upper 32 bits of each 64-bit element
+      n += 4;
+   } while ( likely( (  n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;   // hand the next unscanned nonce back to the caller
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 bool x21s_4way_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
--- a/algo/x16/x21s.c
+++ b/algo/x16/x21s.c
@@ -27,7 +27,7 @@ union _x21s_context_overlay
 };
 typedef union _x21s_context_overlay x21s_context_overlay;

-int x21s_hash( void* output, const void* input, const int thrid )
+int x21s_hash( void* output, const void* input, int thrid )
 {
   uint32_t _ALIGN(128) hash[16];
   x21s_context_overlay ctx;
@@ -57,6 +57,50 @@ int x21s_hash( void* output, const void* input, const int thrid )
   return 1;
 }

+int scanhash_x21s( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scalar x21s scan: one nonce per pass from pdata[19] up to max_nonce; always returns 0.
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const int thr_id = mythr->id;
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+   if ( bench )  ptarget[7] = 0x0cff;   // benchmark target; nothing is submitted in that mode
+
+   mm128_bswap32_80( edata, pdata );
+   // Cache key: compare and store the same byte-swapped value, as the 4-way/8-way scanners do.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = swab32(pdata[17]);
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+          applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+
+   x16r_prehash( edata, pdata );
+
+   do
+   {
+      edata[19] = nonce;   // nonce is written as-is into the already swapped header
+      if ( x21s_hash( hash32, edata, thr_id ) )
+      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( nonce );
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;   // hand the next unscanned nonce back to the caller
+   *hashes_done = pdata[19] - first_nonce;
+   return 0;
+}
+
 bool x21s_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
--- a/algo/yespower/yespower-gate.c
+++ b/algo/yespower/yespower-gate.c
@@ -31,26 +31,8 @@

 yespower_params_t yespower_params;

-// master g_work 
-sha256_context yespower_sha256_prehash_ctx;
-uint32_t _ALIGN(64) yespower_endiandata[20];
-
-// local work
 __thread sha256_context sha256_prehash_ctx;

-
-int yespower_sha256_prehash( struct work *work )
-{
-   uint32_t *pdata = work->data;
-
-   for ( int k = 0; k < 19; k++ )
-      be32enc( &yespower_endiandata[k], pdata[k] );
-
-   sha256_ctx_init( &yespower_sha256_prehash_ctx );
-   sha256_update( &yespower_sha256_prehash_ctx, yespower_endiandata, 64 );
-
-   return 1;
-}
 // YESPOWER

 int yespower_hash( const char *input, char *output, uint32_t len, int thrid )
@@ -71,15 +53,14 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;

-//   pthread_rwlock_rdlock( &g_work_lock );
-
-   memcpy( endiandata, yespower_endiandata, sizeof endiandata );
-   memcpy( &sha256_prehash_ctx, &yespower_sha256_prehash_ctx, sizeof sha256_prehash_ctx );
-
-//   pthread_rwlock_unlock( &g_work_lock );
-
+   for ( int k = 0; k < 19; k++ )
+      be32enc( &endiandata[k], pdata[k] );
   endiandata[19] = n;

+   // do sha256 prehash
+   sha256_ctx_init( &sha256_prehash_ctx );
+   sha256_update( &sha256_prehash_ctx, endiandata, 64 );
+
   do {
      if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) )
      if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
@@ -159,7 +140,6 @@ bool register_yespower_algo( algo_gate_t* gate )

  gate->optimizations = SSE2_OPT | SHA_OPT;
  gate->scanhash      = (void*)&scanhash_yespower;
-  gate->prehash       = (void*)&yespower_sha256_prehash;
  gate->hash          = (void*)&yespower_hash;
  opt_target_factor = 65536.0;
  return true;
@@ -174,7 +154,6 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
  yespower_params.perslen = 0;
  gate->optimizations = SSE2_OPT | SHA_OPT;
  gate->scanhash      = (void*)&scanhash_yespower;
-  gate->prehash       = (void*)&yespower_sha256_prehash;
  gate->hash          = (void*)&yespower_hash;
  opt_target_factor = 65536.0;
  return true;
@@ -186,7 +165,6 @@ bool register_yescrypt_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->prehash       = (void*)&yespower_sha256_prehash;
   yespower_params.version = YESPOWER_0_5;
   opt_target_factor = 65536.0;

@@ -220,7 +198,6 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->prehash       = (void*)&yespower_sha256_prehash;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 2048;
   yespower_params.r       = 8;
@@ -234,7 +211,6 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->prehash       = (void*)&yespower_sha256_prehash;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 4096;
   yespower_params.r       = 16;
@@ -248,7 +224,6 @@ bool register_yescryptr32_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->prehash       = (void*)&yespower_sha256_prehash;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 4096;
   yespower_params.r       = 32;
--- a/algo/yespower/yespower.h
+++ b/algo/yespower/yespower.h
@@ -80,8 +80,6 @@ extern yespower_params_t yespower_params;

 extern __thread sha256_context sha256_prehash_ctx;

-int yespower_sha256_prehash( struct work *work );
-
 /**
 * yespower_init_local(local):
 * Initialize the thread-local (RAM) data structure.  Actual memory allocation
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.3.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.3.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.21.3'
-PACKAGE_STRING='cpuminer-opt 3.21.3'
+PACKAGE_VERSION='3.21.3.1'
+PACKAGE_STRING='cpuminer-opt 3.21.3.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.21.3 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.21.3.1 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1432,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.21.3:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.21.3.1:";;
   esac
  cat <<\_ACEOF

@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.21.3
+cpuminer-opt configure 3.21.3.1
 generated by GNU Autoconf 2.71

 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.21.3, which was
+It was created by cpuminer-opt $as_me 3.21.3.1, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.21.3'
+ VERSION='3.21.3.1'


 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.21.3, which was
+This file was extended by cpuminer-opt $as_me 3.21.3.1, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 3.21.3
+cpuminer-opt config.status 3.21.3.1
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.21.3])
+AC_INIT([cpuminer-opt], [3.21.3.1])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -37,7 +37,6 @@
 #include <curl/curl.h>
 #include <jansson.h>
 #include <openssl/sha.h>
-#include <mm_malloc.h>
 #include "sysinfos.c"
 #include "algo/sha/sha256d.h"

@@ -318,9 +317,8 @@ static void affine_to_cpu( struct thr_info *thr )
   if ( !ok )
   {
      last_error = GetLastError();
-      if ( !thread )
-      applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
-                           last_error, thread );
+      applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
+                           thread, last_error );
   }
 }   

@@ -1727,9 +1725,9 @@ static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
   struct work *ret_work;
   int failures = 0;

-   ret_work = (struct work*) _mm_malloc( sizeof(*ret_work), 32 );
-   if ( !ret_work )  return false;
-   memset( ret_work, 0, sizeof(*ret_work) );
+   ret_work = (struct work*) calloc( 1, sizeof(*ret_work) );
+   if ( !ret_work )
+	return false;

   /* obtain new work from bitcoin via JSON-RPC */
   while ( !get_upstream_work( curl, ret_work ) )
@@ -1738,23 +1736,22 @@ static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
      {
         applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
         free( ret_work );
-         return false;
+	      return false;
      }

      /* pause, then restart work-request loop */
-      applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
-              opt_fail_pause );
+	   applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
+		        opt_fail_pause );
      sleep( opt_fail_pause );
   }

   /* send work to requesting thread */
   if ( !tq_push(wc->thr->q, ret_work ) )
-      free( ret_work );
+   	free( ret_work );

   return true;
 }

-
 static bool workio_submit_work(struct workio_cmd *wc, CURL *curl)
 {
   int failures = 0;
@@ -1973,15 +1970,15 @@ static bool wanna_mine(int thr_id)
 		float temp = cpu_temp(0);
 		if (temp > opt_max_temp)
      {
-         if ( !thr_id && !conditional_state[thr_id] && !opt_quiet )
-           applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
-         state = false;
+			if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
+				applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp);
+			state = false;
 		}
 	}
 	if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
   {
 		if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
-			applog(LOG_NOTICE, "network diff too high, waiting...");
+			applog(LOG_INFO, "network diff too high, waiting...");
 		state = false;
 	}
 	if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate)
@@ -1990,14 +1987,12 @@ static bool wanna_mine(int thr_id)
      {
 			char rate[32];
 			format_hashrate(opt_max_rate, rate);
-			applog(LOG_NOTICE, "network hashrate too high (%s), waiting...", rate);
+			applog(LOG_INFO, "network hashrate too high, waiting %s...", rate);
 		}
 		state = false;
 	}
-  
-   if ( conditional_state[thr_id] && state && !thr_id && !opt_quiet )
-      applog(LOG_NOTICE, "...resuming" );
-	conditional_state[thr_id] = (uint8_t) !state;
+	if (thr_id < MAX_CPUS)
+		conditional_state[thr_id] = (uint8_t) !state;
 	return state;
 }

@@ -2122,10 +2117,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
         t++ );

   g_work_time = time(NULL);
-
-   // Do midstate prehash
-   algo_gate.prehash( g_work );
-
   restart_threads();

   pthread_mutex_unlock( &sctx->work_lock );
@@ -2149,7 +2140,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   else if ( g_work->job_id && new_job )
      applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
                         sctx->block_height, net_diff, g_work->job_id );
-   else if ( opt_debug )
+   else if ( !opt_quiet )
   {
      unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
                                             g_work->xnonce2_len );
@@ -2344,9 +2335,6 @@ static void *miner_thread( void *userdata )
 		             goto out;
 	             }
                g_work_time = time(NULL);
-
-                // do midstate prehash
-                algo_gate.prehash( &g_work );
                restart_threads();
             }

@@ -2366,14 +2354,6 @@ static void *miner_thread( void *userdata )
       if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
          continue;

-       // conditional mining
-       if ( unlikely( !wanna_mine( thr_id ) ) )
-       {
-          restart_threads();
-          sleep(5);
-          continue;
-       }
-       
       // opt_scantime expressed in hashes
       max64 = opt_scantime * thr_hashrates[thr_id];

@@ -2520,6 +2500,14 @@ static void *miner_thread( void *userdata )
             }
          }
       }  // benchmark
+
+       // conditional mining
+       if ( unlikely( !wanna_mine( thr_id ) ) )
+       {
+          sleep(5);
+          continue;
+       }
+
   }  // miner_thread loop

 out:
@@ -3694,7 +3682,7 @@ int main(int argc, char *argv[])

 #if defined(WIN32)

-// Get the number of cpus, display after parsing command line
+// Are Windows CPU Groups supported?
 #if defined(WINDOWS_CPU_GROUPS_ENABLED)
 	num_cpus = 0;
 	num_cpugroups = GetActiveProcessorGroupCount();
@@ -3703,8 +3691,8 @@ int main(int argc, char *argv[])
 	   int cpus = GetActiveProcessorCount( i );
 	   num_cpus += cpus;

-//	   if (opt_debug)
-//         applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
+	   if (opt_debug)
+         applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
 	}

 #else
@@ -3721,7 +3709,7 @@ int main(int argc, char *argv[])
 	sysctl(req, 2, &num_cpus, &len, NULL, 0);
 #else
 	num_cpus = 1;
-#endif 
+#endif

   if ( num_cpus < 1 )
      num_cpus = 1;
@@ -3873,11 +3861,6 @@ int main(int argc, char *argv[])
 	}
 #endif

-#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
-      if ( !opt_quiet )
-         applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
-#endif
-   
   if ( opt_affinity && num_cpus > max_cpus )
   {
      applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
@@ -3889,7 +3872,7 @@ int main(int argc, char *argv[])
   {
      for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
      {
-         while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;   
+         while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;   
         thread_affinity_map[ thr ] = cpu % num_cpus;
      }
      if ( !opt_quiet )
--- a/miner.h
+++ b/miner.h
@@ -392,8 +392,8 @@ float cpu_temp( int core );

 struct work
 {
-   uint32_t data[48] __attribute__ ((aligned (64)));
-   uint32_t target[8] __attribute__ ((aligned (32)));
+   uint32_t target[8] __attribute__ ((aligned (64)));
+	uint32_t data[48] __attribute__ ((aligned (64)));
 	double targetdiff;
 	double sharediff;
   double stratum_diff;
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -470,7 +470,7 @@ static inline void mm128_intrlv_4x32x( void *dst, void *src0, void  *src1,

 #if defined(__SSSE3__)

-static inline void mm128_bswap32_80( void *d, const void *s )
+static inline void mm128_bswap32_80( void *d, void *s )
 {
  __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
  casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
@@ -482,7 +482,7 @@ static inline void mm128_bswap32_80( void *d, const void *s )

 #else

-static inline void mm128_bswap32_80( void *d, const void *s )
+static inline void mm128_bswap32_80( void *d, void *s )
 {
  ( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] );
  ( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] );
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -385,7 +385,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_var_32( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )

-/* Not used
+//
 // Limited 2 input shuffle, combines shuffle with blend. The destination low
 // half is always taken from v1, and the high half from v2.
 #define mm128_shuffle2_64( v1, v2, c ) \
@@ -395,7 +395,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_shuffle2_32( v1, v2, c ) \
   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
                                     _mm_castsi128_ps( v2 ), c ) ); 
-*/

 //
 // Rotate vector elements accross all lanes
@@ -407,7 +406,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_shuflr_32( v )   _mm_shuffle_epi32( v, 0x39 )
 #define mm128_shufll_32( v )   _mm_shuffle_epi32( v, 0x93 )

-/* Not used
 #if defined(__SSSE3__)

 // Rotate right by c bytes, no SSE2 equivalent.
@@ -415,7 +413,6 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 { return _mm_alignr_epi8( v, v, c ); }

 #endif
-*/

 // Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
 // for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
@@ -558,25 +555,68 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
   v2 = _mm_xor_si128( v1, v2 ); \
   v1 = _mm_xor_si128( v1, v2 );

-// Concatenate { hi, lo }, rotate right by c elements and return low 128 bits.
+
+// alignr for 32 & 64 bit elements is only available with AVX512 but
+// emulated here. Shift argument is not needed, it's always 1.
+// Behaviour is otherwise consistent with Intel alignr intrinsics.
+
 #if defined(__SSSE3__)

-// _mm_alignr_epi32 & _mm_alignr_epi64 are only available with AVX512VL but
-// are emulated here using _mm_alignr_epi8. There are no fast equivalents for
-// 256 bit vectors, though there is no for this functionality.
-
-#define mm128_alignr_64( hi, lo, c )    _mm_alignr_epi8( hi, lo, (c)*8 )
-#define mm128_alignr_32( hi, lo, c )    _mm_alignr_epi8( hi, lo, (c)*4 )
+#define mm128_alignr_64( v1, v2 )    _mm_alignr_epi8( v1, v2, 8 )
+#define mm128_alignr_32( v1, v2 )    _mm_alignr_epi8( v1, v2, 4 )

 #else

-#define mm128_alignr_64( hi, lo, c ) \
-   _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
+#define mm128_alignr_64( v1, v2 )    _mm_or_si128( _mm_slli_si128( v1, 8 ), \
+                                                   _mm_srli_si128( v2, 8 ) )

-#define mm128_alignr_32( hi, lo, c ) \
-   _mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
+#define mm128_alignr_32( v1, v2 )    _mm_or_si128( _mm_slli_si128( v1, 12 ), \
+                                                   _mm_srli_si128( v2, 4 ) ) /* == alignr_epi8( v1, v2, 4 ): v1 << (16-4) bytes | v2 >> 4 bytes */

 #endif

+// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
+// vrol & vror are deprecated and do not exist for larger vectors.
+// Their only use is by lyra2 blake2b when AVX2 is not available and is
+// grandfathered.
+
+#if defined(__SSSE3__)
+
+#define mm128_vror256_64( v1, v2 ) \
+do { \
+   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
+           v1 = _mm_alignr_epi8( v2, v1, 8 ); \
+           v2 = t; \
+} while(0)
+
+#define mm128_vrol256_64( v1, v2 ) \
+do { \
+   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
+           v2 = _mm_alignr_epi8( v2, v1, 8 ); \
+           v1 = t; \
+} while(0)
+
+#else  // SSE2
+
+#define mm128_vror256_64( v1, v2 ) \
+do { \
+   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
+                              _mm_slli_si128( v2, 8 ) ); \
+           v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
+                              _mm_slli_si128( v1, 8 ) ); \
+           v1 = t; \
+} while(0)
+
+#define mm128_vrol256_64( v1, v2 ) \
+do { \
+   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
+                              _mm_srli_si128( v2, 8 ) ); \
+           v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
+                              _mm_srli_si128( v1, 8 ) ); \
+           v1 = t; \
+} while(0)
+
+#endif  // SSSE3 else SSE2
+
 #endif // __SSE2__
 #endif // SIMD_128_H__
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -239,8 +239,8 @@ static inline __m256i mm256_not( const __m256i v )

 // Mask making
 // Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
-// Create a 64 or 32 bit integer mask from MSB of 64 or 32 bit elements.
-// Effectively a sign test: if (mask[n]) then -1 else  0.
+// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
+// Effectively a sign test.

 #define mm256_movmask_64( v ) \
   _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
@@ -348,7 +348,7 @@ static inline __m256i mm256_not( const __m256i v )
   _mm256_or_si256( _mm256_slli_epi16( v, c ), \
                    _mm256_srli_epi16( v, 16-(c) ) )

-// Deprecated. Obsolete sm3, the only user, is grandfathered.
+// Deprecated.
 #define mm256_rol_var_32( v, c ) \
   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                    _mm256_srli_epi32( v, 32-(c) ) )
@@ -391,7 +391,6 @@ static inline __m256i mm256_shufll_32( const __m256i v )
 //
 // Rotate elements within each 128 bit lane of 256 bit vector.

-/* Not used
 // Limited 2 input shuffle
 #define mm256_shuffle2_64( v1, v2, c ) \
   _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
@@ -400,7 +399,6 @@ static inline __m256i mm256_shufll_32( const __m256i v )
 #define mm256_shuffle2_32( v1, v2, c ) \
   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
                                           _mm256_castsi256_ps( v2 ), c ) ); 
-*/

 #define mm256_swap128_64( v )  _mm256_shuffle_epi32( v, 0x4e )
 #define mm256_shuflr128_64 mm256_swap128_64
@@ -513,8 +511,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 } while(0)

 // swap 256 bit vectors in place.
-// Deprecated, Shabal is the only user and it should be modified to reorder
-// instructions.
+// This should be avoided, it's more efficient to switch references.
 #define mm256_swap512_256( v1, v2 ) \
   v1 = _mm256_xor_si256( v1, v2 ); \
   v2 = _mm256_xor_si256( v1, v2 ); \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -409,20 +409,19 @@ static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
 static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 { return _mm512_alignr_epi32( v, v, n ); }

-/* Not used
 #define mm512_shuflr_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x0000001F001E001D, 0x001C001B001A0019, \
-                       0x0018001700160015, 0x0014001300120011, \
-                       0x0010000F000E000D, 0x000C000B000A0009, \
-                       0x0008000700060005, 0x0004000300020001 ), v )
+                       0X0018001700160015, 0X0014001300120011, \
+                       0X0010000F000E000D, 0X000C000B000A0009, \
+                       0X0008000700060005, 0X0004000300020001 ), v )

 #define mm512_shufll_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x001E001D001C001B, 0x001A001900180017, \
-                       0x0016001500140013, 0x001200110010000F, \
-                       0x000E000D000C000B, 0x000A000900080007, \
-                       0x0006000500040003, 0x000200010000001F ), v )
+                       0X0016001500140013, 0X001200110010000F, \
+                       0X000E000D000C000B, 0X000A000900080007, \
+                       0X0006000500040003, 0X000200010000001F ), v )

 #define mm512_shuflr_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
@@ -437,7 +436,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
                       0x2E2D2C2B2A292827, 0x262524232221201F, \
                       0x1E1D1C1B1A191817, 0x161514131211100F, \
                       0x0E0D0C0B0A090807, 0x060504030201003F ) )
-*/

 // 256 bit lanes used only by lyra2, move these there
 // Rotate elements within 256 bit lanes of 512 bit vector.
@@ -451,7 +449,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuflr256_64( v )     _mm512_permutex_epi64( v, 0x39 )
 #define mm512_shufll256_64( v )     _mm512_permutex_epi64( v, 0x93 )

-/* Not used
+/*
 // Rotate 256 bit lanes by one 32 bit element
 #define mm512_shuflr256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
@@ -498,7 +496,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 //
 // Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 
-/* Not used
 // Limited 2 input, 1 output shuffle, combines shuffle with blend.
 // Like most shuffles it's limited to 128 bit lanes and like some shuffles
 // destination elements must come from a specific source arg. 
@@ -509,10 +506,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuffle2_32( v1, v2, c ) \
   _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
                                           _mm512_castsi512_ps( v2 ), c ) ); 
-*/

-// These hard coded shuffles exist for consistency with AVX2 & SSE2 where
-// efficient generic versions don't exist.
 // Swap 64 bits in each 128 bit lane
 #define mm512_swap128_64( v )   _mm512_shuffle_epi32( v, 0x4e )
 #define mm512_shuflr128_64  mm512_swap128_64
@@ -522,11 +516,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuflr128_32( v )    _mm512_shuffle_epi32( v, 0x39 )
 #define mm512_shufll128_32( v )    _mm512_shuffle_epi32( v, 0x93 )

-/* Not used
-// Rotate right 128 bit lanes by c bytes, efficient generic version of above.
+// Rotate each 128 bit lane right by c bytes, versatile and just as fast as the hard coded shuffles above
 static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 {  return _mm512_alignr_epi8( v, v, c ); }
-*/

 // Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
 // can be done with ror & rol. Defined only for convenience and consistency
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -17,7 +17,6 @@ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
 # used by GCC
 export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
 # Support for Windows 7 CPU groups, AES sometimes not included in -march
-# Disabled due to CPU group incompatibilities between Intel and AMD CPU. 
 export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
 export DEFAULT_CFLAGS_OLD="-O3 -Wall"

@@ -46,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
 ./clean-all.sh || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-march=icelake-client $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -54,7 +53,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
 # AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake 
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512.exe
@@ -62,7 +61,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
 # AVX2 SHA VAES: Intel Alderlake, AMD Zen3
 make clean || echo done
 rm -f config.status
-CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
@@ -70,7 +69,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
 # AVX2 AES SHA: AMD Zen1
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=znver1 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2-sha.exe
@@ -78,7 +77,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
 # AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe