v3.12.4.3

v3.12.4.2
v3.12.4.1
2025-09-17 23:44:27 +00:00 · 2020-02-24 21:35:19 -05:00 · 2020-02-23 15:31:06 -05:00 · 2020-02-22 18:06:39 -05:00 · 2020-02-21 16:34:53 -05:00 · 2020-02-18 12:05:47 -05:00
58 changed files with 3637 additions and 4227 deletions
--- a/README.md
+++ b/README.md
@@ -12,10 +12,24 @@ a false positive, they are flagged simply because they are cryptocurrency
 miners. The source code is open for anyone to inspect. If you don't trust 
 the software, don't use it.

+
+New thread:
+
+https://bitcointalk.org/index.php?topic=5226770.msg53865575#msg53865575
+
+Old thread:
+
 https://bitcointalk.org/index.php?topic=1326803.0

 mailto://jayddee246@gmail.com

+This note is to confirm that bitcointalk users JayDDee and joblo are the
+same person.
+
+I created a new BCT user JayDDee to match my github user id.
+The old thread has been locked but still contains useful information for
+reading.
+
 See file RELEASE_NOTES for change log and INSTALL_LINUX or INSTALL_WINDOWS
 for compile instructions.

--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,53 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v3.12.4.3
+
+Fixed segfault in new block log for getwork.
+
+Disabled silent discarding of stale work after the submit is logged.
+
+v3.12.4.2
+
+Issue #245: fixed getwork stale shares, solo mining with getwork now works.
+
+Issue #246: implemented block and summary logs for getwork.
+
+v3.12.4.1
+
+Issue #245: fix scantime when mining solo with getwork.
+
+Added debug logs for creation of stratum and longpoll threads, use -D to
+enable.
+
+v3.12.4
+
+Issue #244: Change longpoll to ignore job id.
+
+Lyra2rev2 AVX2 +3%, AVX512 +6%.
+
+v3.12.3.1
+
+Issue #241: Fixed regression that broke coinbase address in v3.11.7.
+
+v3.12.3
+
+Issue #238: Fixed skunk AVX2.
+
+Issue #239: Faster AVX2 & AVX512 for skein +44%, skein2 +30%, plus marginal
+increases for skunk, x16r, x16rv2, x16rt, x16rt-veil, x16s, x21s.
+
+Faster anime VAES +57%, AVX512 +21%, AVX2 +3%.
+
+Redesigned code responsible for #236.
+
+v3.12.2
+
+Fixed xevan, skein, skein2 AVX2, #238.
+
+Reversed polarity of AVX2 vector bit test utilities, and all users, to be
+logically and semantically correct. Follow up to issue #236. 
+
 v3.12.1

 Fixed anime AVX2 low difficulty shares, git issue #236.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -281,39 +281,37 @@ void exec_hash_function( int algo, void *output, const void *pdata )
 const char* const algo_alias_map[][2] =
 {
 //   alias                proper
-  { "argon2d-crds",      "argon2d250"   },
-  { "argon2d-dyn",       "argon2d500"   },
-  { "argon2d-uis",       "argon2d4096"  },
-  { "bcd",               "x13bcd"       },
-  { "bitcore",           "timetravel10" },
-  { "bitzeny",           "yescryptr8"   },
-  { "blake256r8",        "blakecoin"    },
-  { "blake256r8vnl",     "vanilla"      },
-  { "blake256r14",       "blake"        },
-  { "blake256r14dcr",    "decred"       },
-  { "cryptonote",        "cryptonight"  },
-  { "cryptonight-light", "cryptolight"  },
-  { "diamond",           "dmd-gr"       },
-  { "droplp",            "drop"         },
-  { "espers",            "hmq1725"      },
-  { "flax",              "c11"          },
-  { "hsr",               "x13sm3"       },
-  { "jackpot",           "jha"          },
-  { "jane",              "scryptjane"   }, 
-  { "lyra2",             "lyra2re"      },
-  { "lyra2v2",           "lyra2rev2"    },
-  { "lyra2v3",           "lyra2rev3"    },
-  { "myrgr",             "myr-gr"       },
-  { "myriad",            "myr-gr"       },
-  { "neo",               "neoscrypt"    },
-  { "phi",               "phi1612"      },
-  { "sib",               "x11gost"      },
-  { "timetravel8",       "timetravel"   },
-  { "veil",              "x16rt-veil"   },
-  { "x16r-hex",          "hex"          },
-  { "yenten",            "yescryptr16"  },
-  { "ziftr",             "zr5"          },
-  { NULL,                NULL           }   
+  { "argon2d-crds",      "argon2d250"     },
+  { "argon2d-dyn",       "argon2d500"     },
+  { "argon2d-uis",       "argon2d4096"    },
+  { "bcd",               "x13bcd"         },
+  { "bitcore",           "timetravel10"   },
+  { "bitzeny",           "yescryptr8"     },
+  { "blake256r8",        "blakecoin"      },
+  { "blake256r8vnl",     "vanilla"        },
+  { "blake256r14",       "blake"          },
+  { "blake256r14dcr",    "decred"         },
+  { "diamond",           "dmd-gr"         },
+  { "espers",            "hmq1725"        },
+  { "flax",              "c11"            },
+  { "hsr",               "x13sm3"         },
+  { "jackpot",           "jha"            },
+  { "jane",              "scryptjane"     }, 
+  { "lyra2",             "lyra2re"        },
+  { "lyra2v2",           "lyra2rev2"      },
+  { "lyra2v3",           "lyra2rev3"      },
+  { "myrgr",             "myr-gr"         },
+  { "myriad",            "myr-gr"         },
+  { "neo",               "neoscrypt"      },
+  { "phi",               "phi1612"        },
+  { "scryptn2",          "scrypt:1048576" },
+  { "sib",               "x11gost"        },
+  { "timetravel8",       "timetravel"     },
+  { "veil",              "x16rt-veil"     },
+  { "x16r-hex",          "hex"            },
+  { "yenten",            "yescryptr16"    },
+  { "ziftr",             "zr5"            },
+  { NULL,                NULL             }   
 };

 // if arg is a valid alias for a known algo it is updated with the proper
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -138,7 +138,7 @@ void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );

 #if defined(__AVX2__)

-// BMW-512 4 way 64
+// BMW-512 64 bit 4 way

 typedef struct {
   __m256i buf[16];
@@ -149,7 +149,6 @@ typedef struct {

 typedef bmw_4way_big_context bmw512_4way_context;

-
 void bmw512_4way_init(void *cc);

 void bmw512_4way_update(void *cc, const void *data, size_t len);
@@ -164,6 +163,7 @@ void bmw512_4way_addbits_and_close(

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+// BMW-512 64 bit 8 way
 typedef struct {
   __m512i buf[16];
   __m512i H[16];
@@ -171,6 +171,8 @@ typedef struct {
   uint64_t bit_count;
 } bmw512_8way_context __attribute__((aligned(128)));

+void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
+                         size_t len );
 void bmw512_8way_init( bmw512_8way_context *ctx );
 void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
                         size_t len );
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -1507,6 +1507,93 @@ void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
      casti_m512i( dst, u ) = h1[ v ];
 }

+void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
+                                size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf = ctx->buf;
+   __m512i htmp[16];
+   __m512i *H = ctx->H;
+   __m512i *h2 = htmp;
+   uint64_t bit_count = len * 8;
+   size_t ptr = 0;
+   const int buf_size = 128;  // bytes of one lane, compatible with len
+
+// Init
+
+   H[ 0] = m512_const1_64( 0x8081828384858687 );
+   H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
+   H[ 2] = m512_const1_64( 0x9091929394959697 );
+   H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
+   H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
+   H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
+   H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
+   H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
+   H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
+   H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
+   H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
+   H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
+   H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
+   H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
+   H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
+   H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
+
+// Update
+
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_512( buf + (ptr>>3), vdata, clen >> 3 );
+      vdata = vdata + (clen>>3);
+      len -= clen;
+      ptr += clen;
+      if ( ptr == buf_size )
+      {
+         __m512i *ht;
+         compress_big_8way( buf, H, h2 );
+         ht = H;
+         H = h2;
+         h2 = ht;
+         ptr = 0;
+      }
+   }
+   if ( H != ctx->H )
+      memcpy_512( ctx->H, H, 16 );
+
+// Close   
+{
+   __m512i h1[16], h2[16];
+   size_t u, v;
+
+   buf[ ptr>>3 ] = m512_const1_64( 0x80 );
+   ptr += 8;
+
+   if (  ptr > (buf_size - 8) )
+   {
+      memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      compress_big_8way( buf, H, h1 );
+      ptr = 0;
+      H = h1;
+   }
+   memset_zero_512( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
+   buf[ (buf_size - 8) >> 3 ] = _mm512_set1_epi64( bit_count );
+   compress_big_8way( buf, H, h2 );
+   for ( u = 0; u < 16; u ++ )
+      buf[ u ] = h2[ u ];
+   compress_big_8way( buf, final_b8, h1 );
+   for (u = 0, v = 8; u < 8; u ++, v ++)
+      casti_m512i( out, u ) = h1[ v ];
+}
+
+
+
+}   
+
+
+
 #endif // AVX512

 #ifdef __cplusplus
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -179,14 +179,6 @@ int cube_4way_full( cube_4way_context *sp, void *output,  int hashbitlen,
    sp->rounds    = 16;
    sp->pos       = 0;

-    h[ 0] = m512_const1_128( iv[0] );
-    h[ 1] = m512_const1_128( iv[1] );
-    h[ 2] = m512_const1_128( iv[2] );
-    h[ 3] = m512_const1_128( iv[3] );
-    h[ 4] = m512_const1_128( iv[4] );
-    h[ 5] = m512_const1_128( iv[5] );
-    h[ 6] = m512_const1_128( iv[6] );
-    h[ 7] = m512_const1_128( iv[7] );
    h[ 0] = m512_const1_128( iv[0] );
    h[ 1] = m512_const1_128( iv[1] );
    h[ 2] = m512_const1_128( iv[2] );
@@ -447,14 +439,6 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
    sp->rounds    = 16;
    sp->pos       = 0;

-    h[ 0] = m256_const1_128( iv[0] );
-    h[ 1] = m256_const1_128( iv[1] );
-    h[ 2] = m256_const1_128( iv[2] );
-    h[ 3] = m256_const1_128( iv[3] );
-    h[ 4] = m256_const1_128( iv[4] );
-    h[ 5] = m256_const1_128( iv[5] );
-    h[ 6] = m256_const1_128( iv[6] );
-    h[ 7] = m256_const1_128( iv[7] );
    h[ 0] = m256_const1_128( iv[0] );
    h[ 1] = m256_const1_128( iv[1] );
    h[ 2] = m256_const1_128( iv[2] );
--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -28,6 +28,27 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
 int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
                    const void *data, size_t size );

+int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
+                     const void *data, size_t size );
+// Fixed width aliases; init uses 16 rounds and 32 byte blocks.
+#define cube512_4way_init( sp ) cube_4way_init( sp, 512, 16, 32 )
+#define cube512_4way_update cube_4way_update
+#define cube512_4way_update_close cube_4way_update_close
+#define cube512_4way_close cube_4way_close
+#define cube512_4way_full( sp, output, data, size ) \
+           cube_4way_full( sp, output, 512, data, size )
+#define cube512_4x256_full( sp, output, data, size ) \
+           cube_4x256_full( sp, output, 512, data, size )
+
+#define cube256_4way_init( sp ) cube_4way_init( sp, 256, 16, 32 )
+#define cube256_4way_update cube_4way_update
+#define cube256_4way_update_close cube_4way_update_close
+#define cube256_4way_close cube_4way_close
+#define cube256_4way_full( sp, output, data, size ) \
+           cube_4way_full( sp, output, 256, data, size )
+#define cube256_4x256_full( sp, output, data, size ) \
+           cube_4x256_full( sp, output, 256, data, size )
+
 #endif

 // 2x128, 2 way parallel SSE2
--- a/algo/echo/echo-hash-4way.h
+++ b/algo/echo/echo-hash-4way.h
@@ -22,18 +22,26 @@ typedef struct
 } echo_4way_context __attribute__ ((aligned (64)));

 int echo_4way_init( echo_4way_context *state, int hashbitlen );
-
+#define echo512_4way_init( state ) echo_4way_init( state, 512 )
+#define echo256_4way_init( state ) echo_4way_init( state, 256 )

 int echo_4way_update( echo_4way_context *state, const void *data,
    unsigned int databitlen);
+#define echo512_4way_update echo_4way_update

 int echo_close( echo_4way_context *state, void *hashval );
+#define echo512_4way_close echo_4way_close

 int echo_4way_update_close( echo_4way_context *state, void *hashval,
                              const void *data, int databitlen );
+#define echo512_4way_update_close echo_4way_update_close

 int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
                    const void *data, int datalen );
+#define echo512_4way_full( state, hashval, data, datalen ) \
+           echo_4way_full( state, hashval, 512, data, datalen )
+#define echo256_4way_full( state, hashval, data, datalen ) \
+           echo_4way_full( state, hashval, 256, data, datalen )

 #endif 
 #endif
--- a/algo/fugue/sph_fugue.h
+++ b/algo/fugue/sph_fugue.h
@@ -74,6 +74,14 @@ void sph_fugue512_close(void *cc, void *dst);
 void sph_fugue512_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

+#define sph_fugue512_full( cc, dst, data, len ) \
+do{ \
+   sph_fugue512_init( cc ); \
+   sph_fugue512( cc, data, len ); \
+   sph_fugue512_close( cc, dst ); \
+}while(0)
+
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -94,12 +94,12 @@ bool lyra2rev2_thread_init()
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
-#if defined (LYRA2REV2_8WAY)
+#if defined (LYRA2REV2_16WAY)
   l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 );   // 2 way
-   init_lyra2rev2_8way_ctx();;
-#elif defined (LYRA2REV2_4WAY)
+   init_lyra2rev2_16way_ctx();;
+#elif defined (LYRA2REV2_8WAY)
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
-   init_lyra2rev2_4way_ctx();;
+   init_lyra2rev2_8way_ctx();;
 #else
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
   init_lyra2rev2_ctx();
@@ -109,17 +109,17 @@ bool lyra2rev2_thread_init()

 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV2_8WAY)
+#if defined (LYRA2REV2_16WAY)
+  gate->scanhash  = (void*)&scanhash_lyra2rev2_16way;
+  gate->hash      = (void*)&lyra2rev2_16way_hash;
+#elif defined (LYRA2REV2_8WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev2_8way;
  gate->hash      = (void*)&lyra2rev2_8way_hash;
-#elif defined (LYRA2REV2_4WAY)
-  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
-  gate->hash      = (void*)&lyra2rev2_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  opt_target_factor = 256.0;
  return true;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -51,30 +51,32 @@ bool init_lyra2rev3_ctx();
 //////////////////////////////////

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define LYRA2REV2_8WAY 1
+  #define LYRA2REV2_16WAY 1
 #elif defined(__AVX2__)
-  #define LYRA2REV2_4WAY 1
+  #define LYRA2REV2_8WAY 1
 #endif

 extern __thread uint64_t* l2v2_wholeMatrix;

 bool register_lyra2rev2_algo( algo_gate_t* gate );

-#if defined(LYRA2REV2_8WAY)
+#if defined(LYRA2REV2_16WAY)
+
+void lyra2rev2_16way_hash( void *state, const void *input );
+int scanhash_lyra2rev2_16way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr );
+bool init_lyra2rev2_16way_ctx();
+
+#elif defined(LYRA2REV2_8WAY)

 void lyra2rev2_8way_hash( void *state, const void *input );
 int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev2_8way_ctx();

-#elif defined(LYRA2REV2_4WAY)
-
-void lyra2rev2_4way_hash( void *state, const void *input );
-int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
-                             uint64_t *hashes_done, struct thr_info *mythr );
-bool init_lyra2rev2_4way_ctx();

 #else
+
 void lyra2rev2_hash( void *state, const void *input );
 int scanhash_lyra2rev2( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -7,23 +7,227 @@
 #include "algo/cubehash/cubehash_sse2.h" 
 #include "algo/cubehash/cube-hash-2way.h"

-#if defined (LYRA2REV2_8WAY)
+
+#if defined (LYRA2REV2_16WAY)
+
+typedef struct {
+   blake256_16way_context    blake;
+   keccak256_8way_context    keccak;
+   cubehashParam             cube;
+   skein256_8way_context     skein;
+   bmw256_16way_context      bmw;
+} lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));
+
+static lyra2v2_16way_ctx_holder l2v2_16way_ctx;
+
+bool init_lyra2rev2_16way_ctx()
+{
+   keccak256_8way_init( &l2v2_16way_ctx.keccak );
+   cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
+   skein256_8way_init( &l2v2_16way_ctx.skein );
+   bmw256_16way_init( &l2v2_16way_ctx.bmw );
+   return true;
+}
+// Hash 16 interleaved lanes: blake, keccak, cube, lyra2rev2, skein, cube, bmw.
+void lyra2rev2_16way_hash( void *state, const void *input )
+{
+   uint32_t vhash[8*16] __attribute__ ((aligned (128)));
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (64)));
+   uint32_t hash2[8] __attribute__ ((aligned (64)));
+   uint32_t hash3[8] __attribute__ ((aligned (64)));
+   uint32_t hash4[8] __attribute__ ((aligned (64)));
+   uint32_t hash5[8] __attribute__ ((aligned (64)));
+   uint32_t hash6[8] __attribute__ ((aligned (64)));
+   uint32_t hash7[8] __attribute__ ((aligned (64)));
+   uint32_t hash8[8] __attribute__ ((aligned (64)));
+   uint32_t hash9[8] __attribute__ ((aligned (64)));
+   uint32_t hash10[8] __attribute__ ((aligned (64)));
+   uint32_t hash11[8] __attribute__ ((aligned (64)));
+   uint32_t hash12[8] __attribute__ ((aligned (64)));
+   uint32_t hash13[8] __attribute__ ((aligned (64)));
+   uint32_t hash14[8] __attribute__ ((aligned (64)));
+   uint32_t hash15[8] __attribute__ ((aligned (64)));
+   lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );
+   // Blake256 state for the first 64 bytes was prepared by scanhash.
+   blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
+   blake256_16way_close( &ctx.blake, vhash );
+
+   dintrlv_16x32( hash0,  hash1,  hash2,  hash3,
+                  hash4,  hash5,  hash6,  hash7,
+                  hash8,  hash9,  hash10, hash11,
+                  hash12, hash13, hash14, hash15, vhash, 256 );
+   // Keccak256 is 8 way 64 bit: run lanes 0..7, then lanes 8..15.
+   intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
+                       hash4, hash5, hash6, hash7, 256 );
+
+   keccak256_8way_update( &ctx.keccak, vhash, 32 );
+   keccak256_8way_close( &ctx.keccak, vhash );
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3,
+                 hash4, hash5, hash6, hash7, vhash, 256 );
+   intrlv_8x64( vhash, hash8,  hash9,  hash10, hash11,
+                       hash12, hash13, hash14, hash15, 256 );
+
+   keccak256_8way_init( &ctx.keccak );
+   keccak256_8way_update( &ctx.keccak, vhash, 32 );
+   keccak256_8way_close( &ctx.keccak, vhash );
+   // Second batch goes to hash8..hash15; hash5 belongs to the first batch.
+   dintrlv_8x64( hash8,  hash9,  hash10,  hash11,
+                 hash12, hash13, hash14, hash15, vhash, 256 );
+   // First cubehash256 pass, one lane per call.
+   cubehash_full( &ctx.cube, (byte*) hash0,  256, (const byte*) hash0,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash1,  256, (const byte*) hash1,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash2,  256, (const byte*) hash2,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash3,  256, (const byte*) hash3,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash4,  256, (const byte*) hash4,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash5,  256, (const byte*) hash5,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash6,  256, (const byte*) hash6,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash7,  256, (const byte*) hash7,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash8,  256, (const byte*) hash8,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash9,  256, (const byte*) hash9,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash10, 256, (const byte*) hash10, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash11, 256, (const byte*) hash11, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash12, 256, (const byte*) hash12, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash13, 256, (const byte*) hash13, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash14, 256, (const byte*) hash14, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash15, 256, (const byte*) hash15, 32 );
+
+   // Lyra2rev2, two lanes per call, reusing the per thread matrix.
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );
+   // Skein256 is 8 way 64 bit: run lanes 0..7, then lanes 8..15.
+   intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
+                       hash4, hash5, hash6, hash7, 256 );
+   skein256_8way_update( &ctx.skein, vhash, 32 );
+   skein256_8way_close( &ctx.skein, vhash );
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3,
+                 hash4, hash5, hash6, hash7, vhash, 256 );
+   intrlv_8x64( vhash, hash8,  hash9,  hash10, hash11, hash12,
+                       hash13, hash14, hash15, 256 );
+
+   skein256_8way_init( &ctx.skein );
+   skein256_8way_update( &ctx.skein, vhash, 32 );
+   skein256_8way_close( &ctx.skein, vhash );
+
+   dintrlv_8x64( hash8,  hash9,  hash10, hash11,
+                 hash12, hash13, hash14, hash15, vhash, 256 );
+
+   // Second cubehash256 pass, one lane per call.
+   cubehash_full( &ctx.cube, (byte*) hash0,  256, (const byte*) hash0, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash1,  256, (const byte*) hash1, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash2,  256, (const byte*) hash2, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash3,  256, (const byte*) hash3, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash4,  256, (const byte*) hash4, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash5,  256, (const byte*) hash5, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash6,  256, (const byte*) hash6, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash7,  256, (const byte*) hash7, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash8,  256, (const byte*) hash8,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash9,  256, (const byte*) hash9,  32 );
+   cubehash_full( &ctx.cube, (byte*) hash10, 256, (const byte*) hash10, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash11, 256, (const byte*) hash11, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash12, 256, (const byte*) hash12, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash13, 256, (const byte*) hash13, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash14, 256, (const byte*) hash14, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash15, 256, (const byte*) hash15, 32 );
+   // Re-interleave all 16 lanes as 16x32 for the final bmw256.
+   intrlv_16x32( vhash, hash0,  hash1,  hash2,  hash3,
+                        hash4,  hash5,  hash6,  hash7,
+                        hash8,  hash9,  hash10, hash11,
+                        hash12, hash13, hash14, hash15, 256 );
+
+   bmw256_16way_update( &ctx.bmw, vhash, 32 );
+   bmw256_16way_close( &ctx.bmw, state );
+}
+
+int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
+   uint32_t *hashd7 = &hash[7*16];
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 16;
+   uint32_t n = first_nonce;
+   const uint32_t targ32 = ptarget[7];
+   __m512i  *noncev = (__m512i*)vdata + 19;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+
+   if ( bench )  ptarget[7] = 0x0000ff;
+
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
+   blake256_16way_init( &l2v2_16way_ctx.blake );
+   blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
+
+   do
+   {
+      lyra2rev2_16way_hash( hash, vdata );
+
+      for ( int lane = 0; lane < 16; lane++ )
+      if ( unlikely( hashd7[lane] <= targ32 ) )
+      {
+         extr_lane_16x32( lane_hash, hash, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {
+             pdata[19] = bswap_32( n + lane );
+             submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
+      n += 16;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (LYRA2REV2_8WAY)

 typedef struct {
   blake256_8way_context     blake;
-   keccak256_8way_context    keccak;
-   cube_4way_context          cube;
-   skein256_8way_context     skein;
-   bmw256_8way_context          bmw;
+   keccak256_4way_context    keccak;
+   cubehashParam             cube;
+   skein256_4way_context     skein;
+   bmw256_8way_context       bmw;
 } lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));

 static lyra2v2_8way_ctx_holder l2v2_8way_ctx;

 bool init_lyra2rev2_8way_ctx()
 {
-   keccak256_8way_init( &l2v2_8way_ctx.keccak );
-   cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
-   skein256_8way_init( &l2v2_8way_ctx.skein );
+   keccak256_4way_init( &l2v2_8way_ctx.keccak );
+   cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
+   skein256_4way_init( &l2v2_8way_ctx.skein );
   bmw256_8way_init( &l2v2_8way_ctx.bmw );
   return true;
 }
@@ -31,8 +235,6 @@ bool init_lyra2rev2_8way_ctx()
 void lyra2rev2_8way_hash( void *state, const void *input )
 {
   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
-   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
-   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -47,103 +249,113 @@ void lyra2rev2_8way_hash( void *state, const void *input )
   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
   blake256_8way_close( &ctx.blake, vhash );

-   rintrlv_8x32_8x64( vhashA, vhash, 256 );
+   dintrlv_8x32( hash0, hash1, hash2, hash3,
+                 hash4, hash5, hash6, hash7, vhash, 256 );

-   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
-   keccak256_8way_close( &ctx.keccak, vhash );
+   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
+   keccak256_4way_update( &ctx.keccak, vhash, 32 );
+   keccak256_4way_close( &ctx.keccak, vhash );
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
+   intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
+   keccak256_4way_init( &ctx.keccak );
+   keccak256_4way_update( &ctx.keccak, vhash, 32 );
+   keccak256_4way_close( &ctx.keccak, vhash );
+   dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );

-   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
+   cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash3, 256, (const byte*) hash3, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash4, 256, (const byte*) hash4, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash5, 256, (const byte*) hash5, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash6, 256, (const byte*) hash6, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash7, 256, (const byte*) hash7, 32 );

-   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );

-   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
-   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
-
-   intrlv_2x256( vhash, hash0, hash1, 256 );
-   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash0, hash1, vhash, 256 );
-   intrlv_2x256( vhash, hash2, hash3, 256 );
-   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash2, hash3, vhash, 256 );
-   intrlv_2x256( vhash, hash4, hash5, 256 );
-   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash4, hash5, vhash, 256 );
-   intrlv_2x256( vhash, hash6, hash7, 256 );
-   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash6, hash7, vhash, 256 );
-
-   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                hash7, 256 );
-
-   skein256_8way_update( &ctx.skein, vhash, 32 );
-   skein256_8way_close( &ctx.skein, vhash );
-
-   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
-
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+   LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
   
-   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
-   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
+   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
+   skein256_4way_update( &ctx.skein, vhash, 32 );
+   skein256_4way_close( &ctx.skein, vhash );
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
+   intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
+   skein256_4way_init( &ctx.skein );
+   skein256_4way_update( &ctx.skein, vhash, 32 );
+   skein256_4way_close( &ctx.skein, vhash );
+   dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );

-   intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, 
-                hash7, 256 );
+   cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash3, 256, (const byte*) hash3, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash4, 256, (const byte*) hash4, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash5, 256, (const byte*) hash5, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash6, 256, (const byte*) hash6, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash7, 256, (const byte*) hash7, 32 );
+
+   intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
+                       hash4, hash5, hash6, hash7, 256 );

   bmw256_8way_update( &ctx.bmw, vhash, 32 );
   bmw256_8way_close( &ctx.bmw, state );
 }

-int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<3]);
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hashd7 = &hash[7*8];
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
-   const uint32_t Htarg = ptarget[7];
-   __m256i *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id; 
+   const uint32_t targ32 = ptarget[7];
+   __m256i  *noncev = (__m256i*)vdata + 19;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )
-      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   if ( bench )  ptarget[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
-
+   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
   blake256_8way_init( &l2v2_8way_ctx.blake );
   blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );

   do
   {
-      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
-                                                  n+3, n+2, n+1, n ) );
-
      lyra2rev2_8way_hash( hash, vdata );
-      pdata[19] = n;

-      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hashd7[lane] <= targ32 ) )
      {
         extr_lane_8x32( lane_hash, hash, lane, 256 );
-         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         {
-            pdata[19] = n + lane;
-            submit_lane_solution( work, lane_hash, mythr, lane );
+             pdata[19] = bswap_32( n + lane );
+             submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
+      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
      n += 8;
-   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }

+#endif
+
+/*
 #elif defined (LYRA2REV2_4WAY)

 typedef struct {
@@ -226,15 +438,16 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<2]);
+   uint32_t *hashd7 = &(hash[7<<2]);
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
-   const uint32_t Htarg = ptarget[7];
-   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t targ32 = ptarget[7];
+   __m128i *noncev = (__m128i*)vdata + 19;  
+   int thr_id = mythr->id; 

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -249,21 +462,22 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      lyra2rev2_4way_hash( hash, vdata );
-      pdata[19] = n;

-      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
+      for ( int lane = 0; lane < 4; lane++ ) if ( hashd7[lane] <= targ32 )
      {
         extr_lane_4x32( lane_hash, hash, lane, 256 );
-         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         if ( valid_hash( lane_hash, ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;         
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 4;
-   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
-   *hashes_done = n - first_nonce + 1;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
   return 0;
 }

 #endif
+*/
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -99,7 +99,7 @@ int scanhash_lyra2rev2( struct work *work,
 		lyra2rev2_hash(hash, endiandata);

 		if (hash[7] <= Htarg )
-      if( fulltest( hash, ptarget ) && !opt_benchmark )
+      if( valid_hash( hash, ptarget ) && !opt_benchmark )
      {
 			pdata[19] = nonce;
         submit_solution( work, hash, mythr );
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -130,7 +130,7 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
 {
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
-   uint32_t *hash32 = &hash[7*16];
+   uint32_t *hashd7 = &hash[7*16];
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -159,10 +159,10 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
      pdata[19] = n;

      for ( int lane = 0; lane < 16; lane++ )
-      if ( unlikely( hash32[lane] <= targ32 ) )
+      if ( unlikely( hashd7[lane] <= targ32 ) )
      {
         extr_lane_16x32( lane_hash, hash, lane, 256 );
-         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
         {
             pdata[19] = n + lane;
             submit_lane_solution( work, lane_hash, mythr, lane );
@@ -170,6 +170,7 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
      }
      n += 16;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
@@ -194,7 +195,7 @@ bool init_lyra2rev3_8way_ctx()

 void lyra2rev3_8way_hash( void *state, const void *input )
 {
-   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (32)));
   uint32_t hash2[8] __attribute__ ((aligned (32)));
@@ -250,9 +251,9 @@ void lyra2rev3_8way_hash( void *state, const void *input )
 int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t *hash32 = &hash[7*8];
+   uint32_t *hashd7 = &hash[7*8];
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -277,7 +278,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
      pdata[19] = n;

      for ( int lane = 0; lane < 8; lane++ )
-      if ( unlikely( hash32[lane] <= targ32 ) )
+      if ( unlikely( hashd7[lane] <= targ32 ) )
      {
         extr_lane_8x32( lane_hash, hash, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
@@ -357,7 +358,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t *hash32 = &(hash[7*4]);
+   uint32_t *hashd7 = &(hash[7*4]);
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -379,7 +380,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
   do
   {
      lyra2rev3_4way_hash( hash, vdata );
-      for ( int lane = 0; lane < 4; lane++ ) if ( hash32[lane] <= targ32 )
+      for ( int lane = 0; lane < 4; lane++ ) if ( hashd7[lane] <= targ32 )
      {
         extr_lane_4x32( lane_hash, hash, lane, 256 );
         if ( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) 
@@ -391,6 +392,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
      *noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
      n += 4;
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
+   pdata[19] = n;
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/lyra2/lyra2rev3.c
+++ b/algo/lyra2/lyra2rev3.c
@@ -88,7 +88,7 @@ int scanhash_lyra2rev3( struct work *work,
 	lyra2rev3_hash(hash, endiandata);

      if (hash[7] <= Htarg )
-      if( fulltest( hash, ptarget ) && !opt_benchmark )
+      if( valid_hash( hash, ptarget ) && !opt_benchmark )
      {
          pdata[19] = nonce;
          submit_solution( work, hash, mythr );
--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -56,7 +56,7 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id; 

 	if (opt_benchmark)
 		ptarget[7] = 0x0000ff;
@@ -65,14 +65,13 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

-        lyra2z_midstate( endiandata );
+   lyra2z_midstate( endiandata );

 	do {
 		be32enc(&endiandata[19], nonce);
                lyra2z_hash( hash, endiandata );

-      if ( hash[7] <= Htarg )
-      if ( fulltest( hash, ptarget ) && !opt_benchmark )
+      if ( valid_hash( hash, ptarget ) && !opt_benchmark )
      {
 			pdata[19] = nonce;
 			submit_solution( work, hash, mythr );
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -9,7 +9,7 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
 {
 	uint32_t _ALIGN(256) hash[16];

-        LYRA2Z( lyra2z330_wholeMatrix, hash, 32, input, 80, input, 80,
+   LYRA2Z( lyra2z330_wholeMatrix, hash, 32, input, 80, input, 80,
                 2, 330, 256 );

 	memcpy(state, hash, 32);
@@ -18,38 +18,40 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
 int scanhash_lyra2z330( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8] __attribute__ ((aligned (64))); 
-   uint32_t endiandata[20] __attribute__ ((aligned (64)));
+   uint32_t hash[8] __attribute__ ((aligned (128))); 
+   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t nonce = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id; 

   if (opt_benchmark)
 	ptarget[7] = 0x0000ff;

-   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   
   do
   {
-      be32enc( &endiandata[19], nonce );
-      lyra2z330_hash( hash, endiandata, work->height );
-      if ( hash[7] <= Htarg )
-      if ( fulltest( hash, ptarget ) && !opt_benchmark )
+      edata[19] = nonce;
+
+      LYRA2Z( lyra2z330_wholeMatrix, hash, 32, edata, 80, edata, 80,
+                 2, 330, 256 );
+      
+//      lyra2z330_hash( hash, edata, work->height );
+      if ( valid_hash( hash, ptarget ) && !opt_benchmark )
      {
-         pdata[19] = nonce;
+         be32enc( pdata + 19, nonce );
         submit_solution( work, hash, mythr );
      }
      nonce++;
   } while ( nonce < max_nonce && !work_restart[thr_id].restart );
   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
+   *hashes_done = nonce - first_nonce;
   return 0;
 }

--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -1,18 +1,241 @@
 #include "cpuminer-config.h"
 #include "anime-gate.h"
-
-#if defined (ANIME_4WAY)
-
 #include <stdio.h>
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
+#if defined(__VAES__)
+  #include "algo/groestl/groestl512-hash-4way.h"
+#endif
+
+#if defined (ANIME_8WAY)
+
+typedef struct {   // one context per hash primitive used by anime_8way_hash
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+#if defined(__VAES__)
+    groestl512_4way_context groestl;   // type used when __VAES__ is defined
+#else
+    hashState_groestl       groestl;   // type used otherwise
+#endif
+    jh512_8way_context      jh;
+    skein512_8way_context   skein;
+    keccak512_8way_context  keccak;
+} anime_8way_ctx_holder;
+// File-scope instance; anime_8way_hash memcpy's it into a local on each call.
+anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64)));
+
+void init_anime_8way_ctx()   // calls each primitive's init on anime_8way_ctx
+{
+     blake512_8way_init( &anime_8way_ctx.blake );
+     bmw512_8way_init( &anime_8way_ctx.bmw );
+#if defined(__VAES__)
+     groestl512_4way_init( &anime_8way_ctx.groestl, 64 );   // 64: presumably digest bytes -- TODO confirm
+#else
+     init_groestl( &anime_8way_ctx.groestl, 64 );
+#endif
+     skein512_8way_init( &anime_8way_ctx.skein );
+     jh512_8way_init( &anime_8way_ctx.jh );
+     keccak512_8way_init( &anime_8way_ctx.keccak );
+}   // NOTE(review): no call to this is visible in the register_anime_algo hunk -- confirm
+
+void anime_8way_hash( void *state, const void *input )
+{   // Chained multi-hash of the data packed in input; 4 x 64 bytes go to state.
+    uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+    uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
+    uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
+    uint64_t vhashC[8*8] __attribute__ ((aligned (64)));
+#if !defined(__VAES__)
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t hash4[8] __attribute__ ((aligned (64)));
+    uint64_t hash5[8] __attribute__ ((aligned (64)));
+    uint64_t hash6[8] __attribute__ ((aligned (64)));
+    uint64_t hash7[8] __attribute__ ((aligned (64)));
+#endif
+    __m512i* vh  = (__m512i*)vhash;
+    __m512i* vhA = (__m512i*)vhashA;
+    __m512i* vhB = (__m512i*)vhashB;
+    __m512i* vhC = (__m512i*)vhashC;
+    const __m512i bit3_mask = m512_const1_64( 8 );
+    const __m512i zero = _mm512_setzero_si512();
+    __mmask8 vh_mask;   // branch selector, one bit per 64-bit element of vh[0]
+    anime_8way_ctx_holder ctx;
+    memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) );
+    // NOTE(review): all stages use *_full or *_init; is the copy needed?
+    bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
+
+    blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
+    // Mask bit i is set when (element i of vh[0] & 8) == 0.
+    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
+                                       zero );
+
+#if defined(__VAES__)
+    // groestl runs on a half only if one of its 4 mask bits is clear.
+    rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
+
+    if ( ( vh_mask & 0x0f ) != 0x0f )
+       groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
+    if ( ( vh_mask & 0xf0 ) != 0xf0 )
+       groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
+
+    rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 );
+
+#else
+    // groestl runs only on hashN buffers whose first word has bit 3 set.
+    dintrlv_8x64_512( hash0, hash1, hash2, hash3,
+                      hash4, hash5, hash6, hash7, vhash );
+
+    if ( hash0[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); 
+    if ( hash1[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    if ( hash2[0] & 8)
+       groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    if ( hash3[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    if ( hash4[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+    if ( hash5[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+    if ( hash6[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+    if ( hash7[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+    intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3,
+                             hash4, hash5, hash6, hash7 );
+
+#endif
+    // skein writes vhashB whenever at least one mask bit is set.
+    if ( vh_mask & 0xff )
+       skein512_8way_full( &ctx.skein, vhashB, vhash, 64 );
+    // presumably vh = vhB where the mask bit is set, else vhC -- confirm helper
+    mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
+    // Next stage: groestl over all of vhash, with no mask test.
+#if defined(__VAES__)
+
+    rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
+
+    groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
+    groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
+
+    rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
+
+#else
+    
+    dintrlv_8x64_512( hash0, hash1, hash2, hash3,
+                      hash4, hash5, hash6, hash7, vhash );
+
+    groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+    intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
+                            hash4, hash5, hash6, hash7 );
+
+#endif
+    // jh in place on vhash, then the mask is re-derived from the new vh[0].
+    jh512_8way_init( &ctx.jh );
+    jh512_8way_update( &ctx.jh, vhash, 64 );
+    jh512_8way_close( &ctx.jh, vhash );
+
+    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
+                                       zero );
+    // blake -> vhashA unless all bits are set; bmw -> vhashB if any bit is set.
+    if ( ( vh_mask & 0xff ) != 0xff )
+       blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
+    if ( vh_mask & 0xff )
+       bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 );
+
+    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
+    // keccak, then skein, both in place on vhash.
+    keccak512_8way_init( &ctx.keccak );
+    keccak512_8way_update( &ctx.keccak, vhash, 64 );
+    keccak512_8way_close( &ctx.keccak, vhash );
+
+    skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
+
+    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), 
+                                       zero );
+    // Last stage: keccak -> vhashA and/or jh -> vhashB, by the same rule.
+    if ( ( vh_mask & 0xff ) != 0xff )
+    {
+       keccak512_8way_init( &ctx.keccak );
+       keccak512_8way_update( &ctx.keccak, vhash, 64 );
+       keccak512_8way_close( &ctx.keccak, vhashA );
+    }
+    if ( vh_mask & 0xff )
+    {
+       jh512_8way_init( &ctx.jh );
+       jh512_8way_update( &ctx.jh, vhash, 64 );
+       jh512_8way_close( &ctx.jh, vhashB );
+    }
+    // Only 4 vectors are stored: vhB element where its mask bit is set, else vhA.
+   casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] );
+   casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] );
+   casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] );
+   casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] );
+}
+
+int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{   // Scans nonces 8 per pass starting at work->data[19]; always returns 0.
+    uint64_t hash64[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    uint64_t *hash64_q3 = &(hash64[3*8]);   // 4th group of 8 words, one per lane
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3];   // 4th 64-bit target word
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 8;   // NOTE(review): wraps if max_nonce < 8
+    __m512i  *noncev = (__m512i*)vdata + 9;   // 10th 512-bit vector of vdata
+    const int thr_id = mythr->id;
+    const bool bench = opt_benchmark;
+    // presumably byte-swaps and 8x64-interleaves the 80-byte header -- confirm
+    mm512_bswap32_intrlv80_8x64( vdata, pdata );
+    *noncev = mm512_intrlv_blend_32(
+             _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                               n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
+    // Each pass hashes nonces n..n+7, then tests every lane.
+    do
+    {
+       anime_8way_hash( hash64, vdata );
+       // 64-bit pre-check per lane; never true when bench is set.
+       for ( int lane = 0; lane < 8; lane++ )
+       if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) )
+       {
+          extr_lane_8x64( lane_hash, hash64, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ) )
+          {
+             pdata[19] = bswap_32( n + lane );
+             submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+       }
+       *noncev = _mm512_add_epi32( *noncev,   // constant: 8 in the upper 32 bits
+                                   m512_const1_64( 0x0000000800000000 ) );
+       n += 8;
+    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+    pdata[19] = n;   // first nonce that was not hashed
+    *hashes_done = n - first_nonce;
+    return 0;
+}
+
+#elif defined (ANIME_4WAY)

 typedef struct {
    blake512_4way_context  blake;
@@ -23,18 +246,6 @@ typedef struct {
    keccak512_4way_context keccak;
 } anime_4way_ctx_holder;

-anime_4way_ctx_holder anime_4way_ctx __attribute__ ((aligned (64)));
-
-void init_anime_4way_ctx()
-{
-     blake512_4way_init( &anime_4way_ctx.blake );
-     bmw512_4way_init( &anime_4way_ctx.bmw );
-     init_groestl( &anime_4way_ctx.groestl, 64 );
-     skein512_4way_init( &anime_4way_ctx.skein );
-     jh512_4way_init( &anime_4way_ctx.jh );
-     keccak512_4way_init( &anime_4way_ctx.keccak );
-}
-
 void anime_4way_hash( void *state, const void *input )
 {
    uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -48,81 +259,61 @@ void anime_4way_hash( void *state, const void *input )
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
-    const uint32_t mask = 8;
+    int h_mask;
    const __m256i bit3_mask = m256_const1_64( 8 );
    const __m256i zero = _mm256_setzero_si256();
    anime_4way_ctx_holder ctx;
-    memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );

+    bmw512_4way_init( &ctx.bmw );
    bmw512_4way_update( &ctx.bmw, input, 80 );
    bmw512_4way_close( &ctx.bmw, vhash );

-    blake512_4way_update( &ctx.blake, vhash, 64 );
-    blake512_4way_close( &ctx.blake, vhash );
+    blake512_4way_full( &ctx.blake, vhash, vhash, 64 );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
+    h_mask = _mm256_movemask_epi8( vh_mask );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-    if ( hash0[0] & mask )
-    {
-       update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                               (char*)hash0, 512 );
-    }
-    if ( hash1[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                               (char*)hash1, 512 );
-    }
-    if ( hash2[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                               (char*)hash2, 512 );
-    }
-    if ( hash3[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                               (char*)hash3, 512 );
-    }
+    // A
+    if ( hash0[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    if ( hash1[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    if ( hash2[0] & 8)
+       groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    if ( hash3[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

    intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

-    if ( mm256_anybits1( vh_mask ) )
-    {
-       skein512_4way_update( &ctx.skein, vhash, 64 );
-       skein512_4way_close( &ctx.skein, vhashB );
-    }
+    // B
+    if ( h_mask & 0xffffffff )
+       skein512_4way_full( &ctx.skein, vhashB, vhash, 64 );

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

+    jh512_4way_init( &ctx.jh );
    jh512_4way_update( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
+    h_mask = _mm256_movemask_epi8( vh_mask );

-    if ( mm256_anybits0( vh_mask ) )
-    {
-       blake512_4way_init( &ctx.blake );
-       blake512_4way_update( &ctx.blake, vhash, 64 );
-       blake512_4way_close( &ctx.blake, vhashA );
-    }
-    if ( mm256_anybits1( vh_mask ) )
+    // A
+    if ( ( h_mask & 0xffffffff ) != 0xffffffff )
+       blake512_4way_full( &ctx.blake, vhashA, vhash, 64 );
+    // B
+    if ( h_mask & 0xffffffff )
    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way_update( &ctx.bmw, vhash, 64 );
@@ -131,64 +322,74 @@ void anime_4way_hash( void *state, const void *input )

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

+    keccak512_4way_init( &ctx.keccak );
    keccak512_4way_update( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );

-    skein512_4way_init( &ctx.skein );
-    skein512_4way_update( &ctx.skein, vhash, 64 );
-    skein512_4way_close( &ctx.skein, vhash );
+    skein512_4way_full( &ctx.skein, vhash, vhash, 64 );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
+    h_mask = _mm256_movemask_epi8( vh_mask );

-    if ( mm256_anybits0( vh_mask ) )
+    // A
+    if ( ( h_mask & 0xffffffff ) != 0xffffffff )
    {
       keccak512_4way_init( &ctx.keccak );
       keccak512_4way_update( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
    }
-    if ( mm256_anybits1( vh_mask ) )
+    // B
+    if ( h_mask & 0xffffffff )
    {
       jh512_4way_init( &ctx.jh );
       jh512_4way_update( &ctx.jh, vhash, 64 );
       jh512_4way_close( &ctx.jh, vhashB );
    }

-    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
-
-    dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
+    casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask );
+    casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask );
+    casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask );
+    casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask );
 }

 int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint64_t hash64[4*4] __attribute__ ((aligned (64)));
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    uint64_t *hash64_q3 = &(hash64[3*4]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
+    const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3];
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    const uint32_t last_nonce = max_nonce - 4;
-    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+    __m256i  *noncev = (__m256i*)vdata + 9;  
    const int thr_id = mythr->id;  
+    const bool bench = opt_benchmark;

    mm256_bswap32_intrlv80_4x64( vdata, pdata );
    *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
-
    do
    {
-       anime_4way_hash( hash, vdata );
+       anime_4way_hash( hash64, vdata );

-       for ( int i = 0; i < 4; i++ )
-       if ( valid_hash( hash+(i<<3), ptarget ) && !opt_benchmark )
+       for ( int lane = 0; lane < 4; lane++ )
+       if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) )
       {
-          pdata[19] = bswap_32( n+i );
-          submit_solution( work, hash+(i<<3), mythr );
+          extr_lane_4x64( lane_hash, hash64, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ) )
+          {
+             pdata[19] = bswap_32( n + lane );
+             submit_lane_solution( work, lane_hash, mythr, lane );
+          }
       }
       *noncev = _mm256_add_epi32( *noncev,
                                   m256_const1_64( 0x0000000400000000 ) );
       n += 4;
-    } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
    pdata[19] = n;
    *hashes_done = n - first_nonce;
    return 0;
--- a/algo/quark/anime-gate.c
+++ b/algo/quark/anime-gate.c
@@ -2,8 +2,10 @@

 bool register_anime_algo( algo_gate_t* gate )
 {
-#if defined (ANIME_4WAY)
-  init_anime_4way_ctx();
+#if defined (ANIME_8WAY)
+  gate->scanhash  = (void*)&scanhash_anime_8way;
+  gate->hash      = (void*)&anime_8way_hash;
+#elif defined (ANIME_4WAY)
  gate->scanhash  = (void*)&scanhash_anime_4way;
  gate->hash      = (void*)&anime_4way_hash;
 #else
@@ -11,7 +13,7 @@ bool register_anime_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_anime;
  gate->hash      = (void*)&anime_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  return true;
 };

--- a/algo/quark/anime-gate.h
+++ b/algo/quark/anime-gate.h
@@ -4,18 +4,25 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define ANIME_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define ANIME_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define ANIME_4WAY 1
 #endif

 bool register_anime_algo( algo_gate_t* gate );

-#if defined(ANIME_4WAY)
+#if defined(ANIME_8WAY)
+
+void anime_8way_hash( void *state, const void *input );
+int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(ANIME_4WAY)

 void anime_4way_hash( void *state, const void *input );
 int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-void init_anime_4way_ctx();

 #endif

--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -72,12 +72,10 @@ void quark_8way_hash( void *state, const void *input )

    memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );

-    blake512_8way_update( &ctx.blake, input, 80 );
-    blake512_8way_close( &ctx.blake, vhash );
-
-    bmw512_8way_update( &ctx.bmw, vhash, 64 );
-    bmw512_8way_close( &ctx.bmw, vhash );
+    blake512_8way_full( &ctx.blake, vhash, input, 80 );

+    bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
+    
    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
                                       zero );

@@ -86,70 +84,34 @@ void quark_8way_hash( void *state, const void *input )

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-     if ( ( vh_mask & 0x0f ) != 0x0f )
-     {
-        groestl512_4way_init( &ctx.groestl, 64 );
-        groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
-     }
-     if ( ( vh_mask & 0xf0 ) != 0xf0 )
-     {     
-        groestl512_4way_init( &ctx.groestl, 64 );
-        groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
-     }
-     rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 );
+    if ( ( vh_mask & 0x0f ) != 0x0f )
+       groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
+    if ( ( vh_mask & 0xf0 ) != 0xf0 )
+       groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
+
+    rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 );

 #else

    dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  vhash, 512 );

-    if ( hash0[0] & mask )
-    {
-       update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                               (char*)hash0, 512 );
-    }
-    if ( hash1[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                               (char*)hash1, 512 );
-    }
-    if ( hash2[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                               (char*)hash2, 512 );
-    }
-    if ( hash3[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                               (char*)hash3, 512 );
-    }
-    if ( hash4[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash4,
-                                               (char*)hash4, 512 );
-    }
-    if ( hash5[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash5,
-                                               (char*)hash5, 512 );
-    }
-    if ( hash6[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash6,
-                                               (char*)hash6, 512 );
-    }
-    if ( hash7[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash7,
-                                               (char*)hash7, 512 );
-    }
+     if ( hash0[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     if ( hash1[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     if ( hash2[0] & 8)
+       groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     if ( hash3[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     if ( hash4[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     if ( hash5[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     if ( hash6[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     if ( hash7[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

    intrlv_8x64( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                         hash7, 512 );
@@ -157,10 +119,7 @@ void quark_8way_hash( void *state, const void *input )
 #endif

    if ( vh_mask & 0xff )
-    {
-       skein512_8way_update( &ctx.skein, vhash, 64 );
-       skein512_8way_close( &ctx.skein, vhashB );
-    }
+       skein512_8way_full( &ctx.skein, vhashB, vhash, 64 );

    mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );

@@ -168,10 +127,10 @@ void quark_8way_hash( void *state, const void *input )

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
+     // This Groestl round follows the blend and applies to every lane, so
+     // it must not be gated on vh_mask (the removed code ran it always).
+     groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
+     groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );

     rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

@@ -180,22 +139,22 @@ void quark_8way_hash( void *state, const void *input )
    dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  vhash, 512 );

-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash0,   // all lanes, unconditional
+                                   (char*)hash0, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash1,
+                                   (char*)hash1, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash2,
+                                   (char*)hash2, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash3,
+                                   (char*)hash3, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash4,
+                                   (char*)hash4, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash5,
+                                   (char*)hash5, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash6,
+                                   (char*)hash6, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash7,
+                                   (char*)hash7, 512 );

    intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 512 );
@@ -209,27 +168,16 @@ void quark_8way_hash( void *state, const void *input )
                                       zero );

    if ( ( vh_mask & 0xff ) != 0xff )
-    {
-       blake512_8way_init( &ctx.blake );
-       blake512_8way_update( &ctx.blake, vhash, 64 );
-       blake512_8way_close( &ctx.blake, vhashA );
-    }
-
+       blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
    if ( vh_mask & 0xff )
-    {
-       bmw512_8way_init( &ctx.bmw );
-       bmw512_8way_update( &ctx.bmw, vhash, 64 );
-       bmw512_8way_close( &ctx.bmw, vhashB );
-    }
+       bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 );

    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );

    keccak512_8way_update( &ctx.keccak, vhash, 64 );
    keccak512_8way_close( &ctx.keccak, vhash );

-    skein512_8way_init( &ctx.skein );
-    skein512_8way_update( &ctx.skein, vhash, 64 );
-    skein512_8way_close( &ctx.skein, vhash );
+    skein512_8way_full( &ctx.skein, vhash, vhash, 64 );

    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
                                       zero );
@@ -258,41 +206,44 @@ void quark_8way_hash( void *state, const void *input )
 int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-    uint32_t hash[8*8] __attribute__ ((aligned (128)));
-    uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+    uint64_t hash64[4*8] __attribute__ ((aligned (128)));
+    uint32_t vdata[20*8] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-    uint32_t *hash7 = &(hash[49]);
-    uint32_t *pdata = work->data;
+    uint64_t *hash64_q3 = &(hash64[3*8]);
    uint32_t *ptarget = work->target;
+    const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3];
+    uint32_t *pdata = work->data;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-    int thr_id = mythr->id; 
-    const uint32_t Htarg = ptarget[7];
+    const uint32_t last_nonce = max_nonce - 8;
+    __m512i  *noncev = (__m512i*)vdata + 9;
+    const int thr_id = mythr->id; 
+    const bool bench = opt_benchmark;

    mm512_bswap32_intrlv80_8x64( vdata, pdata );
+    *noncev = mm512_intrlv_blend_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
    do
    {
-       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+       quark_8way_hash( hash64, vdata );

-       quark_8way_hash( hash, vdata );
-       pdata[19] = n;
-
-       for ( int i = 0; i < 8; i++ )
-       if ( unlikely( hash7[ i<<1 ] <= Htarg ) )
+       for ( int lane = 0; lane < 8; lane++ )
+       if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) )
       {
-          extr_lane_8x64( lane_hash, hash, i, 256 );
-          if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+          extr_lane_8x64( lane_hash, hash64, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ) )
          {
-            pdata[19] = n+i;
-            submit_lane_solution( work, lane_hash, mythr, i );
+             pdata[19] = bswap_32( n + lane );
+             submit_lane_solution( work, lane_hash, mythr, lane );
          }
       }
+       *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
       n += 8;
-    } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
+    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );

+    pdata[19] = n;
    *hashes_done = n - first_nonce;
    return 0;
 }
@@ -333,67 +284,47 @@ void quark_4way_hash( void *state, const void *input )
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
+    int h_mask;
    quark_4way_ctx_holder ctx;
    const __m256i bit3_mask = m256_const1_64( 8 );
-    const uint32_t mask = 8;
    const __m256i zero = _mm256_setzero_si256();

    memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );

-    blake512_4way_update( &ctx.blake, input, 80 );
-    blake512_4way_close( &ctx.blake, vhash );
+    blake512_4way_full( &ctx.blake, vhash, input, 80 );

    bmw512_4way_update( &ctx.bmw, vhash, 64 );
    bmw512_4way_close( &ctx.bmw, vhash );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
+    h_mask = _mm256_movemask_epi8( vh_mask );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-    if ( hash0[0] & mask )
-    {
-       update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                               (char*)hash0, 512 );
-    }
-    if ( hash1[0] & mask )
-    {
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                               (char*)hash1, 512 );
-    }
-    if ( hash2[0] & mask )
-    {   
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                               (char*)hash2, 512 );
-    }
-    if ( hash3[0] & mask )
-    {   
-       reinit_groestl( &ctx.groestl );
-       update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                               (char*)hash3, 512 );
-    }
+    // A
+    if ( hash0[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    if ( hash1[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    if ( hash2[0] & 8)
+       groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    if ( hash3[0] & 8 )
+       groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

    intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

-    if ( mm256_anybits1( vh_mask ) )   
-    {
-       skein512_4way_update( &ctx.skein, vhash, 64 );
-       skein512_4way_close( &ctx.skein, vhashB );
-    }
+    // B
+    if ( likely( h_mask & 0xffffffff ) )
+       skein512_4way_full( &ctx.skein, vhashB, vhash, 64 );

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

@@ -401,15 +332,13 @@ void quark_4way_hash( void *state, const void *input )
    jh512_4way_close( &ctx.jh, vhash );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
+    h_mask = _mm256_movemask_epi8( vh_mask );

-    if ( mm256_anybits0( vh_mask ) )   
-    {
-       blake512_4way_init( &ctx.blake );
-       blake512_4way_update( &ctx.blake, vhash, 64 );
-       blake512_4way_close( &ctx.blake, vhashA );
-    }
-
-    if ( mm256_anybits1( vh_mask ) )
+    // A
+    if ( likely( ( h_mask & 0xffffffff ) != 0xffffffff ) )
+       blake512_4way_full( &ctx.blake, vhashA, vhash, 64 );
+    // B
+    if ( likely( h_mask & 0xffffffff ) )
    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way_update( &ctx.bmw, vhash, 64 );
@@ -421,20 +350,20 @@ void quark_4way_hash( void *state, const void *input )
    keccak512_4way_update( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );

-    skein512_4way_init( &ctx.skein );
-    skein512_4way_update( &ctx.skein, vhash, 64 );
-    skein512_4way_close( &ctx.skein, vhash );
+    skein512_4way_full( &ctx.skein, vhash, vhash, 64 );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
+    h_mask = _mm256_movemask_epi8( vh_mask );

-    if ( mm256_anybits0( vh_mask ) )    
+    // A
+    if ( likely( ( h_mask & 0xffffffff ) != 0xffffffff ) )
    {
       keccak512_4way_init( &ctx.keccak );
       keccak512_4way_update( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
    }
-
-    if ( mm256_anybits1( vh_mask ) )
+    // B
+    if ( likely( h_mask & 0xffffffff ) )
    {
       jh512_4way_init( &ctx.jh );
       jh512_4way_update( &ctx.jh, vhash, 64 );
@@ -451,41 +380,44 @@ void quark_4way_hash( void *state, const void *input )
 int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-    uint32_t hash[4*8] __attribute__ ((aligned (64)));
-    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+    uint64_t hash64[4*4] __attribute__ ((aligned (64)));
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-    uint32_t *hash7 = &(hash[25]);
+    uint64_t *hash64_q3 = &(hash64[3*4]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
+    const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3];
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int thr_id = mythr->id;
-    const uint32_t Htarg = ptarget[7];
+    const uint32_t last_nonce = max_nonce - 4;
+    __m256i  *noncev = (__m256i*)vdata + 9;
+    const int thr_id = mythr->id;
+    const bool bench = opt_benchmark;
 
    mm256_bswap32_intrlv80_4x64( vdata, pdata );
+    *noncev = mm256_intrlv_blend_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
    do
    {
-       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+       quark_4way_hash( hash64, vdata );

-       quark_4way_hash( hash, vdata );
-       pdata[19] = n;
-
-       for ( int i = 0; i < 4; i++ )
-       if ( unlikely( hash7[ i<<1 ] <= Htarg ) )
+       for ( int lane = 0; lane < 4; lane++ )
+       if ( hash64_q3[ lane ] <= targ64_q3 && !bench )
       {
-          extr_lane_4x64( lane_hash, hash, i, 256 );
-          if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+          extr_lane_4x64( lane_hash, hash64, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ) )
          {
-            pdata[19] = n+i;
-            submit_lane_solution( work, lane_hash, mythr, i );
+             pdata[19] = bswap_32( n + lane );
+             submit_lane_solution( work, lane_hash, mythr, lane );
          }
       }
+       *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
       n += 4;
-    } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );

-    *hashes_done = n - first_nonce + 1;
+    pdata[19] = n;
+    *hashes_done = n - first_nonce;
    return 0;
 }

--- a/algo/scrypt/scrypt.c
+++ b/algo/scrypt/scrypt.c
@@ -707,6 +707,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
   int thr_id = mythr->id;  // thr_id arg is deprecated
 	int throughput = scrypt_best_throughput();
 	int i;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
 	
 #ifdef HAVE_SHA256_4WAY
 	if (sha256_use_4way())
@@ -757,7 +758,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
            submit_solution( work, hash, mythr );
 			}
 		}
-	} while (likely(n < max_nonce && !work_restart[thr_id].restart));
+	} while ( likely( n < max_nonce && !(*restart) ) );
 	
 	*hashes_done = n - pdata[19] + 1;
 	pdata[19] = n;
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -33,7 +33,7 @@
 #include <stddef.h>
 #include <string.h>

-#ifdef __AES__
+#if defined(__AES__)

 #include "sph_shavite.h"
 #include "simd-utils.h"
--- a/algo/shavite/sph_shavite.c
+++ b/algo/shavite/sph_shavite.c
@@ -35,6 +35,8 @@

 #include "sph_shavite.h"

+#if !defined(__AES__)
+
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -1762,3 +1764,6 @@ sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst
 #ifdef __cplusplus
 }
 #endif
+
+#endif   // !AES
+
--- a/algo/shavite/sph_shavite.h
+++ b/algo/shavite/sph_shavite.h
@@ -262,15 +262,9 @@ void sph_shavite384_close(void *cc, void *dst);
 void sph_shavite384_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

-// Always define sw but only define aesni when available
-// Define fptrs for aesni or sw, not both.
-void sph_shavite512_sw_init(void *cc);
-void sph_shavite512_sw(void *cc, const void *data, size_t len);
-void sph_shavite512_sw_close(void *cc, void *dst);
-void sph_shavite512_sw_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
-
+//Don't call these directly from application code, use the macros below.
 #ifdef __AES__
+
 void sph_shavite512_aesni_init(void *cc);
 void sph_shavite512_aesni(void *cc, const void *data, size_t len);
 void sph_shavite512_aesni_close(void *cc, void *dst);
@@ -285,6 +279,13 @@ void sph_shavite512_aesni_addbits_and_close(

 #else

+void sph_shavite512_sw_init(void *cc);
+void sph_shavite512_sw(void *cc, const void *data, size_t len);
+void sph_shavite512_sw_close(void *cc, void *dst);
+void sph_shavite512_sw_addbits_and_close(
+   void *cc, unsigned ub, unsigned n, void *dst);
+
+
 #define sph_shavite512_init  sph_shavite512_sw_init
 #define sph_shavite512       sph_shavite512_sw
 #define sph_shavite512_close sph_shavite512_sw_close
@@ -293,6 +294,20 @@ void sph_shavite512_aesni_addbits_and_close(

 #endif

+// Use these macros from application code.
+#define shavite512_context sph_shavite512_context
+
+#define shavite512_init   sph_shavite512_init
+#define shavite512_update sph_shavite512
+#define shavite512_close  sph_shavite512_close
+// One-shot init + update + close; note that cc is expanded three times.
+#define shavite512_full( cc, dst, data, len ) \
+do{ \
+   shavite512_init( cc ); \
+   shavite512_update( cc, data, len ); \
+   shavite512_close( cc, dst ); \
+}while(0)
+
 #ifdef __cplusplus
 }
 #endif	
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -13,18 +13,18 @@

 #if defined (SKEIN_8WAY)

+static __thread skein512_8way_context skein512_8way_ctx
+                                            __attribute__ ((aligned (64)));
+
 void skeinhash_8way( void *state, const void *input )
 {
     uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
     skein512_8way_context ctx_skein;
-
+     memcpy( &ctx_skein, &skein512_8way_ctx, sizeof( ctx_skein ) );
     uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
     sha256_8way_context ctx_sha256;

-     skein512_8way_init( &ctx_skein );
-     skein512_8way_update( &ctx_skein, input, 80 );
-     skein512_8way_close( &ctx_skein, vhash64 );
-
+     skein512_8way_final16( &ctx_skein, vhash64, input + (64*8) );
     rintrlv_8x64_8x32( vhash32, vhash64, 512 );

     sha256_8way_init( &ctx_sha256 );
@@ -36,63 +36,70 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t vdata[20*8] __attribute__ ((aligned (128)));
-    uint32_t hash[16*8] __attribute__ ((aligned (64)));
+    uint32_t hash[8*8] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-    uint32_t *hash7 = &(hash[7<<3]);
+    uint32_t *hash_d7 = &(hash[7*8]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
-    const uint32_t Htarg = ptarget[7];
+    const uint32_t targ_d7 = ptarget[7];
    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 8;
    uint32_t n = first_nonce;
-    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-    int thr_id = mythr->id; 
+    __m512i  *noncev = (__m512i*)vdata + 9; 
+    const int thr_id = mythr->id; 
+    const bool bench = opt_benchmark;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
+   skein512_8way_prehash64( &skein512_8way_ctx, vdata );
   do
   {
-       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
-
       skeinhash_8way( hash, vdata );

       for ( int lane = 0; lane < 8; lane++ )
-       if (  hash7[ lane ] <= Htarg )
+       if ( unlikely( hash_d7[ lane ] <= targ_d7 ) && !bench )
       {
          extr_lane_8x32( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( valid_hash( lane_hash, ptarget ) )
          {
-             pdata[19] = n + lane;
+             pdata[19] = bswap_32( n + lane );
             submit_lane_solution( work, lane_hash, mythr, lane );
          }
       }
+       *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
       n += 8;
-    } while ( (n < max_nonce-8) && !work_restart[thr_id].restart );
+    } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

+    pdata[19] = n;
    *hashes_done = n - first_nonce;
    return 0;
 }

 #elif defined (SKEIN_4WAY)

+static __thread skein512_4way_context skein512_4way_ctx
+                                            __attribute__ ((aligned (64)));
+
 void skeinhash_4way( void *state, const void *input )
 {
     uint64_t vhash64[8*4] __attribute__ ((aligned (128)));
     skein512_4way_context ctx_skein;
+     memcpy( &ctx_skein, &skein512_4way_ctx, sizeof( ctx_skein ) );
 #if defined(__SHA__)
     uint32_t hash0[16] __attribute__ ((aligned (64)));
     uint32_t hash1[16] __attribute__ ((aligned (64)));
     uint32_t hash2[16] __attribute__ ((aligned (64)));
     uint32_t hash3[16] __attribute__ ((aligned (64)));
-     SHA256_CTX           ctx_sha256;
+     SHA256_CTX ctx_sha256;
 #else
     uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
     sha256_4way_context ctx_sha256;
 #endif

-     skein512_4way_init( &ctx_skein );
-     skein512_4way_update( &ctx_skein, input, 80 );
-     skein512_4way_close( &ctx_skein, vhash64 );
+     skein512_4way_final16( &ctx_skein, vhash64, input + (64*4) );

 #if defined(__SHA__)      
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
@@ -127,38 +134,43 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-    uint32_t hash[16*4] __attribute__ ((aligned (64)));
+    uint32_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-    uint32_t *hash7 = &(hash[7<<2]);
+    uint32_t *hash_d7 = &(hash[7<<2]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
-    const uint32_t Htarg = ptarget[7];
+    const uint32_t targ_d7 = ptarget[7];
    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 4;
    uint32_t n = first_nonce;
-    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int thr_id = mythr->id; 
+    __m256i  *noncev = (__m256i*)vdata + 9; 
+    const int thr_id = mythr->id; 
+    const bool bench = opt_benchmark;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   skein512_4way_prehash64( &skein512_4way_ctx, vdata );
+
+   *noncev = mm256_intrlv_blend_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
-       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-
       skeinhash_4way( hash, vdata );
-
       for ( int lane = 0; lane < 4; lane++ )
-       if (  hash7[ lane ] <= Htarg )
+       if ( unlikely( ( hash_d7[ lane ] <= targ_d7 ) && !bench ) )
       {
          extr_lane_4x32( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( valid_hash( lane_hash, ptarget ) )
          {
-             pdata[19] = n + lane;
+             pdata[19] = bswap_32( n + lane );
             submit_lane_solution( work, lane_hash, mythr, lane );
          }
       }
+       *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
       n += 4;
-    } while ( (n < max_nonce-4) && !work_restart[thr_id].restart );
+    } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

+    pdata[19] = n;
    *hashes_done = n - first_nonce;
    return 0;
 }
--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -4,14 +4,16 @@

 bool register_skein_algo( algo_gate_t* gate )
 {
-    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #if defined (SKEIN_8WAY)
+    gate->optimizations = AVX2_OPT | AVX512_OPT;
    gate->scanhash  = (void*)&scanhash_skein_8way;
    gate->hash      = (void*)&skeinhash_8way;
 #elif defined (SKEIN_4WAY)
+    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
    gate->scanhash  = (void*)&scanhash_skein_4way;
    gate->hash      = (void*)&skeinhash_4way;
 #else
+    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
    gate->scanhash  = (void*)&scanhash_skein;
    gate->hash      = (void*)&skeinhash;
 #endif
--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -654,6 +654,160 @@ skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
   memcpy_512( dst, buf, out_len >> 3 );
 }

+void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
+                     size_t len )
+{
+   __m512i h0, h1, h2, h3, h4, h5, h6, h7;
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf = sc->buf;
+   size_t ptr = 0;
+   unsigned first;
+   uint64_t bcount = 0;
+   const int buf_size = 64;   // bytes per lane; buf holds 8 x __m512i
+
+// Init: one-shot init + update + close. The chaining values start from fixed
+// constants in the locals h0..h7; of sc, only sc->buf is used by visible code.
+        h0 = m512_const1_64( 0x4903ADFF749C51CE );
+        h1 = m512_const1_64( 0x0D95DE399746DF03 );
+        h2 = m512_const1_64( 0x8FD1934127C79BCE );
+        h3 = m512_const1_64( 0x9A255629FF352CB1 );
+        h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
+        h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
+        h6 = m512_const1_64( 0x991112C71A75B523 );
+        h7 = m512_const1_64( 0xAE18A40B660FCC33 );
+
+// Update: ptr and len count bytes per lane (>>3 turns them into vector
+// indices). A full buffer is only processed once more input follows it.
+   if ( len <= buf_size - ptr )
+   {
+       memcpy_512( buf + (ptr>>3), vdata, len>>3 );
+       ptr += len;
+   }
+   else
+   {
+      first = ( bcount == 0 ) << 7;
+      do {
+         size_t clen;
+
+         if ( ptr == buf_size )
+         {
+            bcount ++;
+            UBI_BIG_8WAY( 96 + first, 0 );
+            first = 0;
+            ptr = 0;
+         }
+         clen = buf_size - ptr;
+         if ( clen > len )
+            clen = len;
+         memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+         ptr += clen;
+         vdata += (clen>>3);
+         len -= clen;
+      } while ( len > 0 );
+   }
+
+// Close: zero-pad and process the last block (et gains 1<<7 when no block was
+// processed above), then process an all-zero block and store h0..h7 to out.
+   unsigned et;
+
+   memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+   et = 352 + ((bcount == 0) << 7);
+   UBI_BIG_8WAY( et, ptr );
+
+   memset_zero_512( buf, buf_size >> 3 );
+   bcount = 0;
+   UBI_BIG_8WAY( 510, 8 );
+
+   casti_m512i( out, 0 ) = h0;
+   casti_m512i( out, 1 ) = h1;
+   casti_m512i( out, 2 ) = h2;
+   casti_m512i( out, 3 ) = h3;
+   casti_m512i( out, 4 ) = h4;
+   casti_m512i( out, 5 ) = h5;
+   casti_m512i( out, 6 ) = h6;
+   casti_m512i( out, 7 ) = h7;
+}
+// Hash the first 64 bytes of each lane once; keep h0..h7 in sc for final16.
+void
+skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
+{
+   const __m512i *vdata = (const __m512i*)data;
+   __m512i *buf = sc->buf;
+   buf[0] = vdata[0];
+   buf[1] = vdata[1];
+   buf[2] = vdata[2];
+   buf[3] = vdata[3];
+   buf[4] = vdata[4];
+   buf[5] = vdata[5];
+   buf[6] = vdata[6];
+   buf[7] = vdata[7];
+   register __m512i h0 = m512_const1_64( 0x4903ADFF749C51CE );
+   register __m512i h1 = m512_const1_64( 0x0D95DE399746DF03 );
+   register __m512i h2 = m512_const1_64( 0x8FD1934127C79BCE );
+   register __m512i h3 = m512_const1_64( 0x9A255629FF352CB1 );
+   register __m512i h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
+   register __m512i h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
+   register __m512i h6 = m512_const1_64( 0x991112C71A75B523 );
+   register __m512i h7 = m512_const1_64( 0xAE18A40B660FCC33 );
+   uint64_t bcount = 1;
+   // NOTE(review): UBI_BIG_8WAY presumably reads buf/bcount, updates h0..h7.
+   UBI_BIG_8WAY( 224, 0 );
+   sc->h0 = h0;
+   sc->h1 = h1;
+   sc->h2 = h2;
+   sc->h3 = h3;
+   sc->h4 = h4;
+   sc->h5 = h5;
+   sc->h6 = h6;
+   sc->h7 = h7;
+}
+// Finish from the h0..h7 saved in sc; data = last 16 bytes per lane.
+void
+skein512_8way_final16( skein512_8way_context *sc,  void *output,
+                       const void *data )
+{
+   __m512i *in = (__m512i*)data;
+   __m512i *buf = sc->buf;
+   __m512i *out = (__m512i*)output;
+   register __m512i h0 = sc->h0;
+   register __m512i    h1 = sc->h1;
+   register __m512i    h2 = sc->h2;
+   register __m512i    h3 = sc->h3;
+   register __m512i    h4 = sc->h4;
+   register __m512i    h5 = sc->h5;
+   register __m512i    h6 = sc->h6;
+   register __m512i    h7 = sc->h7;
+   // Last block: two input vectors (16 bytes per lane), rest zero-filled.
+   const __m512i zero = m512_zero;
+   buf[0] = in[0];
+   buf[1] = in[1];
+   buf[2] = zero;
+   buf[3] = zero;
+   buf[4] = zero;
+   buf[5] = zero;
+   buf[6] = zero;
+   buf[7] = zero;
+
+   uint64_t bcount = 1;
+   UBI_BIG_8WAY( 352, 16 );
+   // buf[2..7] are already zero, so this leaves the whole buffer zeroed.
+   buf[0] = zero;
+   buf[1] = zero;
+
+   bcount = 0;
+   UBI_BIG_8WAY( 510, 8 );
+
+   out[0] = h0;
+   out[1] = h1;
+   out[2] = h2;
+   out[3] = h3;
+   out[4] = h4;
+   out[5] = h5;
+   out[6] = h6;
+   out[7] = h7;
+}
+
+
 void
 skein256_8way_update(void *cc, const void *data, size_t len)
 {
@@ -709,6 +863,7 @@ void skein512_4way_init( skein512_4way_context *sc )
        sc->ptr = 0;
 }

+// Do not use for 128 bit data length
 static void
 skein_big_core_4way( skein512_4way_context *sc, const void *data,
                     size_t len )
@@ -794,6 +949,156 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
   memcpy_256( dst, buf, out_len >> 3 );
 }

+void
+skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
+                     size_t len )
+{
+   __m256i h0, h1, h2, h3, h4, h5, h6, h7;
+   __m256i *vdata = (__m256i*)data;
+   __m256i *buf = sc->buf;
+   size_t ptr = 0;
+   unsigned first;
+   const int buf_size = 64;   // bytes per lane; buf holds 8 x __m256i
+   uint64_t bcount = 0;
+// Init: one-shot init + update + close; chaining values start from constants.
+   h0 = m256_const1_64( 0x4903ADFF749C51CE );
+   h1 = m256_const1_64( 0x0D95DE399746DF03 );
+   h2 = m256_const1_64( 0x8FD1934127C79BCE );
+   h3 = m256_const1_64( 0x9A255629FF352CB1 );
+   h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
+   h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
+   h6 = m256_const1_64( 0x991112C71A75B523 );
+   h7 = m256_const1_64( 0xAE18A40B660FCC33 );
+
+// Update: ptr and len count bytes per lane (>>3 turns them into vector
+// indices). A full buffer is only processed once more input follows it.
+   if ( len <= buf_size - ptr )
+   {
+       memcpy_256( buf + (ptr>>3), vdata, len>>3 );
+       ptr += len;
+   }
+   else
+   {
+      first = ( bcount == 0 ) << 7;
+      do {
+         size_t clen;
+
+         if ( ptr == buf_size )
+         {
+            bcount ++;
+            UBI_BIG_4WAY( 96 + first, 0 );
+            first = 0;
+            ptr = 0;
+         }
+         clen = buf_size - ptr;
+         if ( clen > len )
+            clen = len;
+         memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
+         ptr += clen;
+         vdata += (clen>>3);
+         len -= clen;
+      } while ( len > 0 );
+   }
+
+// Close: zero-pad and process the last block (et gains 1<<7 when no block was
+// processed above), then process an all-zero block and store h0..h7 to out.
+   unsigned et;
+
+   memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+   et = 352 + ((bcount == 0) << 7);
+   UBI_BIG_4WAY( et, ptr );
+
+   memset_zero_256( buf, buf_size >> 3 );
+   bcount = 0;
+   UBI_BIG_4WAY( 510, 8 );
+
+   casti_m256i( out, 0 ) = h0;
+   casti_m256i( out, 1 ) = h1;
+   casti_m256i( out, 2 ) = h2;
+   casti_m256i( out, 3 ) = h3;
+   casti_m256i( out, 4 ) = h4;
+   casti_m256i( out, 5 ) = h5;
+   casti_m256i( out, 6 ) = h6;
+   casti_m256i( out, 7 ) = h7;
+}
+// Hash the first 64 bytes of each lane once; keep h0..h7 in sc for final16.
+void
+skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
+{
+   __m256i *vdata = (__m256i*)data;
+   __m256i *buf = sc->buf;
+   buf[0] = vdata[0];
+   buf[1] = vdata[1];
+   buf[2] = vdata[2];
+   buf[3] = vdata[3];
+   buf[4] = vdata[4];
+   buf[5] = vdata[5];
+   buf[6] = vdata[6];
+   buf[7] = vdata[7];
+   register __m256i h0 = m256_const1_64( 0x4903ADFF749C51CE );
+   register __m256i h1 = m256_const1_64( 0x0D95DE399746DF03 );
+   register __m256i h2 = m256_const1_64( 0x8FD1934127C79BCE );
+   register __m256i h3 = m256_const1_64( 0x9A255629FF352CB1 );
+   register __m256i h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
+   register __m256i h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
+   register __m256i h6 = m256_const1_64( 0x991112C71A75B523 );
+   register __m256i h7 = m256_const1_64( 0xAE18A40B660FCC33 );
+   uint64_t bcount = 1;
+   // NOTE(review): UBI_BIG_4WAY presumably reads buf/bcount, updates h0..h7.
+   UBI_BIG_4WAY( 224, 0 );
+   sc->h0 = h0;
+   sc->h1 = h1;
+   sc->h2 = h2;
+   sc->h3 = h3;
+   sc->h4 = h4;
+   sc->h5 = h5;
+   sc->h6 = h6;
+   sc->h7 = h7;
+}
+// Finish from the h0..h7 saved in sc; data = last 16 bytes per lane.
+void
+skein512_4way_final16( skein512_4way_context *sc,  void *out, const void *data )
+{
+   __m256i *vdata = (__m256i*)data;
+   __m256i *buf = sc->buf;
+   register __m256i h0 = sc->h0;
+   register __m256i    h1 = sc->h1;
+   register __m256i    h2 = sc->h2;
+   register __m256i    h3 = sc->h3;
+   register __m256i    h4 = sc->h4;
+   register __m256i    h5 = sc->h5;
+   register __m256i    h6 = sc->h6;
+   register __m256i    h7 = sc->h7;
+   // Last block: two input vectors (16 bytes per lane), rest zero-filled.
+   const __m256i zero = m256_zero;
+   buf[0] = vdata[0];
+   buf[1] = vdata[1];
+   buf[2] = zero;
+   buf[3] = zero;
+   buf[4] = zero;
+   buf[5] = zero;
+   buf[6] = zero;
+   buf[7] = zero;
+
+   uint64_t bcount = 1;
+   UBI_BIG_4WAY( 352, 16 );
+   // buf[2..7] are already zero, so this leaves the whole buffer zeroed.
+   buf[0] = zero;
+   buf[1] = zero;
+
+   bcount = 0;
+   UBI_BIG_4WAY( 510, 8 );
+
+   casti_m256i( out, 0 ) = h0;
+   casti_m256i( out, 1 ) = h1;
+   casti_m256i( out, 2 ) = h2;
+   casti_m256i( out, 3 ) = h3;
+   casti_m256i( out, 4 ) = h4;
+   casti_m256i( out, 5 ) = h5;
+   casti_m256i( out, 6 ) = h6;
+   casti_m256i( out, 7 ) = h7;
+}
+
 void
 skein256_4way_update(void *cc, const void *data, size_t len)
 {
@@ -806,6 +1111,9 @@ skein256_4way_close(void *cc, void *dst)
        skein_big_close_4way(cc, 0, 0, dst, 32);
 }

+
+
+// Do not use with 128 bit data
 void
 skein512_4way_update(void *cc, const void *data, size_t len)
 {
--- a/algo/skein/skein-hash-4way.h
+++ b/algo/skein/skein-hash-4way.h
@@ -63,10 +63,16 @@ typedef struct
 typedef skein_8way_big_context skein512_8way_context;
 typedef skein_8way_big_context skein256_8way_context;

+void skein512_8way_full( skein512_8way_context *sc, void *out,
+                         const void *data, size_t len );
 void skein512_8way_init( skein512_8way_context *sc );
 void skein512_8way_update( void *cc, const void *data, size_t len );
 void skein512_8way_close( void *cc, void *dst );

+void skein512_8way_prehash64( skein512_8way_context *sc, const void *data );
+void skein512_8way_final16( skein512_8way_context *sc, void *out,
+     const void *data );
+
 void skein256_8way_init( skein256_8way_context *sc );
 void skein256_8way_update( void *cc, const void *data, size_t len );
 void skein256_8way_close( void *cc, void *dst );
@@ -85,6 +91,8 @@ typedef skein_4way_big_context skein512_4way_context;
 typedef skein_4way_big_context skein256_4way_context;

 void skein512_4way_init( skein512_4way_context *sc );
+void skein512_4way_full( skein512_4way_context *sc, void *out,
+                         const void *data, size_t len );
 void skein512_4way_update( void *cc, const void *data, size_t len );
 void skein512_4way_close( void *cc, void *dst );

@@ -92,6 +100,10 @@ void skein256_4way_init( skein256_4way_context *sc );
 void skein256_4way_update( void *cc, const void *data, size_t len );
 void skein256_4way_close( void *cc, void *dst );

+void skein512_4way_prehash64( skein512_4way_context *sc, const void *data );
+void skein512_4way_final16( skein512_4way_context *sc, void *out,
+     const void *data );
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -5,114 +5,126 @@

 #if defined(SKEIN_8WAY)

+ static __thread skein512_8way_context skein512_8way_ctx
+                                             __attribute__ ((aligned (64)));
+
 void skein2hash_8way( void *output, const void *input )
 {
-   skein512_8way_context ctx;
   uint64_t hash[16*8] __attribute__ ((aligned (128)));
+   skein512_8way_context ctx;
+   memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) ); // NOTE(review): nothing visible here fills skein512_8way_ctx (scanhash prehashes a local ctx) - confirm

-   skein512_8way_init( &ctx );
-   skein512_8way_update( &ctx, input, 80 );
-   skein512_8way_close( &ctx, hash );
-
-   skein512_8way_init( &ctx );
-   skein512_8way_update( &ctx, hash, 64 );
-   skein512_8way_close( &ctx, output );
+   skein512_8way_final16( &ctx, hash, input + (64*8) );
+   skein512_8way_full( &ctx, output, hash, 64 );
 }

 int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-    uint32_t hash[16*8] __attribute__ ((aligned (128)));
+    uint64_t hash[8*8] __attribute__ ((aligned (128)));
    uint32_t vdata[20*8] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-    uint32_t *hash7 = &(hash[49]);
+    uint64_t *hashq3 = &(hash[3*8]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
-    const uint32_t Htarg = ptarget[7];
+    const uint64_t targq3 = ((uint64_t*)ptarget)[3];
    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 8;
    uint32_t n = first_nonce;
-    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-    int thr_id = mythr->id; 
+    __m512i  *noncev = (__m512i*)vdata + 9; 
+    const int thr_id = mythr->id; 
+    const bool bench = opt_benchmark;
+    skein512_8way_context ctx;

    mm512_bswap32_intrlv80_8x64( vdata, pdata );
+    *noncev = mm512_intrlv_blend_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
+    skein512_8way_prehash64( &ctx, vdata );
    do
    {
-       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
-
-       skein2hash_8way( hash, vdata );
+       skein512_8way_final16( &ctx, hash, vdata + (16*8) );
+       skein512_8way_full( &ctx, hash, hash, 64 );

       for ( int lane = 0; lane < 8; lane++ )
-       if ( hash7[ lane<<1 ] <= Htarg )
+       if ( unlikely( hashq3[ lane ] <= targq3 && !bench ) )
       {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          if ( valid_hash( lane_hash, ptarget ) && !bench )
          {
-             pdata[19] = n + lane;
+             pdata[19] = bswap_32( n + lane );
             submit_lane_solution( work, lane_hash, mythr, lane );
          }
       }
+       *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
       n += 8;
-    } while ( (n < max_nonce-8) && !work_restart[thr_id].restart );
+    } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

-    *hashes_done = n - first_nonce + 1;
+    pdata[19] = n;
+    *hashes_done = n - first_nonce;
    return 0;
 }

 #elif defined(SKEIN_4WAY)

+static __thread skein512_4way_context skein512_4way_ctx
+                                           __attribute__ ((aligned (64)));
+
 void skein2hash_4way( void *output, const void *input )
 {
   skein512_4way_context ctx;
+   memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) ); // NOTE(review): nothing visible here fills skein512_4way_ctx (scanhash prehashes a local ctx) - confirm
   uint64_t hash[16*4] __attribute__ ((aligned (64)));

-   skein512_4way_init( &ctx );
-   skein512_4way_update( &ctx, input, 80 );
-   skein512_4way_close( &ctx, hash );
-
-   skein512_4way_init( &ctx );
-   skein512_4way_update( &ctx, hash, 64 );
-   skein512_4way_close( &ctx, output );
+   skein512_4way_final16( &ctx, hash, input + (64*4) );
+   skein512_4way_full( &ctx, output, hash, 64 );
 }

 int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-    uint32_t hash[16*4] __attribute__ ((aligned (64)));
+    uint64_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-    uint32_t *hash7 = &(hash[25]);
+    uint64_t *hash_q3 = &(hash[3*4]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
-    const uint32_t Htarg = ptarget[7];
+    const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 4;
    uint32_t n = first_nonce;
-    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int thr_id = mythr->id;  // thr_id arg is deprecated
+    __m256i  *noncev = (__m256i*)vdata + 9; 
+    const int thr_id = mythr->id;  
+    const bool bench = opt_benchmark;
+    skein512_4way_context ctx;

    mm256_bswap32_intrlv80_4x64( vdata, pdata );
+    skein512_4way_prehash64( &ctx, vdata );
+    *noncev = mm256_intrlv_blend_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
    do 
    {
-       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-
-       skein2hash_4way( hash, vdata );
+       skein512_4way_final16( &ctx, hash, vdata + (16*4) );
+       skein512_4way_full( &ctx, hash, hash, 64 );

       for ( int lane = 0; lane < 4; lane++ )
-       if ( hash7[ lane<<1 ] <= Htarg )
+       if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) ) // cheap top-qword pre-check, skipped in benchmark mode like the 8-way path
       {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          if ( valid_hash( lane_hash, ptarget ) && !bench )
          {
-             pdata[19] = n + lane;
+             pdata[19] = bswap_32( n + lane );
             submit_lane_solution( work, lane_hash, mythr, lane );
          }
       }
+       *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
       n += 4;
-    } while ( (n < max_nonce) && !work_restart[thr_id].restart );
+    } while ( (n < last_nonce) && !work_restart[thr_id].restart );

-    *hashes_done = n - first_nonce + 1;
+    pdata[19] = n;
+    *hashes_done = n - first_nonce;
    return 0;
 }

--- a/algo/whirlpool/sph_whirlpool.h
+++ b/algo/whirlpool/sph_whirlpool.h
@@ -120,6 +120,13 @@ void sph_whirlpool(void *cc, const void *data, size_t len);
 */
 void sph_whirlpool_close(void *cc, void *dst);

+#define sph_whirlpool512_full( cc, dst, data, len ) \
+do{ \
+   sph_whirlpool_init( cc ); \
+   sph_whirlpool( cc, data, len ); \
+   sph_whirlpool_close( cc, dst ); \
+}while(0)
+
 /**
 * WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL.
 */
--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -35,8 +35,7 @@ void skunk_8way_hash( void *output, const void *input )
     skunk_8way_ctx_holder ctx __attribute__ ((aligned (64)));
     memcpy( &ctx, &skunk_8way_ctx, sizeof(skunk_8way_ctx) );

-     skein512_8way_update( &ctx.skein, input, 80 );
-     skein512_8way_close( &ctx.skein, vhash );
+     skein512_8way_final16( &ctx.skein, vhash, input + (64*8) ); // last 16 bytes per lane; first 64 were prehashed
     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                        hash7, vhash, 512 );
  
@@ -104,35 +103,35 @@ int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
-   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-   const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;  
+   __m512i  *noncev = (__m512i*)vdata + 9; 
+   const int thr_id = mythr->id;  
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )
-      ((uint32_t*)ptarget)[7] = 0x0cff;
+   if ( bench )  ptarget[7] = 0x0fff;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   skein512_8way_prehash64( &skunk_8way_ctx.skein, vdata );
+   *noncev = mm512_intrlv_blend_32( 
+             _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                               n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
   do
   {
-      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
-
      skunk_8way_hash( hash, vdata );
-      pdata[19] = n;

      for ( int i = 0; i < 8; i++ )
-      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
-      if ( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
+      if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !bench ) )
      {
-         pdata[19] = n+i;
+         pdata[19] = bswap_32( n+i );
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
+      *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
      n +=8;
-   } while ( likely( ( n < max_nonce-8 ) && !(*restart) ) );
-
+   } while ( likely( ( n < last_nonce ) && !( *restart ) ) );
+   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
@@ -159,17 +158,16 @@ static __thread skunk_4way_ctx_holder skunk_4way_ctx;

 void skunk_4way_hash( void *output, const void *input )
 {
+     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t vhash[8*4] __attribute__ ((aligned (64)));

     skunk_4way_ctx_holder ctx __attribute__ ((aligned (64)));
     memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) );

-     skein512_4way_update( &ctx.skein, input, 80 );
-     skein512_4way_close( &ctx.skein, vhash );
+     skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
@@ -213,40 +211,40 @@ void skunk_4way_hash( void *output, const void *input )
 int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[4*8] __attribute__ ((aligned (64)));
-   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t hash[4*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
-   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-   const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   __m256i  *noncev = (__m256i*)vdata + 9; 
+   const int thr_id = mythr->id; 
+   volatile uint8_t *restart = &( work_restart[ thr_id ].restart );
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )
-      ((uint32_t*)ptarget)[7] = 0x0cff;
+   if ( bench )  ptarget[7] = 0x0fff;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   skein512_4way_prehash64( &skunk_4way_ctx.skein, vdata );
+   *noncev = mm256_intrlv_blend_32(
+             _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
-      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-
      skunk_4way_hash( hash, vdata );
-      pdata[19] = n;

      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg )
-      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+      if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !bench ) )
      {
-         pdata[19] = n+i;
+         pdata[19] = bswap_32( n + i );
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
+      *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
      n +=4;
-   } while ( ( n < max_nonce ) && !(*restart) );
-
-   *hashes_done = n - first_nonce + 1;
+   } while ( likely( ( n < last_nonce ) && !( *restart ) ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/x16/hex.c
+++ b/algo/x16/hex.c
@@ -30,9 +30,6 @@
  #include "algo/groestl/aes_ni/hash-groestl.h"
 #endif

-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
 static void hex_getAlgoString(const uint32_t* prevblock, char *output)
 {
   char *sptr = output;
@@ -50,6 +47,7 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
   *sptr = '\0';
 }

+/*
 union _hex_context_overlay
 {
 #if defined(__AES__)
@@ -66,7 +64,7 @@ union _hex_context_overlay
        sph_keccak512_context   keccak;
        hashState_luffa         luffa;
        cubehashParam           cube;
-        sph_shavite512_context  shavite;
+        shavite512_context      shavite;
        hashState_sd            simd;
        sph_hamsi512_context    hamsi;
        sph_fugue512_context    fugue;
@@ -75,18 +73,19 @@ union _hex_context_overlay
        SHA512_CTX              sha512;
 };
 typedef union _hex_context_overlay hex_context_overlay;
+*/

-static __thread hex_context_overlay hex_ctx;
+static __thread x16r_context_overlay hex_ctx;

 void hex_hash( void* output, const void* input )
 {
   uint32_t _ALIGN(128) hash[16];
-   hex_context_overlay ctx;
+   x16r_context_overlay ctx;
   memcpy( &ctx, &hex_ctx, sizeof(ctx) );
   void *in = (void*) input;
   int size = 80;

-   char elem = hashOrder[0];
+   char elem = x16r_hash_order[0];
   uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

   for ( int i = 0; i < 16; i++ )
@@ -160,9 +159,7 @@ void hex_hash( void* output, const void* input )
            }
         break;
         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in, size );
-            sph_shavite512_close( &ctx.shavite, hash );
+            shavite512_full( &ctx.shavite, hash, in, size );
         break;
         case SIMD:
             init_sd( &ctx.simd, 512 );
@@ -190,9 +187,7 @@ void hex_hash( void* output, const void* input )
            sph_hamsi512_close( &ctx.hamsi, hash );
         break;
         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in, size );
-             sph_fugue512_close( &ctx.fugue, hash );
+             sph_fugue512_full( &ctx.fugue, hash, in, size );
         break;
         case SHABAL:
            if ( i == 0 ) 
@@ -206,13 +201,12 @@ void hex_hash( void* output, const void* input )
         break;
         case WHIRLPOOL:
            if ( i == 0 ) 
-                sph_whirlpool( &ctx.whirlpool, in+64, 16 );
-            else
            {
-                sph_whirlpool_init( &ctx.whirlpool );
-                sph_whirlpool( &ctx.whirlpool, in, size );
+                sph_whirlpool( &ctx.whirlpool, in+64, 16 );
+                sph_whirlpool_close( &ctx.whirlpool, hash );
            }
-            sph_whirlpool_close( &ctx.whirlpool, hash );
+            else
+                sph_whirlpool512_full( &ctx.whirlpool, hash, in,  size );
         break;
         case SHA_512:
             SHA512_Init( &ctx.sha512 );
@@ -235,7 +229,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 4;
+   const uint32_t last_nonce = max_nonce;
   const int thr_id = mythr->id;
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
@@ -244,17 +238,18 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,

   mm128_bswap32_80( edata, pdata );
   
+   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t ntime = swab32(pdata[17]);
   if ( s_ntime != ntime )
   {
-      hex_getAlgoString( (const uint32_t*) (&edata[1]), hashOrder );
+      hex_getAlgoString( (const uint32_t*) (&edata[1]), x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime );
+              applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

   // Do midstate prehash on hash functions with block size <= 64 bytes.
-   const char elem = hashOrder[0];
+   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
   switch ( algo )
   {
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -287,30 +287,14 @@ void x16r_8way_hash_generic( void* output, const void* input )
            shavite512_4way_full( &ctx.shavite, vhash, vhash, size );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in4, size );
-            sph_shavite512_close( &ctx.shavite, hash4 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in5, size );
-            sph_shavite512_close( &ctx.shavite, hash5 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in6, size );
-            sph_shavite512_close( &ctx.shavite, hash6 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in7, size );
-            sph_shavite512_close( &ctx.shavite, hash7 );
+            shavite512_full( &ctx.shavite, hash0, in0, size );
+            shavite512_full( &ctx.shavite, hash1, in1, size );
+            shavite512_full( &ctx.shavite, hash2, in2, size );
+            shavite512_full( &ctx.shavite, hash3, in3, size );
+            shavite512_full( &ctx.shavite, hash4, in4, size );
+            shavite512_full( &ctx.shavite, hash5, in5, size );
+            shavite512_full( &ctx.shavite, hash6, in6, size );
+            shavite512_full( &ctx.shavite, hash7, in7, size );
 #endif
         break;
         case SIMD:
@@ -363,30 +347,14 @@ void x16r_8way_hash_generic( void* output, const void* input )
                          hash7, vhash );
         break;
         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in4, size );
-             sph_fugue512_close( &ctx.fugue, hash4 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in5, size );
-             sph_fugue512_close( &ctx.fugue, hash5 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in6, size );
-             sph_fugue512_close( &ctx.fugue, hash6 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in7, size );
-             sph_fugue512_close( &ctx.fugue, hash7 );
+             sph_fugue512_full( &ctx.fugue, hash0, in0, size );
+             sph_fugue512_full( &ctx.fugue, hash1, in1, size );
+             sph_fugue512_full( &ctx.fugue, hash2, in2, size );
+             sph_fugue512_full( &ctx.fugue, hash3, in3, size );
+             sph_fugue512_full( &ctx.fugue, hash4, in4, size );
+             sph_fugue512_full( &ctx.fugue, hash5, in5, size );
+             sph_fugue512_full( &ctx.fugue, hash6, in6, size );
+             sph_fugue512_full( &ctx.fugue, hash7, in7, size );
         break;
         case SHABAL:
             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -431,30 +399,14 @@ void x16r_8way_hash_generic( void* output, const void* input )
            }
            else
            {
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in0, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in1, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in2, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in3, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in4, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash4 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in5, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash5 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in6, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash6 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in7, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash7 );
+               sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash4, in4, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash5, in5, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash6, in6, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash7, in7, size );
            }
         break;
         case SHA_512:
@@ -576,8 +528,7 @@ void x16r_4way_prehash( void *vdata, void *pdata )
      break;
      case SKEIN:
         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         skein512_4way_init( &x16r_ctx.skein );
-         skein512_4way_update( &x16r_ctx.skein, vdata, 64 );
+         skein512_4way_prehash64( &x16r_ctx.skein, vdata );
      break;
      case LUFFA:
         mm128_bswap32_80( edata, pdata );
@@ -692,14 +643,12 @@ void x16r_4way_hash_generic( void* output, const void* input )
         break;
         case SKEIN:
            if ( i == 0 )
-               skein512_4way_update( &ctx.skein, input + (64<<2), 16 );
+               skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               skein512_4way_init( &ctx.skein );
-               skein512_4way_update( &ctx.skein, vhash, size );
+               skein512_4way_full( &ctx.skein, vhash, vhash, size );
            }
-            skein512_4way_close( &ctx.skein, vhash );
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case LUFFA:
@@ -755,18 +704,10 @@ void x16r_4way_hash_generic( void* output, const void* input )
            }
         break;
         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
+            shavite512_full( &ctx.shavite, hash0, in0, size );
+            shavite512_full( &ctx.shavite, hash1, in1, size );
+            shavite512_full( &ctx.shavite, hash2, in2, size );
+            shavite512_full( &ctx.shavite, hash3, in3, size );
         break;
         case SIMD:
            intrlv_2x128( vhash, in0, in1, size<<3 );
@@ -799,18 +740,10 @@ void x16r_4way_hash_generic( void* output, const void* input )
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
+             sph_fugue512_full( &ctx.fugue, hash0, in0, size );
+             sph_fugue512_full( &ctx.fugue, hash1, in1, size );
+             sph_fugue512_full( &ctx.fugue, hash2, in2, size );
+             sph_fugue512_full( &ctx.fugue, hash3, in3, size );
         break;
         case SHABAL:
             intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
@@ -841,18 +774,10 @@ void x16r_4way_hash_generic( void* output, const void* input )
            }
            else
            {
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in0, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in1, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in2, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in3, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
+               sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size );
            }
         break;
         case SHA_512:
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -121,7 +121,7 @@ union _x16r_8way_context_overlay
    echo_4way_context       echo;
 #else
    hashState_groestl       groestl;
-    sph_shavite512_context  shavite;
+    shavite512_context      shavite;
    hashState_echo          echo;
 #endif
 } __attribute__ ((aligned (64)));
@@ -152,7 +152,7 @@ union _x16r_4way_context_overlay
    luffa_2way_context      luffa;
    hashState_luffa         luffa1;
    cubehashParam           cube;
-    sph_shavite512_context  shavite;
+    shavite512_context      shavite;
    simd_2way_context       simd;
    hamsi512_4way_context   hamsi;
    sph_fugue512_context    fugue;
@@ -191,7 +191,7 @@ union _x16r_context_overlay
        sph_keccak512_context   keccak;
        hashState_luffa         luffa;
        cubehashParam           cube;
-        sph_shavite512_context  shavite;
+        shavite512_context      shavite;
        hashState_sd            simd;
        sph_hamsi512_context    hamsi;
        sph_fugue512_context    fugue;
--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -124,9 +124,7 @@ void x16r_hash_generic( void* output, const void* input )
                                         (byte*)in, size );
         break;
         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in, size );
-            sph_shavite512_close( &ctx.shavite, hash );
+            shavite512_full( &ctx.shavite, hash, in, size );
         break;
         case SIMD:
            simd_full( &ctx.simd, (BitSequence *)hash,
@@ -153,9 +151,7 @@ void x16r_hash_generic( void* output, const void* input )
            sph_hamsi512_close( &ctx.hamsi, hash );
         break;
         case FUGUE:
-            sph_fugue512_init( &ctx.fugue );
-            sph_fugue512( &ctx.fugue, in, size );
-            sph_fugue512_close( &ctx.fugue, hash );
+            sph_fugue512_full( &ctx.fugue, hash, in, size );
         break;
         case SHABAL:
            if ( i == 0 )
@@ -169,13 +165,12 @@ void x16r_hash_generic( void* output, const void* input )
         break;
         case WHIRLPOOL:
            if ( i == 0 )
-               sph_whirlpool( &ctx.whirlpool, in+64, 16 );
-            else
            {
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in, size );
+               sph_whirlpool( &ctx.whirlpool, in+64, 16 );
+               sph_whirlpool_close( &ctx.whirlpool, hash );
            }
-            sph_whirlpool_close( &ctx.whirlpool, hash );
+            else
+               sph_whirlpool512_full( &ctx.whirlpool, hash, in, size );
         break;
         case SHA_512:
            SHA512_Init( &ctx.sha512 );
@@ -238,7 +233,7 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
      nonce++;
   } while ( nonce < max_nonce && !(*restart) );
   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
+   *hashes_done = pdata[19] - first_nonce;
   return 0;
 }

--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -46,7 +46,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
      nonce++;
   } while ( nonce < max_nonce && !(*restart) );
   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
+   *hashes_done = pdata[19] - first_nonce;
   return 0;
 }

--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -35,9 +35,6 @@

 #if defined (X16RV2_8WAY)

-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
 union _x16rv2_8way_context_overlay
 {
    blake512_8way_context   blake;
@@ -60,7 +57,7 @@ union _x16rv2_8way_context_overlay
    echo_4way_context       echo;
 #else
    hashState_groestl       groestl;
-    sph_shavite512_context  shavite;
+    shavite512_context      shavite;
    hashState_echo          echo;
 #endif
 } __attribute__ ((aligned (64)));
@@ -96,7 +93,7 @@ void x16rv2_8way_hash( void* output, const void* input )

   for ( int i = 0; i < 16; i++ )
   {
-      const char elem = hashOrder[i];
+      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
@@ -374,30 +371,14 @@ void x16rv2_8way_hash( void* output, const void* input )
            shavite512_4way_full( &ctx.shavite, vhash, vhash, size );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in4, size );
-            sph_shavite512_close( &ctx.shavite, hash4 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in5, size );
-            sph_shavite512_close( &ctx.shavite, hash5 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in6, size );
-            sph_shavite512_close( &ctx.shavite, hash6 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in7, size );
-            sph_shavite512_close( &ctx.shavite, hash7 );
+            shavite512_full( &ctx.shavite, hash0, in0, size );
+            shavite512_full( &ctx.shavite, hash1, in1, size );
+            shavite512_full( &ctx.shavite, hash2, in2, size );
+            shavite512_full( &ctx.shavite, hash3, in3, size );
+            shavite512_full( &ctx.shavite, hash4, in4, size );
+            shavite512_full( &ctx.shavite, hash5, in5, size );
+            shavite512_full( &ctx.shavite, hash6, in6, size );
+            shavite512_full( &ctx.shavite, hash7, in7, size );
 #endif
         break;
         case SIMD:
@@ -451,30 +432,14 @@ void x16rv2_8way_hash( void* output, const void* input )
                          hash7, vhash );
         break;
         case FUGUE:
-            sph_fugue512_init( &ctx.fugue );
-            sph_fugue512( &ctx.fugue, in0, size );
-            sph_fugue512_close( &ctx.fugue, hash0 );
-            sph_fugue512_init( &ctx.fugue );
-            sph_fugue512( &ctx.fugue, in1, size );
-            sph_fugue512_close( &ctx.fugue, hash1 );
-            sph_fugue512_init( &ctx.fugue );
-            sph_fugue512( &ctx.fugue, in2, size );
-            sph_fugue512_close( &ctx.fugue, hash2 );
-            sph_fugue512_init( &ctx.fugue );
-            sph_fugue512( &ctx.fugue, in3, size );
-            sph_fugue512_close( &ctx.fugue, hash3 );
-            sph_fugue512_init( &ctx.fugue );
-            sph_fugue512( &ctx.fugue, in4, size );
-            sph_fugue512_close( &ctx.fugue, hash4 );
-            sph_fugue512_init( &ctx.fugue );
-            sph_fugue512( &ctx.fugue, in5, size );
-            sph_fugue512_close( &ctx.fugue, hash5 );
-            sph_fugue512_init( &ctx.fugue );
-            sph_fugue512( &ctx.fugue, in6, size );
-            sph_fugue512_close( &ctx.fugue, hash6 );
-            sph_fugue512_init( &ctx.fugue );
-            sph_fugue512( &ctx.fugue, in7, size );
-            sph_fugue512_close( &ctx.fugue, hash7 );
+            sph_fugue512_full( &ctx.fugue, hash0, in0, size );
+            sph_fugue512_full( &ctx.fugue, hash1, in1, size );
+            sph_fugue512_full( &ctx.fugue, hash2, in2, size );
+            sph_fugue512_full( &ctx.fugue, hash3, in3, size );
+            sph_fugue512_full( &ctx.fugue, hash4, in4, size );
+            sph_fugue512_full( &ctx.fugue, hash5, in5, size );
+            sph_fugue512_full( &ctx.fugue, hash6, in6, size );
+            sph_fugue512_full( &ctx.fugue, hash7, in7, size );
         break;
         case SHABAL:
            intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -519,30 +484,14 @@ void x16rv2_8way_hash( void* output, const void* input )
            }
            else
            {
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in0, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash0 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in1, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash1 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in2, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash2 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in3, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash3 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in4, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash4 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in5, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash5 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in6, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash6 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in7, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash7 );
+              sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
+              sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
+              sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size );
+              sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size );
+              sph_whirlpool512_full( &ctx.whirlpool, hash4, in4, size );
+              sph_whirlpool512_full( &ctx.whirlpool, hash5, in5, size );
+              sph_whirlpool512_full( &ctx.whirlpool, hash6, in6, size );
+              sph_whirlpool512_full( &ctx.whirlpool, hash7, in7, size );
            }
         break;
         case SHA_512:
@@ -651,17 +600,19 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,

   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
   const uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime );
+         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

   // Do midstate prehash on hash functions with block size <= 64 bytes.
-   const char elem = hashOrder[0];
+   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
   switch ( algo )
   {
@@ -737,9 +688,6 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,

 #elif defined (X16RV2_4WAY)

-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
 union _x16rv2_4way_context_overlay
 {
    blake512_4way_context   blake;
@@ -751,7 +699,7 @@ union _x16rv2_4way_context_overlay
    keccak512_4way_context  keccak;
    luffa_2way_context      luffa;
    cubehashParam           cube;
-    sph_shavite512_context  shavite;
+    shavite512_context      shavite;
    simd_2way_context       simd;
    hamsi512_4way_context   hamsi;
    sph_fugue512_context    fugue;
@@ -789,7 +737,7 @@ void x16rv2_4way_hash( void* output, const void* input )

   for ( int i = 0; i < 16; i++ )
   {
-      const char elem = hashOrder[i];
+      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
@@ -835,47 +783,47 @@ void x16rv2_4way_hash( void* output, const void* input )
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case KECCAK:
-             if ( i == 0 )
-             {
-                sph_tiger( &ctx.tiger, in0 + 64, 16 );
-                sph_tiger_close( &ctx.tiger, hash0 );
-                memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
-                sph_tiger( &ctx.tiger, in1 + 64, 16 );
-                sph_tiger_close( &ctx.tiger, hash1 );
-                memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
-                sph_tiger( &ctx.tiger, in2 + 64, 16 );
-                sph_tiger_close( &ctx.tiger, hash2 );
-                memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
-                sph_tiger( &ctx.tiger, in3 + 64, 16 );
-                sph_tiger_close( &ctx.tiger, hash3 );
-             }
-             else
-             {
-                sph_tiger_init( &ctx.tiger );
-			       sph_tiger( &ctx.tiger, in0, size );
-                sph_tiger_close( &ctx.tiger, hash0 );
-                sph_tiger_init( &ctx.tiger );
-                sph_tiger( &ctx.tiger, in1, size );
-                sph_tiger_close( &ctx.tiger, hash1 );
-                sph_tiger_init( &ctx.tiger );
-                sph_tiger( &ctx.tiger, in2, size );
-                sph_tiger_close( &ctx.tiger, hash2 );
-                sph_tiger_init( &ctx.tiger );
-                sph_tiger( &ctx.tiger, in3, size );
-                sph_tiger_close( &ctx.tiger, hash3 );
-             }
-             for ( int i = (24/4); i < (64/4); i++ )
+            if ( i == 0 )
+            {
+               sph_tiger( &ctx.tiger, in0 + 64, 16 );
+               sph_tiger_close( &ctx.tiger, hash0 );
+               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
+               sph_tiger( &ctx.tiger, in1 + 64, 16 );
+               sph_tiger_close( &ctx.tiger, hash1 );
+               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
+               sph_tiger( &ctx.tiger, in2 + 64, 16 );
+               sph_tiger_close( &ctx.tiger, hash2 );
+               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
+               sph_tiger( &ctx.tiger, in3 + 64, 16 );
+               sph_tiger_close( &ctx.tiger, hash3 );
+            }
+            else
+            {
+               sph_tiger_init( &ctx.tiger );
+		         sph_tiger( &ctx.tiger, in0, size );
+               sph_tiger_close( &ctx.tiger, hash0 );
+               sph_tiger_init( &ctx.tiger );
+               sph_tiger( &ctx.tiger, in1, size );
+               sph_tiger_close( &ctx.tiger, hash1 );
+               sph_tiger_init( &ctx.tiger );
+               sph_tiger( &ctx.tiger, in2, size );
+               sph_tiger_close( &ctx.tiger, hash2 );
+               sph_tiger_init( &ctx.tiger );
+               sph_tiger( &ctx.tiger, in3, size );
+               sph_tiger_close( &ctx.tiger, hash3 );
+            }
+            for ( int i = (24/4); i < (64/4); i++ )
                hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;

-             intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
-             keccak512_4way_init( &ctx.keccak );
-             keccak512_4way_update( &ctx.keccak, vhash, 64 );
-             keccak512_4way_close( &ctx.keccak, vhash );
-             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+            keccak512_4way_init( &ctx.keccak );
+            keccak512_4way_update( &ctx.keccak, vhash, 64 );
+            keccak512_4way_close( &ctx.keccak, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case SKEIN:
            if ( i == 0 )
-               skein512_4way_update( &ctx.skein, input + (64<<2), 16 );
+               skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -886,46 +834,46 @@ void x16rv2_4way_hash( void* output, const void* input )
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case LUFFA:
-             if ( i == 0 )
-             {
-                sph_tiger( &ctx.tiger, in0 + 64, 16 );
-                sph_tiger_close( &ctx.tiger, hash0 );
-                memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
-                sph_tiger( &ctx.tiger, in1 + 64, 16 );
-                sph_tiger_close( &ctx.tiger, hash1 );
-                memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
-                sph_tiger( &ctx.tiger, in2 + 64, 16 );
-                sph_tiger_close( &ctx.tiger, hash2 );
-                memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
-                sph_tiger( &ctx.tiger, in3 + 64, 16 );
-                sph_tiger_close( &ctx.tiger, hash3 );
-             }
-             else
-             {
-                sph_tiger_init( &ctx.tiger );
-                sph_tiger( &ctx.tiger, in0, size );
-                sph_tiger_close( &ctx.tiger, hash0 );
-                sph_tiger_init( &ctx.tiger );
-                sph_tiger( &ctx.tiger, in1, size );
-                sph_tiger_close( &ctx.tiger, hash1 );
-                sph_tiger_init( &ctx.tiger );
-                sph_tiger( &ctx.tiger, in2, size );
-                sph_tiger_close( &ctx.tiger, hash2 );
-                sph_tiger_init( &ctx.tiger );
-                sph_tiger( &ctx.tiger, in3, size );
-                sph_tiger_close( &ctx.tiger, hash3 );
-             }
-             for ( int i = (24/4); i < (64/4); i++ )
+            if ( i == 0 )
+            {
+               sph_tiger( &ctx.tiger, in0 + 64, 16 );
+               sph_tiger_close( &ctx.tiger, hash0 );
+               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
+               sph_tiger( &ctx.tiger, in1 + 64, 16 );
+               sph_tiger_close( &ctx.tiger, hash1 );
+               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
+               sph_tiger( &ctx.tiger, in2 + 64, 16 );
+               sph_tiger_close( &ctx.tiger, hash2 );
+               memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
+               sph_tiger( &ctx.tiger, in3 + 64, 16 );
+               sph_tiger_close( &ctx.tiger, hash3 );
+            }
+            else
+            {
+               sph_tiger_init( &ctx.tiger );
+               sph_tiger( &ctx.tiger, in0, size );
+               sph_tiger_close( &ctx.tiger, hash0 );
+               sph_tiger_init( &ctx.tiger );
+               sph_tiger( &ctx.tiger, in1, size );
+               sph_tiger_close( &ctx.tiger, hash1 );
+               sph_tiger_init( &ctx.tiger );
+               sph_tiger( &ctx.tiger, in2, size );
+               sph_tiger_close( &ctx.tiger, hash2 );
+               sph_tiger_init( &ctx.tiger );
+               sph_tiger( &ctx.tiger, in3, size );
+               sph_tiger_close( &ctx.tiger, hash3 );
+            }
+            for ( int i = (24/4); i < (64/4); i++ )
                hash0[i] = hash1[i] =  hash2[i] = hash3[i] = 0;

-             intrlv_2x128( vhash, hash0, hash1, 512 );
-             luffa_2way_init( &ctx.luffa, 512 );
-             luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-             dintrlv_2x128( hash0, hash1, vhash, 512 );
-             intrlv_2x128( vhash, hash2, hash3, 512 );
-             luffa_2way_init( &ctx.luffa, 512 );
-             luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-             dintrlv_2x128( hash2, hash3, vhash, 512 );
+            intrlv_2x128( vhash, hash0, hash1, 512 );
+            luffa_2way_init( &ctx.luffa, 512 );
+            luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            intrlv_2x128( vhash, hash2, hash3, 512 );
+            luffa_2way_init( &ctx.luffa, 512 );
+            luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+            dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         case CUBEHASH:
            if ( i == 0 )
@@ -959,18 +907,10 @@ void x16rv2_4way_hash( void* output, const void* input )
            }
         break;
         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
+            shavite512_full( &ctx.shavite, hash0, in0, size );
+            shavite512_full( &ctx.shavite, hash1, in1, size );
+            shavite512_full( &ctx.shavite, hash2, in2, size );
+            shavite512_full( &ctx.shavite, hash3, in3, size );
         break;
         case SIMD:
            intrlv_2x128( vhash, in0, in1, size<<3 );
@@ -1003,18 +943,10 @@ void x16rv2_4way_hash( void* output, const void* input )
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
+            sph_fugue512_full( &ctx.fugue, hash0, in0, size );
+            sph_fugue512_full( &ctx.fugue, hash1, in1, size );
+            sph_fugue512_full( &ctx.fugue, hash2, in2, size );
+            sph_fugue512_full( &ctx.fugue, hash3, in3, size );
         break;
         case SHABAL:
             intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
@@ -1045,18 +977,10 @@ void x16rv2_4way_hash( void* output, const void* input )
            }
            else
            {
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in0, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in1, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in2, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in3, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
+               sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash2, in2, size );
+               sph_whirlpool512_full( &ctx.whirlpool, hash3, in3, size );
            }
         break;
         case SHA_512:
@@ -1121,7 +1045,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id; 
-    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+    __m256i  *noncev = (__m256i*)vdata + 9; 
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   const bool bench = opt_benchmark;

@@ -1130,17 +1054,19 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,

   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
   const uint32_t ntime = bswap_32(pdata[17]);
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

   // Do midstate prehash on hash functions with block size <= 64 bytes.
-   const char elem = hashOrder[0];
+   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
   switch ( algo )
   {
@@ -1159,8 +1085,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
      break;
      case SKEIN:
         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         skein512_4way_init( &x16rv2_ctx.skein );
-         skein512_4way_update( &x16rv2_ctx.skein, vdata, 64 );
+         skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
      break;
      case CUBEHASH:
         mm128_bswap32_80( edata, pdata );
--- a/algo/x16/x16rv2.c
+++ b/algo/x16/x16rv2.c
@@ -51,7 +51,7 @@ union _x16rv2_context_overlay
        sph_keccak512_context   keccak;
        hashState_luffa         luffa;
        cubehashParam           cube;
-        sph_shavite512_context  shavite;
+        shavite512_context      shavite;
        hashState_sd            simd;
        sph_hamsi512_context    hamsi;
        sph_fugue512_context    fugue;
@@ -136,9 +136,7 @@ void x16rv2_hash( void* output, const void* input )
                                  (const byte*)in, size );
         break;
         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in, size );
-            sph_shavite512_close( &ctx.shavite, hash );
+            shavite512_full( &ctx.shavite, hash, in, size );
         break;
         case SIMD:
             init_sd( &ctx.simd, 512 );
@@ -162,9 +160,7 @@ void x16rv2_hash( void* output, const void* input )
             sph_hamsi512_close( &ctx.hamsi, hash );
         break;
         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in, size );
-             sph_fugue512_close( &ctx.fugue, hash );
+             sph_fugue512_full( &ctx.fugue, hash, in, size );
         break;
         case SHABAL:
             sph_shabal512_init( &ctx.shabal );
@@ -172,9 +168,7 @@ void x16rv2_hash( void* output, const void* input )
             sph_shabal512_close( &ctx.shabal, hash );
         break;
         case WHIRLPOOL:
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash );
+             sph_whirlpool512_full( &ctx.whirlpool, hash, in, size );
         break;
         case SHA_512:
             sph_tiger_init( &ctx.tiger );
@@ -237,7 +231,7 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
      nonce++;
   } while ( nonce < max_nonce && !(*restart) );
   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
+   *hashes_done = pdata[19] - first_nonce;
   return 0;
 }

--- a/algo/x16/x21s.c
+++ b/algo/x16/x21s.c
@@ -97,7 +97,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
      nonce++;
   } while ( nonce < max_nonce && !(*restart) );
   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
+   *hashes_done = pdata[19] - first_nonce;
   return 0;
 }

--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
--- a/algo/x17/sonoa.c
+++ b/algo/x17/sonoa.c
@@ -563,59 +563,31 @@ void sonoa_hash( void *state, const void *input )
 }

 int scanhash_sonoa( struct work *work, uint32_t max_nonce,
-	            uint64_t *hashes_done, struct thr_info *mythr )
+             uint64_t *hashes_done, struct thr_info *mythr)
 {
-   uint32_t _ALIGN(128) hash32[8];
-   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t edata[20] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
-   uint32_t n = pdata[19] - 1;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;

-   uint64_t htmax[] =
+   mm128_bswap32_80( edata, pdata );
+
+   do
   {
-	0,
-	0xF,
-	0xFF,
-	0xFFF,
-	0xFFFF,
-	0x10000000
-   };
-   uint32_t masks[] =
-   {
-	0xFFFFFFFF,
-	0xFFFFFFF0,
-	0xFFFFFF00,
-	0xFFFFF000,
-	0xFFFF0000,
-	0
-   };
-
-
-   // we need bigendian data...
-   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-
-   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
-   {
-      uint32_t mask = masks[m];
-      do
+      edata[19] = n;
+      sonoa_hash( hash64, edata );
+      if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
      {
-         pdata[19] = ++n;
-         be32enc(&endiandata[19], n);
-         sonoa_hash(hash32, endiandata);
-         if ( !( hash32[7] & mask ) )
-         if ( fulltest( hash32, ptarget ) && !opt_benchmark )
-            submit_solution( work, hash32, mythr );
-	   } while (n < max_nonce && !work_restart[thr_id].restart);
-	   break;
-	}
-   *hashes_done = n - first_nonce + 1;
+         pdata[19] = bswap_32( n );
+         submit_solution( work, hash64, mythr );
+      }
+      n++;
+   } while ( n < max_nonce && !work_restart[thr_id].restart );
+   *hashes_done = n - first_nonce;
   pdata[19] = n;
   return 0;
 }
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -74,9 +74,7 @@ void x17_8way_hash( void *state, const void *input )

     blake512_8way_full( &ctx.blake, vhash, input, 80 );

-     bmw512_8way_init( &ctx.bmw );
-     bmw512_8way_update( &ctx.bmw, vhash, 64 );
-     bmw512_8way_close( &ctx.bmw, vhash );
+     bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );

 #if defined(__VAES__)

@@ -106,9 +104,7 @@ void x17_8way_hash( void *state, const void *input )

 #endif

-     skein512_8way_init( &ctx.skein );
-     skein512_8way_update( &ctx.skein, vhash, 64 );
-     skein512_8way_close( &ctx.skein, vhash );
+     skein512_8way_full( &ctx.skein, vhash, vhash, 64 );

     jh512_8way_init( &ctx.jh );
     jh512_8way_update( &ctx.jh, vhash, 64 );
@@ -136,30 +132,14 @@ void x17_8way_hash( void *state, const void *input )
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash0, 64 );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash1, 64 );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash2, 64 );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash3, 64 );
-     sph_shavite512_close( &ctx.shavite, hash3 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash4, 64 );
-     sph_shavite512_close( &ctx.shavite, hash4 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash5, 64 );
-     sph_shavite512_close( &ctx.shavite, hash5 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash6, 64 );
-     sph_shavite512_close( &ctx.shavite, hash6 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash7, 64 );
-     sph_shavite512_close( &ctx.shavite, hash7 );
+     shavite512_full( &ctx.shavite, hash0, hash0, 64 );
+     shavite512_full( &ctx.shavite, hash1, hash1, 64 );
+     shavite512_full( &ctx.shavite, hash2, hash2, 64 );
+     shavite512_full( &ctx.shavite, hash3, hash3, 64 );
+     shavite512_full( &ctx.shavite, hash4, hash4, 64 );
+     shavite512_full( &ctx.shavite, hash5, hash5, 64 );
+     shavite512_full( &ctx.shavite, hash6, hash6, 64 );
+     shavite512_full( &ctx.shavite, hash7, hash7, 64 );

     intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
     intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
@@ -210,30 +190,14 @@ void x17_8way_hash( void *state, const void *input )
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );

-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash0, 64 );
-     sph_fugue512_close( &ctx.fugue, hash0 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash1, 64 );
-     sph_fugue512_close( &ctx.fugue, hash1 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash2, 64 );
-     sph_fugue512_close( &ctx.fugue, hash2 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash3, 64 );
-     sph_fugue512_close( &ctx.fugue, hash3 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash4, 64 );
-     sph_fugue512_close( &ctx.fugue, hash4 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash5, 64 );
-     sph_fugue512_close( &ctx.fugue, hash5 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash6, 64 );
-     sph_fugue512_close( &ctx.fugue, hash6 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash7, 64 );
-     sph_fugue512_close( &ctx.fugue, hash7 );
+     sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 );
+     sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 );
+     sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 );
+     sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 );
+     sph_fugue512_full( &ctx.fugue, hash4, hash4, 64 );
+     sph_fugue512_full( &ctx.fugue, hash5, hash5, 64 );
+     sph_fugue512_full( &ctx.fugue, hash6, hash6, 64 );
+     sph_fugue512_full( &ctx.fugue, hash7, hash7, 64 );

     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
@@ -245,30 +209,14 @@ void x17_8way_hash( void *state, const void *input )
     dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );

-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash4, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash4 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash5, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash5 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash6, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash6 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash7, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash7 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, 64 );

     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
@@ -287,18 +235,18 @@ void x17_8way_hash( void *state, const void *input )
 int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t hash32[8*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash32 = &(hash[7*8]);
+   uint32_t *hash32_d7 = &(hash32[7*8]);
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
-   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   __m512i  *noncev = (__m512i*)vdata + 9; 
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
-   const uint32_t targ32 = ptarget[7];
+   const uint32_t targ32_d7 = ptarget[7];
   const bool bench = opt_benchmark;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
@@ -307,12 +255,12 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
                                n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
-      x17_8way_hash( hash, vdata );
+      x17_8way_hash( hash32, vdata );

      for ( int lane = 0; lane < 8; lane++ )
-      if ( unlikely( ( hash32[ lane ] <= targ32 ) && !bench ) )
+      if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
      {
-         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         extr_lane_8x32( lane_hash, hash32, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) ) )
         {
            pdata[19] = bswap_32( n + lane );
@@ -378,9 +326,7 @@ void x17_4way_hash( void *state, const void *input )

     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

-     skein512_4way_init( &ctx.skein );
-     skein512_4way_update( &ctx.skein, vhash, 64 );
-     skein512_4way_close( &ctx.skein, vhash );
+     skein512_4way_full( &ctx.skein, vhash, vhash, 64 );

     jh512_4way_init( &ctx.jh );
     jh512_4way_update( &ctx.jh, vhash, 64 );
@@ -424,18 +370,10 @@ void x17_4way_hash( void *state, const void *input )

     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash0, 64 );
-     sph_fugue512_close( &ctx.fugue, hash0 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash1, 64 );
-     sph_fugue512_close( &ctx.fugue, hash1 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash2, 64 );
-     sph_fugue512_close( &ctx.fugue, hash2 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash3, 64 );
-     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_full( &ctx.fugue, hash0, hash0, 64 );
+     sph_fugue512_full( &ctx.fugue, hash1, hash1, 64 );
+     sph_fugue512_full( &ctx.fugue, hash2, hash2, 64 );
+     sph_fugue512_full( &ctx.fugue, hash3, hash3, 64 );

     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

@@ -445,18 +383,10 @@ void x17_4way_hash( void *state, const void *input )

     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
       
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, 64 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, 64 );

     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

@@ -474,10 +404,10 @@ void x17_4way_hash( void *state, const void *input )
 int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t hash32[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash32 = &(hash[ 7*4 ]);
+   uint32_t *hash32_d7 = &(hash32[ 7*4 ]);
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -485,7 +415,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
   __m256i  *noncev = (__m256i*)vdata + 9;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
-   const uint32_t targ32 = ptarget[7];
+   const uint32_t targ32_d7 = ptarget[7];
   const bool bench = opt_benchmark;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -493,12 +423,12 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
-      x17_4way_hash( hash, vdata );
+      x17_4way_hash( hash32, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( unlikely( hash32[ lane ] <= targ32 && !bench ) )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 && !bench ) )
      {  
-         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         extr_lane_4x32( lane_hash, hash32, lane, 256 );
         if ( valid_hash( lane_hash, ptarget ) )
         {
            pdata[19] = bswap_32( n + lane );
--- a/algo/x17/x17.c
+++ b/algo/x17/x17.c
@@ -169,8 +169,8 @@ int scanhash_x17( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash64, mythr );
      }
      n++;
-   } while ( n < max_nonce && !work_restart[thr_id].restart);
-   *hashes_done = n - first_nonce + 1;
+   } while ( n < max_nonce && !work_restart[thr_id].restart );
+   *hashes_done = n - first_nonce;
   pdata[19] = n;
   return 0;
 }
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -76,9 +76,7 @@ void xevan_8way_hash( void *output, const void *input )
     blake512_8way_full( &ctx.blake, vhash, input, 80 );
     memset( &vhash[8<<3], 0, 64<<3 );

-     bmw512_8way_init( &ctx.bmw );
-     bmw512_8way_update( &ctx.bmw, vhash, dataLen );
-     bmw512_8way_close( &ctx.bmw, vhash );
+     bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen );

 #if defined(__VAES__)

@@ -108,9 +106,7 @@ void xevan_8way_hash( void *output, const void *input )

 #endif

-     skein512_8way_init( &ctx.skein );
-     skein512_8way_update( &ctx.skein, vhash, dataLen );
-     skein512_8way_close( &ctx.skein, vhash );
+     skein512_8way_full( &ctx.skein, vhash, vhash, dataLen );

     jh512_8way_init( &ctx.jh );
     jh512_8way_update( &ctx.jh, vhash, dataLen );
@@ -138,30 +134,14 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );

-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash0, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash1, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash2, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash3, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash3 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash4, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash4 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash5, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash5 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash6, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash6 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash7, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash7 );
+     shavite512_full( &ctx.shavite, hash0, hash0, dataLen );
+     shavite512_full( &ctx.shavite, hash1, hash1, dataLen );
+     shavite512_full( &ctx.shavite, hash2, hash2, dataLen );
+     shavite512_full( &ctx.shavite, hash3, hash3, dataLen );
+     shavite512_full( &ctx.shavite, hash4, hash4, dataLen );
+     shavite512_full( &ctx.shavite, hash5, hash5, dataLen );
+     shavite512_full( &ctx.shavite, hash6, hash6, dataLen );
+     shavite512_full( &ctx.shavite, hash7, hash7, dataLen );

     intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
     intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
@@ -212,30 +192,14 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash, dataLen<<3 );

-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash0, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash0 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash1, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash1 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash2, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash2 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash3, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash3 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash4, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash4 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash5, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash5 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash6, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash6 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash7, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash7 );
+     sph_fugue512_full( &ctx.fugue, hash0, hash0, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash1, hash1, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash2, hash2, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash3, hash3, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash4, hash4, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash5, hash5, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash6, hash6, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash7, hash7, dataLen );

     intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7, dataLen<<3 );
@@ -247,30 +211,14 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash, dataLen<<3 );

-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash4 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash5 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash6 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash7 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, dataLen );

     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7, dataLen<<3 );
@@ -291,9 +239,7 @@ void xevan_8way_hash( void *output, const void *input )

     blake512_8way_full( &ctx.blake, vhash, vhash, dataLen );

-     bmw512_8way_init( &ctx.bmw );
-     bmw512_8way_update( &ctx.bmw, vhash, dataLen );
-     bmw512_8way_close( &ctx.bmw, vhash );
+     bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen );

 #if defined(__VAES__)

@@ -323,9 +269,7 @@ void xevan_8way_hash( void *output, const void *input )

 #endif

-     skein512_8way_init( &ctx.skein );
-     skein512_8way_update( &ctx.skein, vhash, dataLen );
-     skein512_8way_close( &ctx.skein, vhash );
+     skein512_8way_full( &ctx.skein, vhash, vhash, dataLen );

     jh512_8way_init( &ctx.jh );
     jh512_8way_update( &ctx.jh, vhash, dataLen );
@@ -353,30 +297,14 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );

-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash0, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash1, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash2, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash3, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash3 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash4, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash4 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash5, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash5 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash6, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash6 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash7, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash7 );
+     shavite512_full( &ctx.shavite, hash0, hash0, dataLen );
+     shavite512_full( &ctx.shavite, hash1, hash1, dataLen );
+     shavite512_full( &ctx.shavite, hash2, hash2, dataLen );
+     shavite512_full( &ctx.shavite, hash3, hash3, dataLen );
+     shavite512_full( &ctx.shavite, hash4, hash4, dataLen );
+     shavite512_full( &ctx.shavite, hash5, hash5, dataLen );
+     shavite512_full( &ctx.shavite, hash6, hash6, dataLen );
+     shavite512_full( &ctx.shavite, hash7, hash7, dataLen );

     intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
     intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
@@ -427,30 +355,14 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash, dataLen<<3 );

-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash0, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash0 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash1, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash1 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash2, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash2 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash3, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash3 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash4, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash4 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash5, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash5 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash6, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash6 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash7, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash7 );
+     sph_fugue512_full( &ctx.fugue, hash0, hash0, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash1, hash1, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash2, hash2, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash3, hash3, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash4, hash4, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash5, hash5, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash6, hash6, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash7, hash7, dataLen );

     intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7, dataLen<<3 );
@@ -462,30 +374,14 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash, dataLen<<3 );

-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash4 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash5 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash6 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash7 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash4, hash4, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash5, hash5, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash6, hash6, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash7, hash7, dataLen );

     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7, dataLen<<3 );
@@ -504,40 +400,43 @@ void xevan_8way_hash( void *output, const void *input )
 int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<3]);
+   uint32_t *hashd7 = &(hash[7*8]);
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
-   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   __m512i  *noncev = (__m512i*)vdata + 9;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
-   const uint32_t Htarg = ptarget[7];
+   const uint32_t targ32 = ptarget[7];
+   const bool bench = opt_benchmark;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32(
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
-      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
      xevan_8way_hash( hash, vdata );

      for ( int lane = 0; lane < 8; lane++ )
-      if unlikely( ( hash7[ lane ] <= Htarg ) )
+      if ( unlikely( ( hashd7[ lane ] <= targ32 ) && !bench ) )
      {
         extr_lane_8x32( lane_hash, hash, lane, 256 );
-         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         if ( likely( valid_hash( lane_hash, ptarget ) ) )
         {
-            pdata[19] = n + lane;
+            pdata[19] = bswap_32( n + lane );
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
+      *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
-
+   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
@@ -578,8 +477,6 @@ void xevan_4way_hash( void *output, const void *input )
     const int dataLen = 128;
     xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));

-     // parallel 4 way
-
     blake512_4way_full( &ctx.blake, vhash, input, 80 );
     memset( &vhash[8<<2], 0, 64<<2 );

@@ -598,9 +495,7 @@ void xevan_4way_hash( void *output, const void *input )
     // Parallel 4way
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

-     skein512_4way_init( &ctx.skein );
-     skein512_4way_update( &ctx.skein, vhash, dataLen );
-     skein512_4way_close( &ctx.skein, vhash );
+     skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );

     jh512_4way_init( &ctx.jh );
     jh512_4way_update( &ctx.jh, vhash, dataLen );
@@ -618,15 +513,11 @@ void xevan_4way_hash( void *output, const void *input )
     cube_2way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
     cube_2way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );

-     shavite512_2way_init( &ctx.shavite );
-     shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
-     shavite512_2way_init( &ctx.shavite );
-     shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
+     shavite512_2way_full( &ctx.shavite, vhashA, vhashA, dataLen );
+     shavite512_2way_full( &ctx.shavite, vhashB, vhashB, dataLen );

-     simd_2way_init( &ctx.simd, 512 );
-     simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
-     simd_2way_init( &ctx.simd, 512 );
-     simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
+     simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen );
+     simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen );

     dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
     dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
@@ -649,18 +540,10 @@ void xevan_4way_hash( void *output, const void *input )

     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash0, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash0 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash1, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash1 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash2, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash2 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash3, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_full( &ctx.fugue, hash0, hash0, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash1, hash1, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash2, hash2, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash3, hash3, dataLen );

     // Parallel 4way 32 bit
     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -672,18 +555,10 @@ void xevan_4way_hash( void *output, const void *input )
     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

     // Serial
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen );

     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

@@ -718,9 +593,7 @@ void xevan_4way_hash( void *output, const void *input )

     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

-     skein512_4way_init( &ctx.skein );
-     skein512_4way_update( &ctx.skein, vhash, dataLen );
-     skein512_4way_close( &ctx.skein, vhash );
+     skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );

     jh512_4way_init( &ctx.jh );
     jh512_4way_update( &ctx.jh, vhash, dataLen );
@@ -738,15 +611,11 @@ void xevan_4way_hash( void *output, const void *input )
     cube_2way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
     cube_2way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );

-     shavite512_2way_init( &ctx.shavite );
-     shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
-     shavite512_2way_init( &ctx.shavite );
-     shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
+     shavite512_2way_full( &ctx.shavite, vhashA, vhashA, dataLen );
+     shavite512_2way_full( &ctx.shavite, vhashB, vhashB, dataLen );

-     simd_2way_init( &ctx.simd, 512 );
-     simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
-     simd_2way_init( &ctx.simd, 512 );
-     simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
+     simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen );
+     simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen );

     dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
     dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
@@ -768,18 +637,10 @@ void xevan_4way_hash( void *output, const void *input )

     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash0, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash0 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash1, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash1 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash2, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash2 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash3, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_full( &ctx.fugue, hash0, hash0, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash1, hash1, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash2, hash2, dataLen );
+     sph_fugue512_full( &ctx.fugue, hash3, hash3, dataLen );

     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

@@ -789,18 +650,10 @@ void xevan_4way_hash( void *output, const void *input )

     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+     sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash2, hash2, dataLen );
+     sph_whirlpool512_full( &ctx.whirlpool, hash3, hash3, dataLen );

     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

@@ -818,41 +671,43 @@ void xevan_4way_hash( void *output, const void *input )
 int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[4*16] __attribute__ ((aligned (64)));
-   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   uint32_t *hash7 = &(hash[7<<2]);
+   uint32_t hash[16*4] __attribute__ ((aligned (128)));  // 4-lane xevan scan: receives the xevan_4way_hash output
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));  // filled from pdata by mm256_bswap32_intrlv80_4x64 below
+   uint32_t lane_hash[8] __attribute__ ((aligned (64))); // one lane's 256 bits, extracted for valid_hash
+   uint32_t *hashd7 = &(hash[7<<2]);  // hash[28 + lane] is pre-checked against ptarget[7]
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    int thr_id = mythr->id;
-   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-
-   const uint32_t Htarg = ptarget[7];
+   __m256i  *noncev = (__m256i*)vdata + 9; // vector inside vdata that carries the four lane nonces
+   const uint32_t targ32 = ptarget[7];  // captured before the benchmark override below
    const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;  // each iteration consumes 4 nonces
    uint32_t n = first_nonce;
+   const bool bench = opt_benchmark;
 
-   if ( opt_benchmark )
-      ptarget[7] = 0x0cff;
+   if ( bench )  ptarget[7] = 0x0cff;
 
    mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32(  // seed lanes with n..n+3 once; the loop only increments them
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
    do {
-      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-               _mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev );
-
       xevan_4way_hash( hash, vdata );
       for ( int lane = 0; lane < 4; lane++ )
-      if ( hash7[ lane ] <= Htarg )
+      if ( unlikely( hashd7[ lane ] <= targ32 ) && ! bench )  // 32-bit pre-filter; never taken in benchmark mode
       {
          extr_lane_4x32( lane_hash, hash, lane, 256 );
-	      if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+	      if ( valid_hash( lane_hash, ptarget ) )
          {
-             pdata[19] = n + lane;
+             pdata[19] = bswap_32( n + lane );  // NOTE(review): n is no longer byte-swapped into vdata, so it is swapped here — confirm
              submit_lane_solution( work, lane_hash, mythr, lane );
          }
       }
+      *noncev = _mm256_add_epi32( *noncev,  // advance every lane nonce by 4
+                                  m256_const1_64( 0x0000000400000000 ) );  // presumably broadcasts 4 into the upper 32 bits of each 64-bit element
       n += 4;
-   } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
-   *hashes_done = n - first_nonce + 1;
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;  // presumably lets the caller resume from the next unscanned nonce
+   *hashes_done = n - first_nonce;  // number of nonces covered by the loop
    return 0;
 }

--- a/algo/x17/xevan.c
+++ b/algo/x17/xevan.c
@@ -56,8 +56,6 @@ typedef struct {
 } xevan_ctx_holder;

 xevan_ctx_holder xevan_ctx __attribute__ ((aligned (64)));
-static __thread sph_blake512_context xevan_blake_mid
-                                        __attribute__ ((aligned (64)));

 void init_xevan_ctx()
 {
@@ -85,34 +83,23 @@ void init_xevan_ctx()
 #endif
 };

-void xevan_blake512_midstate( const void* input )
-{
-    memcpy( &xevan_blake_mid, &xevan_ctx.blake, sizeof xevan_blake_mid );
-    sph_blake512( &xevan_blake_mid, input, 64 );
-}
-
 void xevan_hash(void *output, const void *input)
 {
-        uint32_t _ALIGN(64) hash[32]; // 128 bytes required
+   uint32_t _ALIGN(64) hash[32]; // 128 bytes required
 	const int dataLen = 128;
-        xevan_ctx_holder ctx __attribute__ ((aligned (64)));
-        memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) );
-
-        const int midlen = 64;            // bytes
-        const int tail   = 80 - midlen;   // 16
-
-        memcpy( &ctx.blake, &xevan_blake_mid, sizeof xevan_blake_mid );
-        sph_blake512( &ctx.blake, input + midlen, tail );
-	sph_blake512_close(&ctx.blake, hash);
+   xevan_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) );

+   sph_blake512( &ctx.blake, input, 80 );
+   sph_blake512_close( &ctx.blake, hash );
 	memset(&hash[16], 0, 64);

 	sph_bmw512(&ctx.bmw, hash, dataLen);
 	sph_bmw512_close(&ctx.bmw, hash);

 #if defined(__AES__)
-        update_and_final_groestl( &ctx.groestl, (char*)hash,
-                                  (const char*)hash, dataLen*8 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash,
+                                     (const char*)hash, dataLen*8 );
 #else
 	sph_groestl512(&ctx.groestl, hash, dataLen);
 	sph_groestl512_close(&ctx.groestl, hash);
@@ -127,20 +114,20 @@ void xevan_hash(void *output, const void *input)
 	sph_keccak512(&ctx.keccak, hash, dataLen);
 	sph_keccak512_close(&ctx.keccak, hash);

-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
-                                (const BitSequence*)hash, dataLen );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
+                                 (const BitSequence*)hash, dataLen );

-        cubehashUpdateDigest( &ctx.cubehash, (byte*)hash,
-                              (const byte*) hash, dataLen );
+   cubehashUpdateDigest( &ctx.cubehash, (byte*)hash,
+                                 (const byte*) hash, dataLen );

 	sph_shavite512(&ctx.shavite, hash, dataLen);
 	sph_shavite512_close(&ctx.shavite, hash);

-        update_final_sd( &ctx.simd, (BitSequence *)hash,
+   update_final_sd( &ctx.simd, (BitSequence *)hash,
                         (const BitSequence *)hash, dataLen*8 );

 #if defined(__AES__)
-        update_final_echo( &ctx.echo, (BitSequence *) hash,
+   update_final_echo( &ctx.echo, (BitSequence *) hash,
                           (const BitSequence *) hash, dataLen*8 );
 #else
 	sph_echo512(&ctx.echo, hash, dataLen);
@@ -159,15 +146,15 @@ void xevan_hash(void *output, const void *input)
 	sph_whirlpool(&ctx.whirlpool, hash, dataLen);
 	sph_whirlpool_close(&ctx.whirlpool, hash);

-        SHA512_Update( &ctx.sha512, hash, dataLen );
-        SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
+   SHA512_Update( &ctx.sha512, hash, dataLen );
+   SHA512_Final( (unsigned char*) hash, &ctx.sha512 );

 	sph_haval256_5(&ctx.haval,(const void*) hash, dataLen);
 	sph_haval256_5_close(&ctx.haval, hash);

 	memset(&hash[8], 0, dataLen - 32);

-        memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) );
+   memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) );

 	sph_blake512(&ctx.blake, hash, dataLen);
 	sph_blake512_close(&ctx.blake, hash);
@@ -176,11 +163,11 @@ void xevan_hash(void *output, const void *input)
 	sph_bmw512_close(&ctx.bmw, hash);

 #if defined(__AES__)
-        update_and_final_groestl( &ctx.groestl, (char*)hash,
-                                  (const BitSequence*)hash, dataLen*8 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash,
+                              (const BitSequence*)hash, dataLen*8 );
 #else
 	sph_groestl512(&ctx.groestl, hash, dataLen);
-        sph_groestl512_close(&ctx.groestl, hash);
+   sph_groestl512_close(&ctx.groestl, hash);
 #endif

 	sph_skein512(&ctx.skein, hash, dataLen);
@@ -191,24 +178,25 @@ void xevan_hash(void *output, const void *input)

 	sph_keccak512(&ctx.keccak, hash, dataLen);
 	sph_keccak512_close(&ctx.keccak, hash);
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
-                                (const BitSequence*)hash, dataLen );

-        cubehashUpdateDigest( &ctx.cubehash, (byte*)hash,
-                              (const byte*) hash, dataLen );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
+                                 (const BitSequence*)hash, dataLen );
+
+   cubehashUpdateDigest( &ctx.cubehash, (byte*)hash,
+                                 (const byte*) hash, dataLen );

 	sph_shavite512(&ctx.shavite, hash, dataLen);
 	sph_shavite512_close(&ctx.shavite, hash);

-        update_final_sd( &ctx.simd, (BitSequence *)hash,
+   update_final_sd( &ctx.simd, (BitSequence *)hash,
                         (const BitSequence *)hash, dataLen*8 );

 #if defined(__AES__)
-        update_final_echo( &ctx.echo, (BitSequence *) hash,
+   update_final_echo( &ctx.echo, (BitSequence *) hash,
                           (const BitSequence *) hash, dataLen*8 );
 #else
-        sph_echo512(&ctx.echo, hash, dataLen);
-        sph_echo512_close(&ctx.echo, hash);
+   sph_echo512(&ctx.echo, hash, dataLen);
+   sph_echo512_close(&ctx.echo, hash);
 #endif

 	sph_hamsi512(&ctx.hamsi, hash, dataLen);
@@ -223,8 +211,8 @@ void xevan_hash(void *output, const void *input)
 	sph_whirlpool(&ctx.whirlpool, hash, dataLen);
 	sph_whirlpool_close(&ctx.whirlpool, hash);

-        SHA512_Update( &ctx.sha512, hash, dataLen );
-        SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
+   SHA512_Update( &ctx.sha512, hash, dataLen );
+   SHA512_Final( (unsigned char*) hash, &ctx.sha512 );

 	sph_haval256_5(&ctx.haval,(const void*) hash, dataLen);
 	sph_haval256_5_close(&ctx.haval, hash);
@@ -233,41 +221,33 @@ void xevan_hash(void *output, const void *input)
 }

 int scanhash_xevan( struct work *work, uint32_t max_nonce,
-	            uint64_t *hashes_done, struct thr_info *mythr )
+             uint64_t *hashes_done, struct thr_info *mythr)  // scalar xevan scan: one nonce per iteration, always returns 0
 {
-	uint32_t _ALIGN(64) hash[8];
-	uint32_t _ALIGN(64) endiandata[20];
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
-	volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   uint32_t edata[20] __attribute__((aligned(64)));  // hashing input, filled from pdata by mm128_bswap32_80
+   uint32_t hash64[8] __attribute__((aligned(64)));  // receives the xevan_hash output
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];  // current nonce
+   const uint32_t first_nonce = pdata[19];
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
 
-	if (opt_benchmark)
-		ptarget[7] = 0x0cff;
+   mm128_bswap32_80( edata, pdata );
 
-	for (int k=0; k < 19; k++)
-		be32enc(&endiandata[k], pdata[k]);
-
-   xevan_blake512_midstate( endiandata );
-	do {
-		be32enc(&endiandata[19], nonce);
-		xevan_hash(hash, endiandata);
-
-		if (hash[7] <= Htarg )
-      if ( fulltest( hash, ptarget ) && !opt_benchmark )
-	   {
-         pdata[19] = nonce;
-         submit_solution( work, hash, mythr );
-		}
-		nonce++;
-	} while ( nonce < max_nonce && !(*restart) );
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
+   do
+   {
+      edata[19] = n;  // nonce is stored as-is; only the submitted copy is byte-swapped
+      xevan_hash( hash64, edata );
+      if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )  // nothing is submitted in benchmark mode
+      {
+         pdata[19] = bswap_32( n );
+         submit_solution( work, hash64, mythr );
+      }
+      n++;
+   } while ( n < max_nonce && !work_restart[thr_id].restart );
+   pdata[19] = n;  // presumably lets the caller resume from the next unscanned nonce
+   *hashes_done = n - first_nonce;  // number of nonces hashed by the loop
+   return 0;
 }

 #endif
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -87,64 +87,40 @@ void x22i_8way_hash( void *output, const void *input )
   unsigned char hashA7[64]    __attribute__((aligned(32))) = {0};
   x22i_8way_ctx_overlay ctx;

-   blake512_8way_init( &ctx.blake );
-   blake512_8way_update( &ctx.blake, input, 80 );
-   blake512_8way_close( &ctx.blake, vhash );
+   blake512_8way_full( &ctx.blake, vhash, input, 80 );

-   bmw512_8way_init( &ctx.bmw );
-   bmw512_8way_update( &ctx.bmw, vhash, 64 );
-   bmw512_8way_close( &ctx.bmw, vhash );
+   bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );

 #if defined(__VAES__)

-     rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
+   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
+   groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
+   groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );

-     rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
+   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

 #else

-   dintrlv_8x64_512( hash0, hash1, hash2, hash3,
-                     hash4, hash5, hash6, hash7, vhash );
+   dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );

-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                  (const char*)hash0, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                  (const char*)hash1, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                  (const char*)hash2, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                  (const char*)hash3, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash4,
-                                  (const char*)hash4, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash5,
-                                  (const char*)hash5, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash6,
-                                  (const char*)hash6, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash7,
-                                  (const char*)hash7, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );

-   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
-                           hash4, hash5, hash6, hash7 );
-   
 #endif

-   skein512_8way_init( &ctx.skein );
-   skein512_8way_update( &ctx.skein, vhash, 64 );
-   skein512_8way_close( &ctx.skein, vhash );
-
+   skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
+     
   jh512_8way_init( &ctx.jh );
   jh512_8way_update( &ctx.jh, vhash, 64 );
   jh512_8way_close( &ctx.jh, vhash );
@@ -155,22 +131,16 @@ void x22i_8way_hash( void *output, const void *input )

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-   luffa_4way_init( &ctx.luffa, 512 );
-   luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
-   luffa_4way_init( &ctx.luffa, 512 );
-   luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
+   luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
+   luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );

-   cube_4way_init( &ctx.cube, 512, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
-   cube_4way_init( &ctx.cube, 512, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
+   cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
+   cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );

 #if defined(__VAES__)

-   shavite512_4way_init( &ctx.shavite );
-   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
-   shavite512_4way_init( &ctx.shavite );
-   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
+   shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
+   shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 );

 #else

@@ -207,17 +177,13 @@ void x22i_8way_hash( void *output, const void *input )

 #endif

-   simd_4way_init( &ctx.simd, 512 );
-   simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
-   simd_4way_init( &ctx.simd, 512 );
-   simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
+   simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 );
+   simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );

 #if defined(__VAES__)

-   echo_4way_init( &ctx.echo, 512 );
-   echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
-   echo_4way_init( &ctx.echo, 512 );
-   echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
+   echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
+   echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

@@ -226,30 +192,22 @@ void x22i_8way_hash( void *output, const void *input )
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash0,
-                            (const BitSequence*)hash0, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash1,
-                            (const BitSequence*)hash1, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash2,
-                            (const BitSequence*)hash2, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash3,
-                            (const BitSequence*)hash3, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash4,
-                            (const BitSequence*)hash4, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash5,
-                            (const BitSequence*)hash5, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash6,
-                            (const BitSequence*)hash6, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash7,
-                            (const BitSequence*)hash7, 512 );
+   echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                   (const BitSequence *)hash0, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                   (const BitSequence *)hash1, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                   (const BitSequence *)hash2, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                   (const BitSequence *)hash3, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash4, 512,
+                   (const BitSequence *)hash4, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash5, 512,
+                   (const BitSequence *)hash5, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash6, 512,
+                   (const BitSequence *)hash6, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash7, 512,
+                   (const BitSequence *)hash7, 64 );

   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
                           hash4, hash5, hash6, hash7 );
@@ -443,6 +401,55 @@ void x22i_8way_hash( void *output, const void *input )
   sha256_8way_close( &ctx.sha256, output );
 }

+int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr )
+{  // 8-lane x22i scan: hashes 8 nonces per iteration and submits lanes that pass valid_hash; always returns 0
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));   // receives the x22i_8way_hash output
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));  // filled from pdata by mm512_bswap32_intrlv80_8x64 below
+   uint32_t lane_hash[8] __attribute__ ((aligned (64))); // one lane's 256 bits, extracted for valid_hash
+   uint32_t *hashd7 = &(hash[7*8]);  // hash[56 + lane] is pre-checked against ptarget[7]
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;  // each iteration consumes 8 nonces
+   __m512i  *noncev = (__m512i*)vdata + 9;  // vector inside vdata that carries the eight lane nonces
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const uint32_t targ32 = ptarget[7];  // captured before the benchmark override below
+   const bool bench = opt_benchmark;
+
+   if ( bench )  ptarget[7] = 0x08ff;
+
+   InitializeSWIFFTX();
+   
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32(  // seed lanes with n..n+7 once; the loop only increments them
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
+   do
+   {
+      x22i_8way_hash( hash, vdata );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( ( hashd7[ lane ] <= targ32 ) && !bench ) )  // 32-bit pre-filter; never taken in benchmark mode
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) ) )
+         {
+            pdata[19] = bswap_32( n + lane );  // NOTE(review): n is inserted into vdata unswapped, so it is swapped here — confirm
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      *noncev = _mm512_add_epi32( *noncev,  // advance every lane nonce by 8
+                                  m512_const1_64( 0x0000000800000000 ) );  // presumably broadcasts 8 into the upper 32 bits of each 64-bit element
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;  // presumably lets the caller resume from the next unscanned nonce
+   *hashes_done = n - first_nonce;  // number of nonces covered by the loop
+   return 0;
+}
+
+/*
 int scanhash_x22i_8way( struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
@@ -488,6 +495,7 @@ int scanhash_x22i_8way( struct work* work, uint32_t max_nonce,
   *hashes_done = n - first_nonce;
   return 0;
 }
+*/

 #elif defined(X22I_4WAY)

@@ -531,33 +539,21 @@ void x22i_4way_hash( void *output, const void *input )
   unsigned char hashA3[64]    __attribute__((aligned(32))) = {0};
   x22i_ctx_overlay ctx;

-   blake512_4way_init( &ctx.blake );
-   blake512_4way_update( &ctx.blake, input, 80 );
-   blake512_4way_close( &ctx.blake, vhash );
+   blake512_4way_full( &ctx.blake, vhash, input, 80 );

   bmw512_4way_init( &ctx.bmw );
   bmw512_4way_update( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );
   dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
-   
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                  (const char*)hash0, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                  (const char*)hash1, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                  (const char*)hash2, 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                  (const char*)hash3, 512 );
+
+   groestl512_full( &ctx.groestl, (char*)hash0, (const char*)hash0, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash1, (const char*)hash1, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash2, (const char*)hash2, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash3, (const char*)hash3, 512 );

   intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

-   skein512_4way_init( &ctx.skein );
-   skein512_4way_update( &ctx.skein, vhash, 64 );
-   skein512_4way_close( &ctx.skein, vhash );
+   skein512_4way_full( &ctx.skein, vhash, vhash, 64 );

   jh512_4way_init( &ctx.jh );
   jh512_4way_update( &ctx.jh, vhash, 64 );
@@ -569,41 +565,29 @@ void x22i_4way_hash( void *output, const void *input )

   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

-   luffa_2way_init( &ctx.luffa, 512 );
-   luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
-   luffa_2way_init( &ctx.luffa, 512 );
-   luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
+   luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
+   luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );

-   cube_2way_init( &ctx.cube, 512, 16, 32 );
-   cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
-   cube_2way_init( &ctx.cube, 512, 16, 32 );
-   cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
+   cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
+   cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
+   
+   shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
+   shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );

-   shavite512_2way_init( &ctx.shavite );
-   shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
-   shavite512_2way_init( &ctx.shavite );
-   shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
-
-   simd_2way_init( &ctx.simd, 512 );
-   simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
-   simd_2way_init( &ctx.simd, 512 );
-   simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
+   simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
+   simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );

   dintrlv_2x128_512( hash0, hash1, vhashA );
   dintrlv_2x128_512( hash2, hash3, vhashB );
   
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash0,
-                            (const BitSequence*)hash0, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash1,
-                            (const BitSequence*)hash1, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash2,
-                            (const BitSequence*)hash2, 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash3,
-                            (const BitSequence*)hash3, 512 );
+   echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                   (const BitSequence *)hash0, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                   (const BitSequence *)hash1, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                   (const BitSequence *)hash2, 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                   (const BitSequence *)hash3, 64 );

   intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

@@ -722,44 +706,47 @@ void x22i_4way_hash( void *output, const void *input )
 int scanhash_x22i_4way( struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[4*16] __attribute__ ((aligned (64)));
-   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   uint32_t *hash7 = &(hash[7<<2]);
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));    // 4-lane x22i scan: receives the x22i_4way_hash output
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));  // filled from pdata by mm256_bswap32_intrlv80_4x64 below
+   uint32_t lane_hash[8] __attribute__ ((aligned (64))); // one lane's 256 bits, extracted for valid_hash
+   uint32_t *hashd7 = &(hash[ 7*4 ]);  // hash[28 + lane] is pre-checked against ptarget[7]
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];
-   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   const uint32_t last_nonce = max_nonce - 4;  // each iteration consumes 4 nonces
+   __m256i  *noncev = (__m256i*)vdata + 9;  // vector inside vdata that carries the four lane nonces
    uint32_t n = first_nonce;
    const int thr_id = mythr->id;
-   const uint32_t Htarg = ptarget[7];
+   const uint32_t targ32 = ptarget[7];  // captured before the benchmark override below
+   const bool bench = opt_benchmark;
+
+   if ( bench ) ptarget[7] = 0x08ff;
 
-   if (opt_benchmark)
-      ((uint32_t*)ptarget)[7] = 0x08ff;
-   
    InitializeSWIFFTX();
-
+   
    mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32(  // seed lanes with n..n+3 once; the loop only increments them
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
    do
    {
-      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
       x22i_4way_hash( hash, vdata );
 
       for ( int lane = 0; lane < 4; lane++ )
-      if unlikely( ( hash7[ lane ] <= Htarg ) )
+      if ( unlikely( hashd7[ lane ] <= targ32 && !bench ) )  // 32-bit pre-filter; never taken in benchmark mode
       {
          extr_lane_4x32( lane_hash, hash, lane, 256 );
-         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         if ( valid_hash( lane_hash, ptarget ) )
          {
-            pdata[19] = n + lane;
+            pdata[19] = bswap_32( n + lane );  // NOTE(review): n is inserted into vdata unswapped, so it is swapped here — confirm
             submit_lane_solution( work, lane_hash, mythr, lane );
          }
       }
+      *noncev = _mm256_add_epi32( *noncev,  // advance every lane nonce by 4
+                                  m256_const1_64( 0x0000000400000000 ) );  // presumably broadcasts 4 into the upper 32 bits of each 64-bit element
       n += 4;
-   } while ( likely( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ) );
-
-   *hashes_done = n - first_nonce + 1;
+   } while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );  // NOTE(review): '<=' here, while the xevan and 8-way scanners in this patch use '<' — confirm which is intended
+   pdata[19] = n;  // presumably lets the caller resume from the next unscanned nonce
+   *hashes_done = n - first_nonce;  // number of nonces covered by the loop
    return 0;
 }

--- a/algo/x22/x22i.c
+++ b/algo/x22/x22i.c
@@ -167,40 +167,38 @@ void x22i_hash( void *output, const void *input )
 	memcpy(output, hash, 32);
 }

-int scanhash_x22i( struct work* work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr )
+int scanhash_x22i( struct work *work, uint32_t max_nonce,
+             uint64_t *hashes_done, struct thr_info *mythr)  // scalar x22i scan: one nonce per iteration, always returns 0
 {
-   uint32_t endiandata[20] __attribute__((aligned(64)));
-   uint32_t hash[8] __attribute__((aligned(64)));
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
-   uint32_t n = first_nonce;
+   uint32_t edata[20] __attribute__((aligned(64)));  // hashing input, filled from pdata by mm128_bswap32_80
+   uint32_t hash64[8] __attribute__((aligned(64)));  // receives the x22i_hash output
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];  // current nonce
+   const uint32_t first_nonce = n;
    const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
 
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x08ff;
-
-	for (int k=0; k < 20; k++)
-		be32enc(&endiandata[k], pdata[k]);
+   if ( bench ) ptarget[7] = 0x08ff;
+   
+   mm128_bswap32_80( edata, pdata );
 
    InitializeSWIFFTX();
-
+   
    do
    {
-       pdata[19] = ++n;
-       be32enc( &endiandata[19], n );
-
-       x22i_hash( hash, endiandata );
-
-       if ( hash[7] < Htarg )
-       if ( fulltest( hash, ptarget ) && !opt_benchmark )
-           submit_solution( work, hash, mythr );
-    } while ( n < max_nonce && !work_restart[thr_id].restart );
-
-	 *hashes_done = pdata[19] - first_nonce;
-	 return 0;
+      edata[19] = n;  // nonce is stored as-is; only the submitted copy is byte-swapped
+      x22i_hash( hash64, edata );
+      if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )  // nothing is submitted in benchmark mode
+      {
+         pdata[19] = bswap_32( n );
+         submit_solution( work, hash64, mythr );
+      }
+      n++;
+   } while ( n < max_nonce && !work_restart[thr_id].restart );
+   *hashes_done = n - first_nonce;  // number of nonces hashed by the loop
+   pdata[19] = n;  // presumably lets the caller resume from the next unscanned nonce
+   return 0;
 }

 #endif
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -530,6 +530,55 @@ void x25x_8way_hash( void *output, const void *input )
   blake2s_8way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
 }

+int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr )
+{  // 8-lane x25x scan: hashes 8 nonces per iteration and submits lanes that pass valid_hash; always returns 0
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));   // receives the x25x_8way_hash output
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));  // filled from pdata by mm512_bswap32_intrlv80_8x64 below
+   uint32_t lane_hash[8] __attribute__ ((aligned (64))); // one lane's 256 bits, extracted for valid_hash
+   uint32_t *hashd7 = &(hash[7*8]);  // hash[56 + lane] is pre-checked against ptarget[7]
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;  // each iteration consumes 8 nonces
+   __m512i  *noncev = (__m512i*)vdata + 9;  // vector inside vdata that carries the eight lane nonces
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const uint32_t targ32 = ptarget[7];  // captured before the benchmark override below
+   const bool bench = opt_benchmark;
+
+   if ( bench )  ptarget[7] = 0x08ff;
+
+   InitializeSWIFFTX();
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32(  // seed lanes with n..n+7 once; the loop only increments them
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
+   do
+   {
+      x25x_8way_hash( hash, vdata );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( ( hashd7[ lane ] <= targ32 ) && !bench ) )  // 32-bit pre-filter; never taken in benchmark mode
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) ) )
+         {
+            pdata[19] = bswap_32( n + lane );  // NOTE(review): n is inserted into vdata unswapped, so it is swapped here — confirm
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      *noncev = _mm512_add_epi32( *noncev,  // advance every lane nonce by 8
+                                  m512_const1_64( 0x0000000800000000 ) );  // presumably broadcasts 8 into the upper 32 bits of each 64-bit element
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;  // presumably lets the caller resume from the next unscanned nonce
+   *hashes_done = n - first_nonce;  // number of nonces covered by the loop
+   return 0;
+}
+
+/*
 int scanhash_x25x_8way( struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
@@ -574,6 +623,7 @@ int scanhash_x25x_8way( struct work* work, uint32_t max_nonce,
   *hashes_done = n - first_nonce;
   return 0;
 }
+*/

 #elif defined(X25X_4WAY)

@@ -614,9 +664,7 @@ void x25x_4way_hash( void *output, const void *input )
   unsigned char vhashX[24][64*4] __attribute__ ((aligned (64)));
   x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));

-   blake512_4way_init( &ctx.blake );
-   blake512_4way_update( &ctx.blake, input, 80 );
-   blake512_4way_close( &ctx.blake, vhash );
+   blake512_4way_full( &ctx.blake, vhash, input, 80 );
   dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );

   bmw512_4way_init( &ctx.bmw );
@@ -624,24 +672,13 @@ void x25x_4way_hash( void *output, const void *input )
   bmw512_4way_close( &ctx.bmw, vhash );
   dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );

-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash0[2],
-                                  (const char*)hash0[1], 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash1[2],
-                                  (const char*)hash1[1], 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash2[2],
-                                  (const char*)hash2[1], 512 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash3[2],
-                                  (const char*)hash3[1], 512 );
+   groestl512_full( &ctx.groestl, (char*)hash0[2], (const char*)hash0[1], 512 );
+   groestl512_full( &ctx.groestl, (char*)hash1[2], (const char*)hash1[1], 512 );
+   groestl512_full( &ctx.groestl, (char*)hash2[2], (const char*)hash2[1], 512 );
+   groestl512_full( &ctx.groestl, (char*)hash3[2], (const char*)hash3[1], 512 );

   intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );
-
-   skein512_4way_init( &ctx.skein );
-   skein512_4way_update( &ctx.skein, vhash, 64 );
-   skein512_4way_close( &ctx.skein, vhash );
+   skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
   dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );

   jh512_4way_init( &ctx.jh );
@@ -654,32 +691,20 @@ void x25x_4way_hash( void *output, const void *input )
   keccak512_4way_close( &ctx.keccak, vhash );
   dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );

-   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0[6],
-                                (const BitSequence*)hash0[5], 64 );
-   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1[6],
-                                (const BitSequence*)hash1[5], 64 );
-   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2[6],
-                                (const BitSequence*)hash2[5], 64 );
-   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3[6],
-                                (const BitSequence*)hash3[5], 64 );
-
-   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash0[7],
-                              (const byte*)hash0[6], 64 );
-   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash1[7],
-                              (const byte*)hash1[6], 64 );
-   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash2[7],
-                              (const byte*)hash2[6], 64 );
-   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash3[7],
-                              (const byte*)hash3[6], 64 );
+   luffa_full( &ctx.luffa, (BitSequence*)hash0[6], 512,
+                     (const BitSequence*)hash0[5], 64 );
+   luffa_full( &ctx.luffa, (BitSequence*)hash1[6], 512,
+                     (const BitSequence*)hash1[5], 64 );
+   luffa_full( &ctx.luffa, (BitSequence*)hash2[6], 512,
+                     (const BitSequence*)hash2[5], 64 );
+   luffa_full( &ctx.luffa, (BitSequence*)hash3[6], 512,
+                     (const BitSequence*)hash3[5], 64 );

+   cubehash_full( &ctx.cube, (byte*)hash0[7], 512, (const byte*)hash0[6], 64 );
+   cubehash_full( &ctx.cube, (byte*)hash1[7], 512, (const byte*)hash1[6], 64 );
+   cubehash_full( &ctx.cube, (byte*)hash2[7], 512, (const byte*)hash2[6], 64 );
+   cubehash_full( &ctx.cube, (byte*)hash3[7], 512, (const byte*)hash3[6], 64 );
+   
   sph_shavite512_init(&ctx.shavite);
   sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
   sph_shavite512_close(&ctx.shavite, hash0[8]);
@@ -693,31 +718,23 @@ void x25x_4way_hash( void *output, const void *input )
   sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64);
   sph_shavite512_close(&ctx.shavite, hash3[8]);

-   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)hash0[9],
-                         (const BitSequence*)hash0[8], 512 );
-   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)hash1[9],
-                         (const BitSequence*)hash1[8], 512 );
-   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)hash2[9],
-                         (const BitSequence*)hash2[8], 512 );
-   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)hash3[9],
-                         (const BitSequence*)hash3[8], 512 );
+   simd_full( &ctx.simd, (BitSequence*)hash0[9],
+                   (const BitSequence*)hash0[8], 512 );
+   simd_full( &ctx.simd, (BitSequence*)hash1[9],
+                   (const BitSequence*)hash1[8], 512 );
+   simd_full( &ctx.simd, (BitSequence*)hash2[9],
+                   (const BitSequence*)hash2[8], 512 );
+   simd_full( &ctx.simd, (BitSequence*)hash3[9],
+                   (const BitSequence*)hash3[8], 512 );

-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash0[10],
-                            (const BitSequence*)hash0[9], 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash1[10],
-                            (const BitSequence*)hash1[9], 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash2[10],
-                            (const BitSequence*)hash2[9], 512 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)hash3[10],
-                            (const BitSequence*)hash3[9], 512 );
+   echo_full( &ctx.echo, (BitSequence *)hash0[10], 512,
+                   (const BitSequence *)hash0[ 9], 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash1[10], 512,
+                   (const BitSequence *)hash1[ 9], 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash2[10], 512,
+                   (const BitSequence *)hash2[ 9], 64 );
+   echo_full( &ctx.echo, (BitSequence *)hash3[10], 512,
+                   (const BitSequence *)hash3[ 9], 64 );

   intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] );

@@ -870,43 +887,46 @@ void x25x_4way_hash( void *output, const void *input )
 int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[16*4] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   uint32_t *hash7 = &(hash[7<<2]);
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hashd7 = &(hash[ 7*4 ]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
-   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-   uint32_t n = first_nonce;
   const uint32_t last_nonce = max_nonce - 4;
+   __m256i  *noncev = (__m256i*)vdata + 9;
+   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
-   const uint32_t Htarg = ptarget[7];
+   const uint32_t targ32 = ptarget[7];
+   const bool bench = opt_benchmark;

-   if (opt_benchmark)
-      ((uint32_t*)ptarget)[7] = 0x08ff;
+   if ( bench ) ptarget[7] = 0x08ff;

   InitializeSWIFFTX();

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32(
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
-      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      x25x_4way_hash( hash, vdata );

-      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( unlikely( hashd7[ lane ] <= targ32 && !bench ) )
      {
         extr_lane_4x32( lane_hash, hash, lane, 256 );
-         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         if ( valid_hash( lane_hash, ptarget ) )
         {
-              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+            pdata[19] = bswap_32( n + lane );
+            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
+      *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
      n += 4;
-   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
-
+   } while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
--- a/algo/x22/x25x.c
+++ b/algo/x22/x25x.c
@@ -201,42 +201,38 @@ void x25x_hash( void *output, const void *input )
 	memcpy(output, &hash[24], 32);
 }

-int scanhash_x25x( struct work* work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr )
+int scanhash_x25x( struct work *work, uint32_t max_nonce,
+             uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t edata[20] __attribute__((aligned(64)));
-   uint32_t hash[8] __attribute__((aligned(64)));
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
-   uint32_t n = first_nonce;
+   uint32_t hash64[8] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = n;
   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;

-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x08ff;
+   if ( bench ) ptarget[7] = 0x08ff;

   mm128_bswap32_80( edata, pdata );
-   
-	for (int k=0; k < 20; k++)
-		be32enc(&edata[k], pdata[k]);

   InitializeSWIFFTX();

   do
   {
-       pdata[19] = ++n;
-       be32enc( &edata[19], n );
-
-       x25x_hash( hash, edata );
-
-       if ( hash[7] < Htarg )
-       if ( fulltest( hash, ptarget ) && !opt_benchmark )
-           submit_solution( work, hash, mythr );
-    } while ( n < max_nonce && !work_restart[thr_id].restart );
-
-	 *hashes_done = pdata[19] - first_nonce;
-	 return 0;
+      edata[19] = n;
+      x25x_hash( hash64, edata );
+      if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( n );
+         submit_solution( work, hash64, mythr );
+      }
+      n++;
+   } while ( n < max_nonce && !work_restart[thr_id].restart );
+   *hashes_done = n - first_nonce;
+   pdata[19] = n;
+   return 0;
 }

 #endif
--- a/algo/yespower/yescrypt-r8g.c
+++ b/algo/yespower/yescrypt-r8g.c
@@ -73,6 +73,7 @@ bool register_yescryptr8g_algo( algo_gate_t* gate )
  gate->optimizations = SSE2_OPT | SHA_OPT;
  gate->scanhash      = (void*)&scanhash_yespower_r8g;
  gate->hash          = (void*)&yespower_tls;
+  pk_buffer_size      = 26;
  opt_sapling         = true;
  opt_target_factor   = 65536.0;
  return true;
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.12.1.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.12.4.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.12.1'
-PACKAGE_STRING='cpuminer-opt 3.12.1'
+PACKAGE_VERSION='3.12.4.3'
+PACKAGE_STRING='cpuminer-opt 3.12.4.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.12.1 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.12.4.3 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.12.1:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.12.4.3:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.12.1
+cpuminer-opt configure 3.12.4.3
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.12.1, which was
+It was created by cpuminer-opt $as_me 3.12.4.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.12.1'
+ VERSION='3.12.4.3'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.12.1, which was
+This file was extended by cpuminer-opt $as_me 3.12.4.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.12.1
+cpuminer-opt config.status 3.12.4.3
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.12.1])
+AC_INIT([cpuminer-opt], [3.12.4.3])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -129,7 +129,13 @@ char *rpc_url = NULL;;
 char *rpc_userpass = NULL;
 char *rpc_user, *rpc_pass;
 char *short_url = NULL;
-static unsigned char pk_script[25] = { 0 };
+char *coinbase_address;
+
+// pk_buffer_size is used as a version selector by b58 code, therefore
+// it must be set correctly to work.
+const int pk_buffer_size_max = 26;
+int pk_buffer_size = 25;
+static unsigned char pk_script[ 26 ] = { 0 };
 static size_t pk_script_size = 0;
 static char coinbase_sig[101] = { 0 };
 char *opt_cert;
@@ -425,68 +431,71 @@ static bool work_decode( const json_t *val, struct work *work )
 static const char *info_req =
 "{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n";

-static bool get_mininginfo(CURL *curl, struct work *work)
+static bool get_mininginfo( CURL *curl, struct work *work )
 {
-	if (have_stratum || !allow_mininginfo)
+	if ( have_stratum || !allow_mininginfo )
 		return false;

 	int curl_err = 0;
-	json_t *val = json_rpc_call(curl, rpc_url, rpc_userpass, info_req, &curl_err, 0);
+	json_t *val = json_rpc_call( curl, rpc_url, rpc_userpass, info_req,
+                                &curl_err, 0 );

-	if (!val && curl_err == -1) {
+	if ( !val && curl_err == -1 )
+   {
 		allow_mininginfo = false;
-		if (opt_debug) {
-			applog(LOG_DEBUG, "getmininginfo not supported");
-		}
+		if ( opt_debug )
+			applog( LOG_DEBUG, "getmininginfo not supported" );
 		return false;
 	}
-	else
-        {
-	   json_t *res = json_object_get(val, "result");
-	   // "blocks": 491493 (= current work height - 1)
-	   // "difficulty": 0.99607860999999998
-	   // "networkhashps": 56475980
-	   if (res)
-           {
-		json_t *key = json_object_get(res, "difficulty");
-		if (key) {
-			if (json_is_object(key))
-				key = json_object_get(key, "proof-of-work");
-			if (json_is_real(key))
-				net_diff = json_real_value(key);
-		}
-		key = json_object_get(res, "networkhashps");
-		if (key && json_is_integer(key)) {
-			net_hashrate = (double) json_integer_value(key);
-		}
-		key = json_object_get(res, "blocks");
-		if (key && json_is_integer(key)) {
-			net_blocks = json_integer_value(key);
-		}
-		if (!work->height)
-                {
-		   // complete missing data from getwork
-		   work->height = (uint32_t) net_blocks + 1;
-		   if (work->height > g_work.height)
-                   {
-			restart_threads();
-			if (!opt_quiet) {
-			   char netinfo[64] = { 0 };
-			   char srate[32] = { 0 };
-			   sprintf(netinfo, "diff %.2f", net_diff);
-			   if (net_hashrate) {
-				format_hashrate(net_hashrate, srate);
-				strcat(netinfo, ", net ");
-				strcat(netinfo, srate);
-			   }
-			   applog(LOG_BLUE, "%s block %d, %s",
-				algo_names[opt_algo], work->height, netinfo);
-			}
-		   }
-		}
+
+   json_t *res = json_object_get( val, "result" );
+   // "blocks": 491493 (= current work height - 1)
+   // "difficulty": 0.99607860999999998
+   // "networkhashps": 56475980
+   if ( res )
+   {
+  		json_t *key = json_object_get( res, "difficulty" );
+   	if ( key )
+      {
+	   	if ( json_is_object( key ) )
+		   	key = json_object_get( key, "proof-of-work" );
+		   if ( json_is_real( key ) )
+			   net_diff = json_real_value( key );
 	   }
+
+      key = json_object_get( res, "networkhashps" );
+	   if ( key && json_is_integer( key ) )
+		   net_hashrate = (double) json_integer_value( key );
+
+      key = json_object_get( res, "blocks" );
+	   if ( key && json_is_integer( key ) )
+		  	net_blocks = json_integer_value( key );
+
+      if ( !work->height )
+      {
+	      // complete missing data from getwork
+	      work->height = (uint32_t) net_blocks + 1;
+	      if ( work->height > g_work.height )
+         {
+            restart_threads();
+		      if ( !opt_quiet )
+            {
+		         char netinfo[64] = { 0 };
+		         char srate[32] = { 0 };
+		         sprintf( netinfo, "diff %.2f", net_diff );
+		         if ( net_hashrate )
+               {
+	               format_hashrate( net_hashrate, srate );
+                  strcat( netinfo, ", net " );
+			         strcat( netinfo, srate );
+		         }
+		         applog( LOG_BLUE, "%s block %d, %s",
+			                algo_names[opt_algo], work->height, netinfo );
+		      }
+		   } 
+	   }  // res
 	}
-	json_decref(val);
+	json_decref( val );
 	return true;
 }

@@ -974,9 +983,6 @@ void report_summary_log( bool force )
                  : diff_to_hash * last_targetdiff
                      * (double)(submitted_share_count - accepted_share_count )
                    / (double)uptime.tv_sec;
-
-      double shrate = share_time == 0. ? 0. : diff_to_hash * last_targetdiff
-                                           * (double)(accepts) / share_time;
      double lost_shrate = share_time == 0. ? 0.
               : diff_to_hash * last_targetdiff  * (double)(submits - accepts )
                / share_time;
@@ -1056,25 +1062,39 @@ static int share_result( int result, struct work *null_work,
   if ( likely( result ) )
   {
      accepted_share_count++;
+      sprintf( sres, "S%d", stale_share_count );
+      sprintf( rres, "R%d", rejected_share_count );
      if unlikely( ( my_stats.net_diff > 0. )
                && ( my_stats.share_diff >= net_diff ) )
      {
         solved = true;
         solved_block_count++;
+         sprintf( bres, "BLOCK SOLVED %d", solved_block_count );
+         sprintf( ares, "A%d", accepted_share_count );
+      }
+      else
+      {
+         sprintf( bres, "B%d", solved_block_count );
+         sprintf( ares, "Accepted %d", accepted_share_count );
      }
   }
   else
   {
+     sprintf( ares, "A%d", accepted_share_count );
+     sprintf( bres, "B%d", solved_block_count );
     if ( reason && strstr( reason, "Invalid job id" ) )
     {
        stale = true;
        stale_share_count++;
+        sprintf( sres, "Stale %d", stale_share_count );
+        sprintf( rres, "R%d", rejected_share_count );
     }
     else
     {
        rejected_share_count++;
+        sprintf( sres, "S%d", stale_share_count );
+        sprintf( rres, "Rejected %d" , rejected_share_count );
        lowdiff_debug = true;
-  
     }
   }

@@ -1100,6 +1120,7 @@ static int share_result( int result, struct work *null_work,

   pthread_mutex_unlock( &stats_lock );

+/*
   if ( likely( result ) )
   {
     if ( unlikely( solved ) )
@@ -1121,7 +1142,7 @@ static int share_result( int result, struct work *null_work,
     sprintf( bres, "B%d", solved_block_count );
     if ( stale )
     {
-        sprintf( sres, "Stale job %d", stale_share_count );
+        sprintf( sres, "Stale %d", stale_share_count );
        sprintf( rres, "R%d", rejected_share_count );
     }
     else
@@ -1130,6 +1151,7 @@ static int share_result( int result, struct work *null_work,
        sprintf( rres, "Rejected %d" , rejected_share_count );
     }
   } 
+*/

   if ( use_colors )
   {
@@ -1149,10 +1171,16 @@ static int share_result( int result, struct work *null_work,
           my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
           bres, share_time, latency );

-   if ( have_stratum && !opt_quiet )
-      applog2( LOG_NOTICE, "Diff %.5g (%.3g%), %sBlock %d, %sJob %s" CL_WHT,
+   if ( !opt_quiet )
+   {
+      if ( have_stratum )
+         applog2( LOG_NOTICE, "Diff %.5g (%.3g%), %sBlock %d, %sJob %s" CL_WHT,
               my_stats.share_diff, share_ratio, bcol, stratum.block_height,
               scol, my_stats.job_id );
+      else
+         applog2( LOG_NOTICE, "Diff %.5g (%.3g%), %sBlock %d" CL_WHT,
+               my_stats.share_diff, share_ratio, bcol, stratum.block_height );
+   }

   if ( unlikely( reason && !result ) )
   {
@@ -1185,6 +1213,9 @@ static int share_result( int result, struct work *null_work,
   return 1;
 }

+static const char *json_submit_req =
+   "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}";
+
 void std_le_build_stratum_request( char *req, struct work *work )
 {
   unsigned char *xnonce2str;
@@ -1195,9 +1226,8 @@ void std_le_build_stratum_request( char *req, struct work *work )
   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
   xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len );
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
+   snprintf( req, JSON_BUF_LEN, json_submit_req, rpc_user, work->job_id,
+             xnonce2str, ntimestr, noncestr );
   free( xnonce2str );
 }

@@ -1212,12 +1242,14 @@ void std_be_build_stratum_request( char *req, struct work *work )
   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
   xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len );
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
+   snprintf( req, JSON_BUF_LEN, json_submit_req, rpc_user, work->job_id,
+             xnonce2str, ntimestr, noncestr );
   free( xnonce2str );
 }

+static const char *json_getwork_req = 
+  "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":4}\r\n";
+
 bool std_le_submit_getwork_result( CURL *curl, struct work *work )
 {
   char req[JSON_BUF_LEN];
@@ -1234,8 +1266,7 @@ bool std_le_submit_getwork_result( CURL *curl, struct work *work )
      return false;
   }
   // build JSON-RPC request 
-   snprintf( req, JSON_BUF_LEN,
-     "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":4}\r\n", gw_str );
+   snprintf( req, JSON_BUF_LEN, json_getwork_req, gw_str );
   free( gw_str );
   // issue JSON-RPC request 
   val = json_rpc_call( curl, rpc_url, rpc_userpass, req, NULL, 0 );
@@ -1268,8 +1299,7 @@ bool std_be_submit_getwork_result( CURL *curl, struct work *work )
      return false;
   }
   // build JSON-RPC request 
-   snprintf( req, JSON_BUF_LEN,
-     "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":4}\r\n", gw_str );
+   snprintf( req, JSON_BUF_LEN, json_getwork_req, gw_str );
   free( gw_str );
   // issue JSON-RPC request 
   val = json_rpc_call( curl, rpc_url, rpc_userpass, req, NULL, 0 );
@@ -1324,25 +1354,27 @@ char* std_malloc_txs_request( struct work *work )

 static bool submit_upstream_work( CURL *curl, struct work *work )
 {
+
   /* pass if the previous hash is not the current previous hash */
+/* Submit anyway, discarding here messes up the stats
   if ( !submit_old && memcmp( &work->data[1], &g_work.data[1], 32 ) )
   {
-      if (opt_debug)
-         applog(LOG_DEBUG, "DEBUG: stale work detected, discarding");
+      applog( LOG_WARNING, "Stale work detected, discarding" );
      return true;
   }

   if ( !have_stratum && allow_mininginfo )
   {
-      struct work wheight;
-      get_mininginfo( curl, &wheight );
-      if ( work->height && work->height <= net_blocks )
+      struct work mining_info;
+      get_mininginfo( curl, &mining_info );
+      if ( work->height < mining_info.height )
      {
-         if (opt_debug)
- 	        applog(LOG_WARNING, "block %u was already solved", work->height);
+ 	      applog( LOG_WARNING, "Block %u was already solved, current block %d",
+                               work->height, mining_info.height );
 	      return true;
      }
   }
+*/

   if ( have_stratum )
   {
@@ -1474,6 +1506,37 @@ start:
   json_decref( val );
   // store work height in solo
   get_mininginfo(curl, work);
+
+   applog( LOG_BLUE, "%s %s block %d, diff %.5g", algo_names[ opt_algo ],
+                      short_url, work->height, net_diff );
+
+   if ( !opt_quiet && net_diff && net_hashrate )
+   {
+      double miner_hr = 0.;
+      pthread_mutex_lock( &stats_lock );
+
+      for ( int i = 0; i < opt_n_threads; i++ )
+         miner_hr += thr_hashrates[i];
+      global_hashrate = miner_hr;
+
+      pthread_mutex_unlock( &stats_lock );
+
+      if ( miner_hr )
+      {
+         char net_hr_units[4] = {0};
+         char miner_hr_units[4] = {0};
+         char net_ttf[32];
+         char miner_ttf[32];
+
+         sprintf_et( net_ttf, net_diff * diff_to_hash / net_hashrate );
+         sprintf_et( miner_ttf, net_diff * diff_to_hash / miner_hr );
+         scale_hash_for_display ( &miner_hr, miner_hr_units );
+         scale_hash_for_display ( &net_hashrate, net_hr_units );
+         applog2(LOG_INFO, "Miner TTF @ %.2f %sh/s %s, net TTF @ %.2f %sh/s %s",
+                             miner_hr, miner_hr_units, miner_ttf,
+                             net_hashrate, net_hr_units, net_ttf );
+      }
+   }
   return rc;
 }

@@ -1520,6 +1583,8 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
 	sleep(opt_fail_pause);
   }

+   report_summary_log( false );
+
   /* send work to requesting thread */
   if (!tq_push(wc->thr->q, ret_work))
 	free(ret_work);
@@ -1695,8 +1760,8 @@ void work_set_target_ratio( struct work* work, const void *hash )
   share_stats[ s_put_ptr ].net_diff = net_diff;
   share_stats[ s_put_ptr ].stratum_diff = stratum_diff;
   share_stats[ s_put_ptr ].target_diff = work->targetdiff;
-   ( (uint64_t*)share_stats[ s_put_ptr ].job_id )[3] = 0;
-   strncpy( share_stats[ s_put_ptr ].job_id, work->job_id, 30 );
+   if ( have_stratum )
+      strncpy( share_stats[ s_put_ptr ].job_id, work->job_id, 30 );
   s_put_ptr = stats_ptr_incr( s_put_ptr );

   pthread_mutex_unlock( &stats_lock );
@@ -1709,24 +1774,31 @@ bool submit_solution( struct work *work, const void *hash,
  {
     submitted_share_count++;
     work_set_target_ratio( work, hash );
-     if ( !opt_quiet )
-        applog( LOG_NOTICE, "%d submitted by thread %d, job %s",
-            submitted_share_count, thr->id, work->job_id );

-if ( lowdiff_debug )
-{
-   uint32_t* h = (uint32_t*)hash;
-   uint32_t* t = (uint32_t*)work->target;
-   applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
-                              h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
-   applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
-                              t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
-}
-    return true;
+     if ( !opt_quiet )
+     {
+        if ( have_stratum )
+           applog( LOG_NOTICE, "%d submitted by thread %d, job %s",
+               submitted_share_count, thr->id, work->job_id );
+        else
+           applog( LOG_NOTICE, "%d submitted by thread %d",
+               submitted_share_count, thr->id );
+     }
+
+     if ( lowdiff_debug )
+     {
+        uint32_t* h = (uint32_t*)hash;
+        uint32_t* t = (uint32_t*)work->target;
+        applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
+                                    h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
+        applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
+                                    t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
+     }
+     return true;
  }
  else
-     applog( LOG_WARNING, "%d failed to submit share.",
-             submitted_share_count );
+     applog( LOG_WARNING, "%d failed to submit share thread %d.",
+             submitted_share_count, thr->id );
  return false;
 }

@@ -1737,26 +1809,31 @@ bool submit_lane_solution( struct work *work, const void *hash,
  {
     submitted_share_count++;
     work_set_target_ratio( work, hash );
+
     if ( !opt_quiet )
-        applog( LOG_NOTICE, "%d submitted by thread %d, lane %d, job %s",
-            submitted_share_count, thr->id, lane, work->job_id );
+     {
+        if ( have_stratum )
+           applog( LOG_NOTICE, "%d submitted by thread %d, lane %d, job %s",
+               submitted_share_count, thr->id, lane, work->job_id );
+        else
+           applog( LOG_NOTICE, "%d submitted by thread %d, lane %d",
+               submitted_share_count, thr->id, lane );
+     }

-if ( lowdiff_debug )
-{
-   uint32_t* h = (uint32_t*)hash;
-   uint32_t* t = (uint32_t*)work->target;
-   applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
-                              h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
-   applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
-                              t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
-}
-
-
-     return true;
+     if ( lowdiff_debug )
+     {
+        uint32_t* h = (uint32_t*)hash;
+        uint32_t* t = (uint32_t*)work->target;
+        applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
+                                    h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
+         applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
+                                     t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
+    }
+    return true;
  }
  else
-     applog( LOG_WARNING, "%d failed to submit share.",
-          submitted_share_count );
+     applog( LOG_WARNING, "%d failed to submit share, thread %d, lane %d.",
+          submitted_share_count, thr->id, lane );
  return false;
 }

@@ -1850,12 +1927,15 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
                     uint32_t *end_nonce_ptr )
 {
   uint32_t *nonceptr = work->data + algo_gate.nonce_index;
+   bool force_new_work = false; 

-   bool force_new_work = work->job_id ? strtoul(   work->job_id, NULL, 16 ) !=
-                                        strtoul( g_work->job_id, NULL, 16 )
-                                      : true;
+   if ( have_stratum ) 
+      force_new_work = work->job_id ?    strtoul(   work->job_id, NULL, 16 )
+                                      != strtoul( g_work->job_id, NULL, 16 )
+                                     : false;

-   if ( force_new_work || *nonceptr >= *end_nonce_ptr )
+   if ( force_new_work || ( *nonceptr >= *end_nonce_ptr )
+     || memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) )
   {
     work_free( work );
     work_copy( work, g_work );
@@ -2033,12 +2113,28 @@ static void *miner_thread( void *userdata )
          sleep(5);
 	       continue;
       }
-       // adjust max_nonce to meet target scan time
+
+// LP_SCANTIME overrides opt_scantime option, is this right?
+
+       // adjust max_nonce to meet target scan time. Stratum and longpoll
+       // can go longer because they can rely on restart_threads to signal
+       // an early abort. get_work on the other hand can't rely on
+       // restart_threads so it needs a much shorter scantime.
+       if ( have_stratum )
+          max64 = 60 * thr_hashrates[thr_id];
+       else if ( have_longpoll )
+          max64 = LP_SCANTIME * thr_hashrates[thr_id];
+       else  // getwork inline
+          max64 = opt_scantime * thr_hashrates[thr_id];   
+
+/*       
       if ( have_stratum )
          max64 = LP_SCANTIME;
       else
          max64 = g_work_time + ( have_longpoll ? LP_SCANTIME : opt_scantime )
 	                      - time(NULL);
+*/
+
       // time limit
       if ( unlikely( opt_time_limit && firstwork_time ) )
       {
@@ -2065,17 +2161,20 @@ static void *miner_thread( void *userdata )
          }
          if ( remain < max64 ) max64 = remain;
       }
-       // Select nonce range for approx 1 min duration based
-       // on hashrate, initial value arbitrarilly set to 1000 just to get
+
+       // Select nonce range based on max64, the estimated number of hashes
+       // to meet the desired scan time.
+       // Initial value arbitrarily set to 1000 just to get
       // a sample hashrate for the next time.
       uint32_t work_nonce = *nonceptr;
-       max64 = 60 * thr_hashrates[thr_id];
+//       max64 = 60 * thr_hashrates[thr_id];
       if ( max64 <= 0)
          max64 = 1000;
       if ( work_nonce + max64 > end_nonce )
          max_nonce = end_nonce;
       else
          max_nonce = work_nonce + (uint32_t)max64;
+
       // init time
       if ( firstwork_time == 0 )
          firstwork_time = time(NULL);
@@ -2206,6 +2305,8 @@ void restart_threads(void)
 {
 	for ( int i = 0; i < opt_n_threads; i++)
 		work_restart[i].restart = 1;
+   if ( opt_debug )
+      applog( LOG_INFO, "Threads restarted for new work."); 
 }

 json_t *std_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )
@@ -2289,13 +2390,18 @@ start:
      soval = json_object_get(res, "submitold");
      submit_old = soval ? json_is_true(soval) : false;
 	   pthread_mutex_lock(&g_work_lock);
-	   start_job_id = g_work.job_id ? strdup(g_work.job_id) : NULL;
+
+// This code has been here for a long time even though job_id isn't used.
+// This needs to be changed eventually to test the block height properly
+// using g_work.block_height .     
+      start_job_id = g_work.job_id ? strdup(g_work.job_id) : NULL;
 	   if (have_gbt)
 	      rc = gbt_work_decode(res, &g_work);
 	   else
 	      rc = work_decode(res, &g_work);
 	   if (rc)
      {
+// purge job id from solo mining
        bool newblock = g_work.job_id && strcmp(start_job_id, g_work.job_id);
 	     newblock |= (start_diff != net_diff); // the best is the height but... longpoll...
        if (newblock)
@@ -2461,6 +2567,8 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )

   pthread_mutex_unlock( &sctx->work_lock );

+   restart_threads();
+
   if ( opt_debug )
   {
      unsigned char *xnonce2str = abin2hex( g_work->xnonce2,
@@ -2985,11 +3093,7 @@ void parse_arg(int key, char *arg )
      opt_hash_meter = true;
      break;
   case 1016:			/* --coinbase-addr */
-		pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg);
-		if (!pk_script_size) {
-			fprintf(stderr, "invalid address -- '%s'\n", arg);
-			show_usage_and_exit(1);
-		}
+      if ( arg ) coinbase_address = strdup( arg );
 		break;
 	case 1015:			/* --coinbase-sig */
 		if (strlen(arg) + 1 > sizeof(coinbase_sig)) {
@@ -3472,6 +3576,17 @@ int main(int argc, char *argv[])
   // All options must be set before starting the gate
   if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);

+   if ( coinbase_address )
+   {
+      pk_script_size = address_to_script( pk_script, pk_buffer_size,
+                                          coinbase_address );
+      if ( !pk_script_size )
+      {
+         applog(LOG_ERR,"Invalid coinbase address: '%s'", coinbase_address );
+         exit(0);
+      }
+   }
+
   // Initialize stats times and counters
   memset( share_stats, 0, 2 *  sizeof (struct share_stats_t) );
   gettimeofday( &last_submit_time, NULL );
@@ -3622,7 +3737,10 @@ int main(int argc, char *argv[])
 	/* ESET-NOD32 Detects these 2 thread_create... */
 	if (want_longpoll && !have_stratum)
   {
-		/* init longpoll thread info */
+      if ( opt_debug )
+         applog(LOG_INFO,"Creating long poll thread");
+
+      /* init longpoll thread info */
 		longpoll_thr_id = opt_n_threads + 1;
 		thr = &thr_info[longpoll_thr_id];
 		thr->id = longpoll_thr_id;
@@ -3638,7 +3756,10 @@ int main(int argc, char *argv[])
 	}
 	if (want_stratum)
   {
-		/* init stratum thread info */
+      if ( opt_debug )
+         applog(LOG_INFO,"Creating stratum thread");
+
+      /* init stratum thread info */
 		stratum_thr_id = opt_n_threads + 2;
 		thr = &thr_info[stratum_thr_id];
 		thr->id = stratum_thr_id;
@@ -3658,7 +3779,10 @@ int main(int argc, char *argv[])

 	if ( opt_api_enabled )
   {
-		/* api thread */
+      if ( opt_debug )
+         applog(LOG_INFO,"Creating API thread");
+
+      /* api thread */
 		api_thr_id = opt_n_threads + 3;
 		thr = &thr_info[api_thr_id];
 		thr->id = api_thr_id;
@@ -3668,7 +3792,7 @@ int main(int argc, char *argv[])
 		err = thread_create( thr, api_thread );
 		if ( err )
      {
-			applog( LOG_ERR, "api thread create failed" );
+			applog( LOG_ERR, "API thread create failed" );
 			return 1;
 		}
      if ( !opt_quiet )
--- a/miner.h
+++ b/miner.h
@@ -754,6 +754,8 @@ extern uint32_t solved_block_count;
 extern pthread_mutex_t applog_lock;
 extern pthread_mutex_t stats_lock;
 extern bool opt_sapling;
+extern const int pk_buffer_size_max;  // dimensions addrbin[] in address_to_script
+extern int pk_buffer_size;  // passed as the pk_script output size in main

 static char const usage[] = "\
 Usage: " PACKAGE_NAME " [OPTIONS]\n\
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -120,15 +120,26 @@ do { \
 } while(0)


-// Horizontal vector testing
-// needs a proper test, seems to be working in the code but polarity appears
-// reversed.
-#define mm256_allbits0( a )    _mm256_testz_si256(   a, a )
-#define mm256_allbits1( a )    _mm256_testc_si256(   a, m256_neg1 )
-//broken
-//#define mm256_allbitsne( a )   _mm256_testnzc_si256( a, m256_neg1 )
-#define mm256_anybits0( a )   !mm256_allbits1( a )
-#define mm256_anybits1( a )   !mm256_allbits0( a )
+// Byte mask tests. movemask gathers only the top bit of each of the 32
+// bytes, so these are only meaningful for compare results where every
+// byte is either 0x00 or 0xff.
+#define mm256_all0_8( a ) \
+     ( _mm256_movemask_epi8( a ) == 0 )
+
+#define mm256_all1_8( a ) \
+    ( _mm256_movemask_epi8( a ) == -1 )
+
+
+// Bitwise test of all 256 bits. Each macro is non-zero when the condition
+// in its name holds.
+// testz( a, a ) is 1 when ( a & a ) == 0, ie every bit of a is clear.
+// testc( a, b ) is 1 when ( ~a & b ) == 0, so with b all ones it is 1
+// only when every bit of a is set.
+// allbits0 and anybits1 name the argument twice, avoid side effects in a.
+#define mm256_allbits0( a )   _mm256_testz_si256( a, a )
+#define mm256_allbits1( a )   _mm256_testc_si256( a, m256_neg1 )
+#define mm256_anybits0( a )   ( !mm256_allbits1( a ) )
+#define mm256_anybits1( a )   ( !mm256_allbits0( a ) )


 // Parallel AES, for when x is expected to be in a 256 bit register.
--- a/util.c
+++ b/util.c
@@ -159,8 +159,6 @@ void applog2( int prio, const char *fmt, ... )
 }


-
-
 void applog(int prio, const char *fmt, ...)
 {
 	va_list ap;
@@ -921,25 +919,28 @@ bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen)
 	return true;
 }

-size_t address_to_script(unsigned char *out, size_t outsz, const char *addr)
+size_t address_to_script( unsigned char *out, size_t outsz, const char *addr )
 {
-	unsigned char addrbin[26];
+	unsigned char addrbin[ pk_buffer_size_max ];
 	int addrver;
 	size_t rv;

-	if (!b58dec(addrbin, sizeof(addrbin), addr))
+	if ( !b58dec( addrbin, outsz, addr ) )
 		return 0;
-	addrver = b58check(addrbin, sizeof(addrbin), addr);
-	if (addrver < 0)
+
+   addrver = b58check( addrbin, outsz, addr );
+   if ( addrver < 0 )
 		return 0;
-	switch (addrver) {
+
+   switch ( addrver )
+   {
 		case 5:    /* Bitcoin script hash */
 		case 196:  /* Testnet script hash */
-			if (outsz < (rv = 23))
+			if ( outsz < ( rv = 23 ) )
 				return rv;
 			out[ 0] = 0xa9;  /* OP_HASH160 */
 			out[ 1] = 0x14;  /* push 20 bytes */
-			memcpy(&out[2], &addrbin[1], 20);
+			memcpy( &out[2], &addrbin[1], 20 );
 			out[22] = 0x87;  /* OP_EQUAL */
 			return rv;
 		default:
@@ -948,7 +949,7 @@ size_t address_to_script(unsigned char *out, size_t outsz, const char *addr)
 			out[ 0] = 0x76;  /* OP_DUP */
 			out[ 1] = 0xa9;  /* OP_HASH160 */
 			out[ 2] = 0x14;  /* push 20 bytes */
-			memcpy(&out[3], &addrbin[1], 20);
+			memcpy( &out[3], &addrbin[1], 20 );
 			out[23] = 0x88;  /* OP_EQUALVERIFY */
 			out[24] = 0xac;  /* OP_CHECKSIG */
 			return rv;
Author	SHA1	Message	Date
Jay D Dee	0e1e88f53e	v3.12.4.3	2020-02-24 21:35:19 -05:00
Jay D Dee	45c77a5c81	v3.12.4.2	2020-02-23 15:31:06 -05:00
Jay D Dee	dbce7e0721	v3.12.4.1	2020-02-22 18:06:39 -05:00
Jay D Dee	6d66051de6	v3.12.4	2020-02-21 16:34:53 -05:00
Jay D Dee	b93be8816a	v3.12.3.1	2020-02-18 12:05:47 -05:00
Jay D Dee	19b0ac6d5c	v3.12.3	2020-02-13 04:25:33 -05:00
Jay D Dee	3da2b958cf	v3.12.2	2020-02-09 13:30:40 -05:00