v3.11.6

v3.11.5
2025-09-17 23:44:27 +00:00 · 2020-01-23 00:11:08 -05:00 · 2020-01-18 15:14:27 -05:00
59 changed files with 2405 additions and 1289 deletions
--- a/3
+++ b/3
@@ -33,3 +33,6 @@ Jay D Dee
 xcouiz@gmail.com

 Cryply
+
+Colin Percival
+Alexander Peslyak
--- a/60
+++ b/60
@@ -33,9 +33,69 @@ supported.
 64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
 are not supported. FreeBSD YMMV.

+Reporting bugs
+--------------
+
+Bugs can be reported by sending an email to JayDDee246@gmail.com or opening
+an issue in git: https://github.com/JayDDee/cpuminer-opt/issues
+
+Please include the following information:
+
+1. CPU model, operating system, cpuminer-opt version (must be latest),
+   binary file for Windows, changes to default build procedure for Linux.
+
+2. Exact command line (except user and pw) and initial output showing
+   the above requested info.
+
+3. Additional program output showing any error messages or other
+   pertinent data.
+
+4. A clear description of the problem including history, scope,
+   persistence or intermittence, and reproducibility.
+
+In simpler terms:
+
+What is it doing?
+What should it be doing instead?
+Did it work in a previous release?
+Does it happen for all algos? All pools? All options? Solo?
+Does it happen all the time?
+If not, what makes it happen or not happen?
+
 Change Log
 ----------

+v3.11.6
+
+Fixed CPU temperature regression from v3.11.5.
+
+More improvements to share log. More compact, highlight incremented counter,
+block height when solved, job id when stale.
+
+v3.11.5
+
+Fixed AVX512 detection that could cause compilation errors on CPUs
+without AVX512.
+
+Fixed "BLOCK SOLVED" log incorrectly displaying "Accepted" when a block
+is solved.
+Added share counter to share submitted & accepted logs.
+Added job id to share submitted log.
+Share submitted log is no longer highlighted blue, there was too much blue.
+
+Another CPU temperature fix for Linux.
+
+Added bug reporting tips to RELEASE NOTES.
+
+v3.11.4
+
+Fixed scrypt segfault since v3.9.9.1.
+
+Stale shares counted and reported separately from other rejected shares.
+
+Display of counters for solved blocks, rejects, stale shares suppressed in
+periodic summary when zero.
+
 v3.11.3

 Fixed x12 AVX2 again.
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -13,7 +13,7 @@ void blakehash_4way(void *state, const void *input)
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
     blake256r14_4way_context ctx;
     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
-     blake256r14_4way( &ctx, input + (64<<2), 16 );
+     blake256r14_4way_update( &ctx, input + (64<<2), 16 );
     blake256r14_4way_close( &ctx, vhash );
     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }
@@ -36,7 +36,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256r14_4way_init( &blake_4w_ctx );
-   blake256r14_4way( &blake_4w_ctx, vdata, 64 );
+   blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );

   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -37,8 +37,6 @@
 #ifndef __BLAKE_HASH_4WAY__
 #define __BLAKE_HASH_4WAY__ 1

-//#ifdef __SSE4_2__
-
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -51,46 +49,41 @@ extern "C"{

 #define SPH_SIZE_blake512   512

-// With SSE4.2 only Blake-256 4 way is available.
-// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
-
-// Blake-256 4 way
+//////////////////////////
+//
+//   Blake-256 4 way SSE2

 typedef struct {
   unsigned char buf[64<<2];
   uint32_t H[8<<2];
-//   __m128i buf[16] __attribute__ ((aligned (64)));
-//   __m128i H[8];
-//   __m128i S[4];    
   size_t ptr;
   uint32_t T0, T1;
   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
 } blake_4way_small_context __attribute__ ((aligned (64)));

-// Default 14 rounds
+// Default, 14 rounds, blake, decred
 typedef blake_4way_small_context blake256_4way_context;
 void blake256_4way_init(void *ctx);
 void blake256_4way_update(void *ctx, const void *data, size_t len);
-#define blake256_4way blake256_4way_update
 void blake256_4way_close(void *ctx, void *dst);

 // 14 rounds, blake, decred
 typedef blake_4way_small_context blake256r14_4way_context;
 void blake256r14_4way_init(void *cc);
 void blake256r14_4way_update(void *cc, const void *data, size_t len);
-#define blake256r14_4way blake256r14_4way_update
 void blake256r14_4way_close(void *cc, void *dst);

 // 8 rounds, blakecoin, vanilla
 typedef blake_4way_small_context blake256r8_4way_context;
 void blake256r8_4way_init(void *cc);
 void blake256r8_4way_update(void *cc, const void *data, size_t len);
-#define blake256r8_4way blake256r8_4way_update
 void blake256r8_4way_close(void *cc, void *dst);

 #ifdef __AVX2__

-// Blake-256 8 way
+//////////////////////////
+//
+//   Blake-256 8 way AVX2

 typedef struct {
   __m256i buf[16] __attribute__ ((aligned (64)));
@@ -104,7 +97,6 @@ typedef struct {
 typedef blake_8way_small_context blake256_8way_context;
 void blake256_8way_init(void *cc);
 void blake256_8way_update(void *cc, const void *data, size_t len);
-//#define blake256_8way blake256_8way_update
 void blake256_8way_close(void *cc, void *dst);

 // 14 rounds, blake, decred
@@ -117,10 +109,9 @@ void blake256r14_8way_close(void *cc, void *dst);
 typedef blake_8way_small_context blake256r8_8way_context;
 void blake256r8_8way_init(void *cc);
 void blake256r8_8way_update(void *cc, const void *data, size_t len);
-#define blake256r8_8way blake256r8_8way_update
 void blake256r8_8way_close(void *cc, void *dst);

-// Blake-512 4 way
+// Blake-512 4 way AVX2

 typedef struct {
   __m256i buf[16];
@@ -134,14 +125,15 @@ typedef blake_4way_big_context blake512_4way_context;

 void blake512_4way_init( blake_4way_big_context *sc );
 void blake512_4way_update( void *cc, const void *data, size_t len );
-#define blake512_4way blake512_4way_update
 void blake512_4way_close( void *cc, void *dst );
-void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
-                                      void *dst );
+void blake512_4way_full( blake_4way_big_context *sc, void * dst,
+                         const void *data, size_t len );

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-//Blake-256 16 way
+////////////////////////////
+//
+//   Blake-256 16 way AVX512

 typedef struct {
   __m512i buf[16];
@@ -169,8 +161,9 @@ void blake256r8_16way_init(void *cc);
 void blake256r8_16way_update(void *cc, const void *data, size_t len);
 void blake256r8_16way_close(void *cc, void *dst);

-
-// Blake-512 8 way
+////////////////////////////
+//
+//   Blake-512 8 way AVX512

 typedef struct {
   __m512i buf[16];
@@ -185,12 +178,10 @@ typedef blake_8way_big_context blake512_8way_context;
 void blake512_8way_init( blake_8way_big_context *sc );
 void blake512_8way_update( void *cc, const void *data, size_t len );
 void blake512_8way_close( void *cc, void *dst );
-void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
-                                      void *dst );
+void blake512_8way_full( blake_8way_big_context *sc, void * dst,
+                        const void *data, size_t len );

 #endif  // AVX512
-
-
 #endif  // AVX2

 #ifdef __cplusplus
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -267,22 +267,22 @@ static const sph_u64 CB[16] = {
 #define CBx_(n)     CBx__(n)
 #define CBx__(n)    CB ## n

-#define CB0   SPH_C64(0x243F6A8885A308D3)
-#define CB1   SPH_C64(0x13198A2E03707344)
-#define CB2   SPH_C64(0xA4093822299F31D0)
-#define CB3   SPH_C64(0x082EFA98EC4E6C89)
-#define CB4   SPH_C64(0x452821E638D01377)
-#define CB5   SPH_C64(0xBE5466CF34E90C6C)
-#define CB6   SPH_C64(0xC0AC29B7C97C50DD)
-#define CB7   SPH_C64(0x3F84D5B5B5470917)
-#define CB8   SPH_C64(0x9216D5D98979FB1B)
-#define CB9   SPH_C64(0xD1310BA698DFB5AC)
-#define CBA   SPH_C64(0x2FFD72DBD01ADFB7)
-#define CBB   SPH_C64(0xB8E1AFED6A267E96)
-#define CBC   SPH_C64(0xBA7C9045F12C7F99)
-#define CBD   SPH_C64(0x24A19947B3916CF7)
-#define CBE   SPH_C64(0x0801F2E2858EFC16)
-#define CBF   SPH_C64(0x636920D871574E69)
+#define CB0   0x243F6A8885A308D3
+#define CB1   0x13198A2E03707344
+#define CB2   0xA4093822299F31D0
+#define CB3   0x082EFA98EC4E6C89
+#define CB4   0x452821E638D01377
+#define CB5   0xBE5466CF34E90C6C
+#define CB6   0xC0AC29B7C97C50DD
+#define CB7   0x3F84D5B5B5470917
+#define CB8   0x9216D5D98979FB1B
+#define CB9   0xD1310BA698DFB5AC
+#define CBA   0x2FFD72DBD01ADFB7
+#define CBB   0xB8E1AFED6A267E96
+#define CBC   0xBA7C9045F12C7F99
+#define CBD   0x24A19947B3916CF7
+#define CBE   0x0801F2E2858EFC16
+#define CBF   0x636920D871574E69

 #define READ_STATE64(state)   do { \
      H0 = (state)->H[0]; \
@@ -349,9 +349,9 @@ static const sph_u64 CB[16] = {
 #define DECL_STATE64_8WAY \
   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
        __m512i S0, S1, S2, S3; \
-   sph_u64 T0, T1;
+   uint64_t T0, T1;

-#define COMPRESS64_8WAY   do \
+#define COMPRESS64_8WAY( buf )   do \
 { \
  __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
  __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -424,6 +424,84 @@ static const sph_u64 CB[16] = {
  H7 = mm512_xor4( VF, V7, S3, H7 ); \
 } while (0)

+void blake512_8way_compress( blake_8way_big_context *sc )
+{ // Compress the 16 vectors in sc->buf into sc->H, using sc->S and sc->T0/T1.
+  __m512i M0, M1, M2, M3, M4, M5, M6, M7;   // M0..MF: message vectors
+  __m512i M8, M9, MA, MB, MC, MD, ME, MF;
+  __m512i V0, V1, V2, V3, V4, V5, V6, V7;   // V0..VF: working state
+  __m512i V8, V9, VA, VB, VC, VD, VE, VF;
+  __m512i shuf_bswap64;
+
+  V0 = sc->H[0];   // V0..V7 start as the current chaining value H[0..7]
+  V1 = sc->H[1];
+  V2 = sc->H[2];
+  V3 = sc->H[3];
+  V4 = sc->H[4];
+  V5 = sc->H[5];
+  V6 = sc->H[6];
+  V7 = sc->H[7];
+  V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );   // V8..VB = S[0..3] xor CB0..CB3
+  V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
+  VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
+  VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
+  VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),   // VC, VD = T0 xor CB4, CB5
+                            m512_const1_64( CB4 ) );
+  VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
+                            m512_const1_64( CB5 ) );
+  VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),   // VE, VF = T1 xor CB6, CB7
+                            m512_const1_64( CB6 ) );
+  VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
+                            m512_const1_64( CB7 ) );
+
+  shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,   // shuffle control; per its name, byte-swaps each 64-bit element
+                                0x28292a2b2c2d2e2f, 0x2021222324252627,
+                                0x18191a1b1c1d1e1f, 0x1011121314151617,
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 );
+
+  M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );   // load all 16 buffered vectors through the byte shuffle
+  M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
+  M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
+  M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
+  M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
+  M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
+  M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
+  M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
+  M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
+  M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
+  MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
+  MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
+  MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
+  MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
+  ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
+  MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+
+  ROUND_B_8WAY(0);   // 16 rounds: the round index runs 0..9, then wraps to 0..5
+  ROUND_B_8WAY(1);
+  ROUND_B_8WAY(2);
+  ROUND_B_8WAY(3);
+  ROUND_B_8WAY(4);
+  ROUND_B_8WAY(5);
+  ROUND_B_8WAY(6);
+  ROUND_B_8WAY(7);
+  ROUND_B_8WAY(8);
+  ROUND_B_8WAY(9);
+  ROUND_B_8WAY(0);
+  ROUND_B_8WAY(1);
+  ROUND_B_8WAY(2);
+  ROUND_B_8WAY(3);
+  ROUND_B_8WAY(4);
+  ROUND_B_8WAY(5);
+
+  sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );   // new H[i] from V[i+8], V[i], S[i%4], H[i] (mm512_xor4 presumably XORs all four)
+  sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
+  sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
+  sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
+  sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
+  sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
+  sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
+  sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
+}
+
 void blake512_8way_init( blake_8way_big_context *sc )
 {
   __m512i zero = m512_zero;
@@ -455,39 +533,43 @@ blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )

   const int buf_size = 128;  //  sizeof/8

+// 64, 80 bytes: 1st pass copy data. 2nd pass copy padding and compress.   
+// 128 bytes: 1st pass copy data, compress. 2nd pass copy padding, compress.
+   
   buf = sc->buf;
   ptr = sc->ptr;
   if ( len < (buf_size - ptr) )
   {
-   memcpy_512( buf + (ptr>>3), vdata, len>>3 );
-   ptr += len;
-   sc->ptr = ptr;
-   return;
+      memcpy_512( buf + (ptr>>3), vdata, len>>3 );
+      ptr += len;
+      sc->ptr = ptr;
+      return;
   }

   READ_STATE64(sc);
   while ( len > 0 )
   {
-   size_t clen;
+      size_t clen;

-   clen = buf_size - ptr;
-   if ( clen > len )
+      clen = buf_size - ptr;
+      if ( clen > len )
      clen = len;
-   memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
-   ptr += clen;
-   vdata = vdata + (clen>>3);
-   len -= clen;
-   if ( ptr == buf_size )
-        {
-      if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )
-         T1 = SPH_T64(T1 + 1);
-      COMPRESS64_8WAY;
-      ptr = 0;
-   }
+      memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+      ptr += clen;
+      vdata = vdata + (clen>>3);
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+         if ( ( T0 = T0 + 1024 ) < 1024 )
+            T1 = T1 + 1;
+         COMPRESS64_8WAY( buf );
+         ptr = 0;
+      }
   }
   WRITE_STATE64(sc);
   sc->ptr = ptr;
-}
+
+   }

 static void
 blake64_8way_close( blake_8way_big_context *sc, void *dst )
@@ -495,26 +577,22 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
   __m512i buf[16];
   size_t ptr;
   unsigned bit_len;
-//   uint64_t z, zz;
-   sph_u64 th, tl;
+   uint64_t th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-//   z = 0x80 >> n;
-//   zz = ((ub & -z) | z) & 0xFF;
-//   buf[ptr>>3] = _mm512_set1_epi64( zz );
   buf[ptr>>3] = m512_const1_64( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
   {
-   sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
-   sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+   sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
   }
   else if ( sc->T0 == 0 )
   {
-   sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
-   sc->T1 = SPH_T64(sc->T1 - 1);
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+   sc->T1 = sc->T1 - 1;
   }
   else
   {
@@ -535,8 +613,8 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
       memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

       blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
-       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
-       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+       sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+       sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
       memset_zero_512( buf, 112>>3 );
       buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
       buf[112>>3] = m512_const1_64( bswap_64( th ) );
@@ -547,6 +625,79 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
   mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }

+// One-shot 8 way Blake-512: init, update & close in a single call, hash written to dst.
+void blake512_8way_full( blake_8way_big_context *sc, void * dst, 
+                        const void *data, size_t len )
+{
+   
+// init: load the initial H constants, zero the salt S, the counters and ptr
+
+   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
+   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+
+   casti_m512i( sc->S, 0 ) = m512_zero;
+   casti_m512i( sc->S, 1 ) = m512_zero;
+   casti_m512i( sc->S, 2 ) = m512_zero;
+   casti_m512i( sc->S, 3 ) = m512_zero;
+
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+
+// update: copy the data into buf; only a full 128 byte block is compressed here
+
+   memcpy_512( sc->buf, (__m512i*)data, len>>3 );   // presumably len>>3 is a count of __m512i vectors; no bound check against buf[16]
+   sc->ptr = len;
+   if ( len == 128 )
+   {
+      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )   // add 1024 bits to the counter, carrying into T1 on wrap
+            sc->T1 = sc->T1 + 1;
+      blake512_8way_compress( sc );
+      sc->ptr = 0;
+   }
+
+// close: write padding and bit length into buf, compress once more, output H
+
+   size_t ptr64 = sc->ptr >> 3;   // index of the first free vector in buf
+   unsigned bit_len;
+   uint64_t th, tl;
+
+   bit_len = sc->ptr << 3;
+   sc->buf[ptr64] = m512_const1_64( 0x80 );   // padding marker right after the data
+   tl = sc->T0 + bit_len;   // tl/th: low/high words of the total bit count
+   th = sc->T1;
+
+   if ( ptr64 == 0 )   // nothing buffered: preset so the +1024 below wraps T0 and T1 to 0
+   {
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+   sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
+   }
+   else if ( sc->T0 == 0 )   // no block compressed yet: after the +1024 below T0 == bit_len, T1 restored
+   {
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+   sc->T1 = sc->T1 - 1;
+   }
+   else   // NOTE(review): looks unreachable here, T0 != 0 only after len == 128 which also zeroes ptr - confirm
+      sc->T0 -= 1024 - bit_len;
+
+   memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );   // zero-fill after the marker; 13 - ptr64 wraps if ptr64 > 13
+   sc->buf[13] = m512_const1_64( 0x0100000000000000ULL );   // overwrites buf[13], so the 0x80 marker must sit below index 13
+   sc->buf[14] = m512_const1_64( bswap_64( th ) );   // byte-swapped bit count, high word then low word
+   sc->buf[15] = m512_const1_64( bswap_64( tl ) );
+
+   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
+       sc->T1 = sc->T1 + 1;
+
+   blake512_8way_compress( sc );
+   
+   mm512_block_bswap_64( (__m512i*)dst, sc->H );   // presumably byte-swaps the 64-bit elements of H into dst
+}
+   
 void
 blake512_8way_update(void *cc, const void *data, size_t len)
 {
@@ -555,12 +706,6 @@ blake512_8way_update(void *cc, const void *data, size_t len)

 void
 blake512_8way_close(void *cc, void *dst)
-{
-   blake512_8way_addbits_and_close(cc, 0, 0, dst);
-}
-
-void
-blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
   blake64_8way_close(cc, dst);
 }
@@ -596,7 +741,7 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #define DECL_STATE64_4WAY \
 	__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
        __m256i S0, S1, S2, S3; \
-	sph_u64 T0, T1;
+	uint64_t T0, T1;

 #define COMPRESS64_4WAY   do \
 { \
@@ -670,6 +815,81 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 } while (0)


+void blake512_4way_compress( blake_4way_big_context *sc )
+{  // Compress the 16 vectors in sc->buf into sc->H, using sc->S and sc->T0/T1.
+  __m256i M0, M1, M2, M3, M4, M5, M6, M7;   // M0..MF: message vectors
+  __m256i M8, M9, MA, MB, MC, MD, ME, MF;
+  __m256i V0, V1, V2, V3, V4, V5, V6, V7;   // V0..VF: working state
+  __m256i V8, V9, VA, VB, VC, VD, VE, VF;
+  __m256i shuf_bswap64;
+
+  V0 = sc->H[0];   // V0..V7 start as the current chaining value H[0..7]
+  V1 = sc->H[1];
+  V2 = sc->H[2];
+  V3 = sc->H[3];
+  V4 = sc->H[4];
+  V5 = sc->H[5];
+  V6 = sc->H[6];
+  V7 = sc->H[7];
+  V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );   // V8..VB = S[0..3] xor CB0..CB3
+  V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
+  VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
+  VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
+  VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),   // VC, VD = T0 xor CB4, CB5
+                             m256_const1_64( CB4 ) );
+  VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
+                             m256_const1_64( CB5 ) );
+  VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),   // VE, VF = T1 xor CB6, CB7
+                             m256_const1_64( CB6 ) );
+  VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
+                             m256_const1_64( CB7 ) );
+  shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617,   // shuffle control; per its name, byte-swaps each 64-bit element
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 );
+
+  M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );   // load all 16 buffered vectors through the byte shuffle
+  M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
+  M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
+  M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
+  M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
+  M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
+  M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
+  M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
+  M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
+  M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
+  MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
+  MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
+  MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
+  MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
+  ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
+  MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+
+  ROUND_B_4WAY(0);   // 16 rounds: the round index runs 0..9, then wraps to 0..5
+  ROUND_B_4WAY(1);
+  ROUND_B_4WAY(2);
+  ROUND_B_4WAY(3);
+  ROUND_B_4WAY(4);
+  ROUND_B_4WAY(5);
+  ROUND_B_4WAY(6);
+  ROUND_B_4WAY(7);
+  ROUND_B_4WAY(8);
+  ROUND_B_4WAY(9);
+  ROUND_B_4WAY(0);
+  ROUND_B_4WAY(1);
+  ROUND_B_4WAY(2);
+  ROUND_B_4WAY(3);
+  ROUND_B_4WAY(4);
+  ROUND_B_4WAY(5);
+
+  sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );   // new H[i] from V[i+8], V[i], S[i%4], H[i] (mm256_xor4 presumably XORs all four)
+  sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
+  sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
+  sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
+  sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
+  sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
+  sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
+  sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
+}
+
 void blake512_4way_init( blake_4way_big_context *sc )
 {
   __m256i zero = m256_zero;
@@ -681,10 +901,12 @@ void blake512_4way_init( blake_4way_big_context *sc )
   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
+
   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
   casti_m256i( sc->S, 2 ) = zero;
   casti_m256i( sc->S, 3 ) = zero;
+
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
 }
@@ -703,31 +925,31 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
   ptr = sc->ptr;
   if ( len < (buf_size - ptr) )
   {
-	memcpy_256( buf + (ptr>>3), vdata, len>>3 );
-	ptr += len;
-	sc->ptr = ptr;
-	return;
+   	memcpy_256( buf + (ptr>>3), vdata, len>>3 );
+	   ptr += len;
+	   sc->ptr = ptr;
+	   return;
   }

   READ_STATE64(sc);
   while ( len > 0 )
   {
-	size_t clen;
+   	size_t clen;

-	clen = buf_size - ptr;
-	if ( clen > len )
-		clen = len;
-	memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
-	ptr += clen;
-	vdata = vdata + (clen>>3);
-	len -= clen;
-	if (ptr == buf_size )
-        {
-		if ((T0 = SPH_T64(T0 + 1024)) < 1024)
-			T1 = SPH_T64(T1 + 1);
-		COMPRESS64_4WAY;
-		ptr = 0;
-	}
+	   clen = buf_size - ptr;
+	   if ( clen > len )
+		   clen = len;
+   	memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
+	   ptr += clen;
+	   vdata = vdata + (clen>>3);
+	   len -= clen;
+	   if ( ptr == buf_size )
+      {
+		   if ( (T0 = T0 + 1024 ) < 1024 )
+			   T1 = SPH_T64(T1 + 1);
+	   	COMPRESS64_4WAY;
+		   ptr = 0;
+	   }
   }
   WRITE_STATE64(sc);
   sc->ptr = ptr;
@@ -739,7 +961,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
   __m256i buf[16];
   size_t ptr;
   unsigned bit_len;
-   sph_u64 th, tl;
+   uint64_t th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
@@ -748,13 +970,13 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
   th = sc->T1;
   if (ptr == 0 )
   {
-	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
-	sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+	sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+	sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
   }
   else if ( sc->T0 == 0 )
   {
-	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
-	sc->T1 = SPH_T64(sc->T1 - 1);
+	sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+	sc->T1 = sc->T1 - 1;
   } 
   else
   {
@@ -788,13 +1010,77 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }

-/*
-void
-blake512_4way_init(void *cc)
+// One-shot 4 way Blake-512: init, update & close in a single call, hash written to dst.
+void blake512_4way_full( blake_4way_big_context *sc, void * dst,
+                         const void *data, size_t len )
 {
-	blake64_4way_init(cc, IV512, salt_zero_big);
+
+// init: load the initial H constants, zero the salt S, the counters and ptr
+
+   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
+   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
+   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
+   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
+   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
+
+   casti_m256i( sc->S, 0 ) = m256_zero;
+   casti_m256i( sc->S, 1 ) = m256_zero;
+   casti_m256i( sc->S, 2 ) = m256_zero;
+   casti_m256i( sc->S, 3 ) = m256_zero;
+
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+
+// update: copy the data into buf; only a full 128 byte block is compressed here
+
+   memcpy_256( sc->buf, (__m256i*)data, len>>3 );   // presumably len>>3 is a count of __m256i vectors; no bound check against buf[16]
+   sc->ptr += len;   // ptr was zeroed just above, so this equals len
+   if ( len == 128 )
+   {
+      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )   // add 1024 bits to the counter, carrying into T1 on wrap
+         sc->T1 =  sc->T1 + 1;
+      blake512_4way_compress( sc );
+      sc->ptr = 0;
+   }
+
+// close: write padding and bit length into buf, compress once more, output H
+
+   size_t ptr64 = sc->ptr >> 3;   // index of the first free vector in buf
+   unsigned bit_len;
+   uint64_t th, tl;
+
+   bit_len = sc->ptr << 3;
+   sc->buf[ptr64] = m256_const1_64( 0x80 );   // padding marker right after the data
+   tl = sc->T0 + bit_len;   // tl/th: low/high words of the total bit count
+   th = sc->T1;
+   if ( sc->ptr == 0 )   // nothing buffered: preset so the +1024 below wraps T0 and T1 to 0
+   {
+      sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+      sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
+   }
+   else if ( sc->T0 == 0 )   // no block compressed yet: after the +1024 below T0 == bit_len, T1 restored
+   {
+      sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+      sc->T1 = sc->T1 - 1;
+   }
+   else   // NOTE(review): looks unreachable here, T0 != 0 only after len == 128 which also zeroes ptr - confirm
+        sc->T0 -= 1024 - bit_len;
+
+   memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );   // zero-fill after the marker; 13 - ptr64 wraps if ptr64 > 13
+   sc->buf[13] = m256_const1_64( 0x0100000000000000ULL );   // overwrites buf[13], so the 0x80 marker must sit below index 13
+   sc->buf[14] = m256_const1_64( bswap_64( th ) );   // byte-swapped bit count, high word then low word
+   sc->buf[15] = m256_const1_64( bswap_64( tl ) );
+
+   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
+       sc->T1 = sc->T1 + 1;
+
+   blake512_4way_compress( sc );
+
+   mm256_block_bswap_64( (__m256i*)dst, sc->H );   // presumably byte-swaps the 64-bit elements of H into dst
 }
-*/

 void
 blake512_4way_update(void *cc, const void *data, size_t len)
@@ -806,17 +1092,8 @@ void
 blake512_4way_close(void *cc, void *dst)
 {
   blake64_4way_close( cc, dst );
-
-//   blake512_4way_addbits_and_close(cc, dst);
 }

-/*
-void
-blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-	blake64_4way_close(cc, ub, n, dst, 8);
-}
-*/
 #ifdef __cplusplus
 }
 #endif
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -14,7 +14,7 @@ void blakecoin_4way_hash(void *state, const void *input)
     blake256r8_4way_context ctx;

     memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
-     blake256r8_4way( &ctx, input + (64<<2), 16 );
+     blake256r8_4way_update( &ctx, input + (64<<2), 16 );
     blake256r8_4way_close( &ctx, vhash );

     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
@@ -37,7 +37,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256r8_4way_init( &blakecoin_4w_ctx );
-   blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );
+   blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );

   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -71,7 +71,7 @@ void blakecoin_8way_hash( void *state, const void *input )
     blake256r8_8way_context ctx;

     memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
-     blake256r8_8way( &ctx, input + (64<<3), 16 );
+     blake256r8_8way_update( &ctx, input + (64<<3), 16 );
     blake256r8_8way_close( &ctx, vhash );

     dintrlv_8x32( state,     state+ 32, state+ 64, state+ 96, state+128,
@@ -95,7 +95,7 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake256r8_8way_init( &blakecoin_8w_ctx );
-   blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
+   blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );

   do {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -21,7 +21,7 @@ void decred_hash_4way( void *state, const void *input )
     blake256_4way_context ctx __attribute__ ((aligned (64)));

     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
-     blake256_4way( &ctx, tail, tail_len );
+     blake256_4way_update( &ctx, tail, tail_len );
     blake256_4way_close( &ctx, vhash );
     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }
@@ -46,7 +46,7 @@ int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
   mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );

   blake256_4way_init( &blake_mid );
-   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
+   blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );

   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
   do {
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -22,23 +22,23 @@ extern void pentablakehash_4way( void *output, const void *input )


     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, input, 80 );
+     blake512_4way_update( &ctx, input, 80 );
     blake512_4way_close( &ctx, vhash );

     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_update( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );

     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_update( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );

     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_update( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );

     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_update( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );

     memcpy( output,    hash0, 32 );
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -168,6 +168,66 @@ int cube_4way_close( cube_4way_context *sp, void *output )
    return 0;
 }

+int cube_4way_full( cube_4way_context *sp, void *output,  int hashbitlen, // one-shot: init, absorb data, finalize, copy hash to output; always returns 0
+                    const void *data, size_t size )
+{
+    __m512i *h = (__m512i*)sp->h;
+    __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512   // IV512 for a 512 bit hash, IV256 for any other hashbitlen
+                                                : (__m128i*)IV256 );
+    sp->hashlen   = hashbitlen/128;   // hash length in 128 bit units
+    sp->blocksize = 32/16;   // 2 state vectors are absorbed per transform
+    sp->rounds    = 16;
+    sp->pos       = 0;
+
+    h[ 0] = m512_const1_128( iv[0] );   // m512_const1_128 presumably broadcasts the 128 bit IV word to all lanes
+    h[ 1] = m512_const1_128( iv[1] );
+    h[ 2] = m512_const1_128( iv[2] );
+    h[ 3] = m512_const1_128( iv[3] );
+    h[ 4] = m512_const1_128( iv[4] );
+    h[ 5] = m512_const1_128( iv[5] );
+    h[ 6] = m512_const1_128( iv[6] );
+    h[ 7] = m512_const1_128( iv[7] );
+    h[ 0] = m512_const1_128( iv[0] );   // NOTE(review): these 8 lines repeat the h[0..7] assignments above verbatim; looks redundant - confirm
+    h[ 1] = m512_const1_128( iv[1] );
+    h[ 2] = m512_const1_128( iv[2] );
+    h[ 3] = m512_const1_128( iv[3] );
+    h[ 4] = m512_const1_128( iv[4] );
+    h[ 5] = m512_const1_128( iv[5] );
+    h[ 6] = m512_const1_128( iv[6] );
+    h[ 7] = m512_const1_128( iv[7] );
+
+    const int len = size >> 4;   // number of input vectors consumed: size / 16
+    const __m512i *in = (__m512i*)data;
+    __m512i *hash = (__m512i*)output;
+    int i;
+
+    for ( i = 0; i < len; i++ )   // xor each input vector into the state, transforming after every blocksize vectors
+    {
+        sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
+        {
+           transform_4way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],   // xor the 0x80 padding marker in at the current position
+                                    m512_const2_64( 0, 0x0000000000000080 ) );
+    transform_4way( sp );
+
+    sp->h[7] = _mm512_xor_si512( sp->h[7],   // flip a single bit in the last state vector before the final transforms
+                                    m512_const2_64( 0x0000000100000000, 0 ) );
+
+    for ( i = 0; i < 10; ++i )   // 10 further transforms before the state is copied out
+       transform_4way( sp );
+
+    memcpy( hash, sp->h, sp->hashlen<<6);   // copies hashlen * 64 bytes of state to output
+    return 0;
+}
+
+
 int cube_4way_update_close( cube_4way_context *sp, void *output,
                               const void *data, size_t size )
 {
@@ -376,4 +436,62 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
    return 0;
 }

+// One-shot CubeHash, 2 way: re-initialize sp, absorb data, finalize and
+// copy the digest to output. Always returns 0.
+//   hashbitlen: 512 selects IV512, any other value selects IV256. It also
+//               sets the digest size, hashbitlen/128 vectors of 32 bytes.
+//   size:       input byte count, presumably per lane (TODO confirm). Only
+//               whole 16 byte units are absorbed, one __m256i for each.
+int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
+                               const void *data, size_t size )
+{
+    __m256i *h = (__m256i*)sp->h;
+    __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
+                                                : (__m128i*)IV256 );
+    const int len = size >> 4;
+    const __m256i *in = (__m256i*)data;
+    __m256i *hash = (__m256i*)output;
+    int i;
+
+    sp->hashlen   = hashbitlen/128;
+    sp->blocksize = 32/16;   // vectors per block
+    sp->rounds    = 16;
+    sp->pos       = 0;
+
+    // The state is 8 vectors, h[0] to h[7], each one is set once from the
+    // matching 128 bit IV word.
+    for ( i = 0; i < 8; i++ )
+        h[i] = m256_const1_128( iv[i] );
+
+    // Absorb: xor each input vector into the state and transform whenever
+    // a full block has been collected.
+    for ( i = 0; i < len; i++ )
+    {
+        sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
+        {
+           transform_2way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    // Padding: xor 0x80 into the vector following the data and transform.
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
+                                    m256_const2_64( 0, 0x0000000000000080 ) );
+    transform_2way( sp );
+
+    // Finalization: flip one bit in the last state vector, then run ten
+    // more transforms before reading out the digest.
+    sp->h[7] = _mm256_xor_si256( sp->h[7],
+                                    m256_const2_64( 0x0000000100000000, 0 ) );
+
+    for ( i = 0; i < 10; ++i )    transform_2way( sp );
+
+    // The digest is the first hashlen state vectors, 32 bytes each.
+    memcpy( hash, sp->h, sp->hashlen<<5 );
+    return 0;
+}
+
 #endif
--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -21,15 +21,12 @@ typedef struct _cube_4way_context cube_4way_context;

 int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
                       int blockbytes );
-// reinitialize context with same parameters, much faster.
-int cube_4way_reinit( cube_4way_context *sp );
-
 int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
-
 int cube_4way_close( cube_4way_context *sp, void *output );
-
 int cube_4way_update_close( cube_4way_context *sp, void *output,
                            const void *data, size_t size );
+int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
+                    const void *data, size_t size );

 #endif

@@ -48,15 +45,12 @@ typedef struct _cube_2way_context cube_2way_context;

 int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
                       int blockbytes );
-// reinitialize context with same parameters, much faster.
-int cube_2way_reinit( cube_2way_context *sp );
-
 int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
-
 int cube_2way_close( cube_2way_context *sp, void *output );
-
 int cube_2way_update_close( cube_2way_context *sp, void *output,
                            const void *data, size_t size );
+int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
+                    const void *data, size_t size );


 #endif
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -20,6 +20,7 @@
 #include "hash_api.h"
 //#include "vperm.h"
 #include <immintrin.h>
+#include "simd-utils.h"

 MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
 MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -517,6 +518,165 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
   return SUCCESS;
 }

+// One-shot ECHO: init state for nHashSize (256 or 512), absorb datalen bytes
+// of data, finalize into hashval. Returns BAD_HASHBITLEN or SUCCESS.
+HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+            int nHashSize, const BitSequence *data, DataLength datalen )
+{
+   int i, j;
+   // --- init: reset the context for a fresh nHashSize bit hash ---
+   state->k = m128_zero;
+   state->processed_bits = 0;
+   state->uBufferBytes = 0;
+
+   switch( nHashSize )
+   {
+      case 256:
+         state->uHashSize = 256;
+         state->uBlockLength = 192;
+         state->uRounds = 8;
+         state->hashsize = m128_const_64( 0, 0x100 );
+         state->const1536 = m128_const_64( 0, 0x600 );
+         break;
+
+      case 512:
+         state->uHashSize = 512;
+         state->uBlockLength = 128;
+         state->uRounds = 10;
+         state->hashsize = m128_const_64( 0, 0x200 );
+         state->const1536 = m128_const_64( 0, 0x400 );
+         break;
+
+      default:
+         return BAD_HASHBITLEN;
+   }
+
+   // The first nHashSize/256 columns of each row hold the hash size,
+   // the remaining columns are zeroed.
+   for(i = 0; i < 4; i++)
+      for(j = 0; j < nHashSize / 256; j++)
+         state->state[i][j] = state->hashsize;
+
+   for(i = 0; i < 4; i++)
+      for(j = nHashSize / 256; j < 4; j++)
+         state->state[i][j] = m128_zero;
+
+   // --- update: absorb datalen bytes of data ---
+   unsigned int uBlockCount, uRemainingBytes;
+
+   if( (state->uBufferBytes + datalen) >= state->uBlockLength )
+   {
+        // uBufferBytes was reset to zero above, this branch is never taken.
+        if( state->uBufferBytes != 0 )
+        {
+           // Fill the buffer
+           memcpy( state->buffer + state->uBufferBytes,
+                   (void*)data, state->uBlockLength - state->uBufferBytes );
+
+           // Process buffer
+           Compress( state, state->buffer, 1 );
+           state->processed_bits += state->uBlockLength * 8;
+
+           data += state->uBlockLength - state->uBufferBytes;
+           datalen -= state->uBlockLength - state->uBufferBytes;
+        }
+
+        // buffer now does not contain any unprocessed bytes
+
+        uBlockCount = datalen / state->uBlockLength;
+        uRemainingBytes = datalen % state->uBlockLength;
+
+        // Whole blocks are compressed directly from the caller's data.
+        if( uBlockCount > 0 )
+        {
+           Compress( state, data, uBlockCount );
+           state->processed_bits += uBlockCount * state->uBlockLength * 8;
+           data += uBlockCount * state->uBlockLength;
+        }
+
+        // The tail that does not fill a block is buffered for the final.
+        if( uRemainingBytes > 0 )
+        memcpy(state->buffer, (void*)data, uRemainingBytes);
+
+        state->uBufferBytes = uRemainingBytes;
+   }
+   else
+   {
+        // Less than one block of data, buffer all of it.
+        memcpy( state->buffer + state->uBufferBytes, (void*)data, datalen );
+        state->uBufferBytes += datalen;
+   }
+   // --- final: pad, append hash size and bit count, compress ---
+   __m128i remainingbits;
+   // Add remaining bytes in the buffer
+   state->processed_bits += state->uBufferBytes * 8;
+
+   remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
+
+   // Pad with 0x80
+   state->buffer[state->uBufferBytes++] = 0x80;
+   // Enough buffer space for padding in this block?
+   // 18 bytes are needed: 2 for the hash size and 16 for the bit count.
+   if( (state->uBlockLength - state->uBufferBytes) >= 18 )
+   {
+        // Pad with zeros
+        memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
+
+        // Hash size
+        *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
+
+        // Processed bits
+        *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
+                   state->processed_bits;
+        *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
+
+        // Last block contains message bits?
+        // uBufferBytes is 1 when only the 0x80 pad byte is in the buffer.
+        if( state->uBufferBytes == 1 )
+        {
+           state->k = _mm_xor_si128( state->k, state->k );
+           state->k = _mm_sub_epi64( state->k, state->const1536 );
+        }
+        else
+        {
+           state->k = _mm_add_epi64( state->k, remainingbits );
+           state->k = _mm_sub_epi64( state->k, state->const1536 );
+        }
+
+        // Compress
+        Compress( state, state->buffer, 1 );
+   }
+   else
+   {
+        // Fill with zero and compress
+        memset( state->buffer + state->uBufferBytes, 0,
+                state->uBlockLength - state->uBufferBytes );
+        state->k = _mm_add_epi64( state->k, remainingbits );
+        state->k = _mm_sub_epi64( state->k, state->const1536 );
+        Compress( state, state->buffer, 1 );
+
+        // Last block
+        memset( state->buffer, 0, state->uBlockLength - 18 );
+
+        // Hash size
+        *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
+                 state->uHashSize;
+
+        // Processed bits
+        *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
+                   state->processed_bits;
+        *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
+        // Compress the last block
+        state->k = _mm_xor_si128( state->k, state->k );
+        state->k = _mm_sub_epi64( state->k, state->const1536 );
+        Compress( state, state->buffer, 1) ;
+   }
+
+   // Store the hash value, aligned stores: hashval must be 16 byte aligned
+   _mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
+   _mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
+   // The 512 bit hash takes two more vectors.
+   if( state->uHashSize == 512 )
+   {
+        _mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
+        _mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
+   }
+   return SUCCESS;
+}
+
+

 HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
 {
--- a/algo/echo/aes_ni/hash_api.h
+++ b/algo/echo/aes_ni/hash_api.h
@@ -55,6 +55,8 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit

 HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
                              const BitSequence *data, DataLength databitlen );
+HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+    int nHashSize, const BitSequence *data, DataLength datalen ); // bytes

 #endif // HASH_API_H

--- a/algo/echo/echo-hash-4way.c
+++ b/algo/echo/echo-hash-4way.c
@@ -313,4 +313,92 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
   return 0;
 }

+// One-shot ECHO, 4 way: init for nHashSize (256 or 512), absorb datalen bytes
+// per lane from data, pad, compress and store the hash to hashval.
+// Returns 1 if nHashSize is not supported, otherwise 0.
+int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
+                    const void *data, int datalen )
+{
+   int i, j;
+   int databitlen = datalen * 8;
+   ctx->k = m512_zero;
+   ctx->processed_bits = 0;
+   ctx->uBufferBytes = 0;
+
+   switch( nHashSize )
+   {
+      case 256:
+         ctx->uHashSize = 256;
+         ctx->uBlockLength = 192;
+         ctx->uRounds = 8;
+         ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
+         ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
+         break;
+
+      case 512:
+         ctx->uHashSize = 512;
+         ctx->uBlockLength = 128;
+         ctx->uRounds = 10;
+         ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
+         ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
+         break;
+
+      default:
+         return 1;
+   }
+
+   for( i = 0; i < 4; i++ )
+      for( j = 0; j < nHashSize / 256; j++ )
+         ctx->state[ i ][ j ] = ctx->hashsize;
+
+   for( i = 0; i < 4; i++ )
+      for( j = nHashSize / 256; j < 4; j++ )
+         ctx->state[ i ][ j ] = m512_zero;
+
+// datalen is expected to be 32 (maybe), 64, 80 or 128 bytes. Anything
+// shorter than 128 is buffered and padded within a single block.
+   int vlen;   // data vectors held in ctx->buffer, set by both branches
+   const int vblen = ctx->uBlockLength / 16; //  16 bytes per lane
+   __m512i remainingbits;
+
+   if ( databitlen == 1024 )   // one full block when nHashSize is 512
+   {
+      echo_4way_compress( ctx, data, 1 );
+      ctx->processed_bits = 1024;
+      remainingbits = m512_const2_64( 0, -1024 );
+      vlen = 0;
+   }
+   else
+   {
+      vlen = databitlen / 128;  // * 4 lanes / 128 bits per lane
+      memcpy_512( ctx->buffer, data, vlen );
+      ctx->processed_bits += (unsigned int)( databitlen );
+      remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
+   }
+
+   ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
+   memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
+   ctx->buffer[ vblen-2 ] =
+                _mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 );
+   ctx->buffer[ vblen-1 ] =
+                   _mm512_set4_epi64( 0, ctx->processed_bits,
+                                      0, ctx->processed_bits );
+
+   ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
+   ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
+
+   echo_4way_compress( ctx, ctx->buffer, 1 );
+
+   _mm512_store_si512( (__m512i*)hashval + 0, ctx->state[ 0 ][ 0] );
+   _mm512_store_si512( (__m512i*)hashval + 1, ctx->state[ 1 ][ 0] );
+
+   if ( ctx->uHashSize == 512 )
+   {
+      _mm512_store_si512( (__m512i*)hashval + 2, ctx->state[ 2 ][ 0 ] );
+      _mm512_store_si512( (__m512i*)hashval + 3, ctx->state[ 3 ][ 0 ] );
+   }
+   return 0;
+}
+
+
 #endif
--- a/algo/echo/echo-hash-4way.h
+++ b/algo/echo/echo-hash-4way.h
@@ -32,5 +32,8 @@ int echo_close( echo_4way_context *state, void *hashval );
 int echo_4way_update_close( echo_4way_context *state, void *hashval,
                              const void *data, int databitlen );

+int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
+                    const void *data, int datalen );
+
 #endif 
 #endif
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -185,6 +185,82 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
   return SUCCESS_GR;
 }

+int groestl512_full( hashState_groestl* ctx, void* output,
+                                const void* input, uint64_t databitlen )
+{
+  // One-shot Groestl-512: init ctx, absorb databitlen bits of input and
+  // write the 64 byte hash to output. Always returns 0.
+  int i;
+  ctx->hashlen = 64;
+  SET_CONSTANTS();
+
+  for ( i = 0; i < SIZE512; i++ )
+  {
+     ctx->chaining[i] = _mm_setzero_si128();
+     ctx->buffer[i]   = _mm_setzero_si128();
+  }
+  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
+  ctx->buf_ptr = 0;
+  ctx->rem_ptr = 0;
+
+   // Only whole 128 bit vectors of input are absorbed, len counts them.
+   const int len = (int)databitlen / 128;
+   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
+   const int hash_offset = SIZE512 - hashlen_m128i;
+   int rem = ctx->rem_ptr;
+   uint64_t blocks = len / SIZE512;
+   __m128i* in = (__m128i*)input;
+
+   // --- update ---
+
+   // digest any full blocks, process directly from input 
+   for ( i = 0; i < blocks; i++ )
+      TF1024( ctx->chaining, &in[ i * SIZE512 ] );
+   ctx->buf_ptr = blocks * SIZE512;
+
+   // copy any remaining data to buffer, rem is always zero here because
+   // rem_ptr was reset by the init above
+   for ( i = 0; i < len % SIZE512; i++ )
+       ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
+   i += rem;    // use i as rem_ptr in final
+
+   //--- final ---
+
+   blocks++;      // adjust for final block
+   // i indexes the buffer, so test it against the buffer size, not len.
+   if ( i == SIZE512 - 1 )
+   {
+       // only 128 bits left in buffer, all padding at once
+       ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
+                                           0,0,0,0, 0,0,0,0x80 );
+   }
+   else
+   {
+       // add first padding
+       ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
+                                      0,0,0,0, 0,0,0,0x80 );
+       // add zero padding
+       for ( i += 1; i < SIZE512 - 1; i++ )
+           ctx->buffer[i] = _mm_setzero_si128();
+
+       // add length padding, second last byte is zero unless blocks > 255
+       ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
+                                           0,         0 ,0,0, 0,0,0,0 );
+   }
+
+   // digest final padding block and do output transform
+   TF1024( ctx->chaining, ctx->buffer );
+
+   OF1024( ctx->chaining );
+
+   // store hash result in output 
+   for ( i = 0; i < hashlen_m128i; i++ )
+      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
+
+   return 0;
+}
+   
+
 HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
                                const void* input, DataLength_gr databitlen )
 {
--- a/algo/groestl/aes_ni/hash-groestl.h
+++ b/algo/groestl/aes_ni/hash-groestl.h
@@ -87,5 +87,6 @@ HashReturn_gr final_groestl( hashState_groestl*, void* );

 HashReturn_gr update_and_final_groestl( hashState_groestl*,  void*,
                                        const void*, DataLength_gr );
+int groestl512_full( hashState_groestl*,  void*, const void*, uint64_t );

 #endif /* __hash_h */
--- a/algo/groestl/groestl256-hash-4way.c
+++ b/algo/groestl/groestl256-hash-4way.c
@@ -15,7 +15,7 @@
 #include "miner.h"
 #include "simd-utils.h"

-#if defined(__VAES__)
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)


 int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
--- a/algo/groestl/groestl256-hash-4way.h
+++ b/algo/groestl/groestl256-hash-4way.h
@@ -18,6 +18,8 @@
 #endif
 #include <stdlib.h>

+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+   
 #define LENGTH (256)

 //#include "brg_endian.h"
@@ -69,4 +71,5 @@ int groestl256_4way_init( groestl256_4way_context*, uint64_t );
 int groestl256_4way_update_close( groestl256_4way_context*,  void*,
                                        const void*, uint64_t );

+#endif
 #endif 
--- a/algo/groestl/groestl512-hash-4way.c
+++ b/algo/groestl/groestl512-hash-4way.c
@@ -15,29 +15,22 @@
 #include "miner.h"
 #include "simd-utils.h"

-#if defined(__VAES__)
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
 {
  int i;

-  ctx->hashlen = hashlen;
  SET_CONSTANTS();

  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return 1;

-  for ( i = 0; i < SIZE512; i++ )
-  {
-     ctx->chaining[i] = m512_zero;
-     ctx->buffer[i]   = m512_zero;
-  }
+  memset_zero_512( ctx->chaining, SIZE512 );
+  memset_zero_512( ctx->buffer, SIZE512 );

  // The only non-zero in the IV is len. It can be hard coded.
  ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
-//  uint64_t len = U64BIG((uint64_t)LENGTH);
-//  ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
-//  INIT_4way(ctx->chaining);

  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;
@@ -49,7 +42,7 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
                                const void* input, uint64_t databitlen )
 {
   const int len = (int)databitlen / 128;
-   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
+   const int hashlen_m128i = 64 / 16;   // bytes to __m128i
   const int hash_offset = SIZE512 - hashlen_m128i;
   int rem = ctx->rem_ptr;
   int blocks = len / SIZE512;
@@ -58,16 +51,13 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,

   // --- update ---

-   // digest any full blocks, process directly from input 
   for ( i = 0; i < blocks; i++ )
      TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
   ctx->buf_ptr = blocks * SIZE512;

-   // copy any remaining data to buffer, it may already contain data
-   // from a previous update for a midstate precalc
   for ( i = 0; i < len % SIZE512; i++ )
       ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
-   i += rem;    // use i as rem_ptr in final
+   i += rem; 

   //--- final ---

@@ -81,23 +71,71 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
   }   
   else
   {
-       // add first padding
       ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
-       // add zero padding
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = m512_zero;
-
-       // add length padding, second last byte is zero unless blocks > 255
       ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
                   blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
   }

-// digest final padding block and do output transform
   TF1024_4way( ctx->chaining, ctx->buffer );
-
   OF1024_4way( ctx->chaining );

-   // store hash result in output 
+   for ( i = 0; i < hashlen_m128i; i++ )
+      casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
+
+   return 0;
+}
+
+int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
+                          const void* input, uint64_t datalen )
+{
+   const int len = (int)datalen >> 4;
+   const int hashlen_m128i = 64 >> 4;   // bytes to __m128i
+   const int hash_offset = SIZE512 - hashlen_m128i;
+   uint64_t blocks = len / SIZE512;
+   __m512i* in = (__m512i*)input;
+   int i;
+
+   // --- init ---
+
+   SET_CONSTANTS();
+   memset_zero_512( ctx->chaining, SIZE512 );
+   memset_zero_512( ctx->buffer, SIZE512 );
+   ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
+   ctx->buf_ptr = 0;
+   ctx->rem_ptr = 0;
+
+   // --- update ---
+
+   for ( i = 0; i < blocks; i++ )
+      TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
+   ctx->buf_ptr = blocks * SIZE512;
+
+   for ( i = 0; i < len % SIZE512; i++ )
+       ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
+   i += ctx->rem_ptr;
+
+   // --- close ---
+
+   blocks++;   
+
+   if ( i == SIZE512 - 1 )
+   {
+       // only 1 vector left in buffer, all padding at once
+       ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
+   }
+   else
+   {
+       ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
+       for ( i += 1; i < SIZE512 - 1; i++ )
+           ctx->buffer[i] = m512_zero;
+       ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
+   }
+
+   TF1024_4way( ctx->chaining, ctx->buffer );
+   OF1024_4way( ctx->chaining );
+
   for ( i = 0; i < hashlen_m128i; i++ )
      casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];

--- a/algo/groestl/groestl512-hash-4way.h
+++ b/algo/groestl/groestl512-hash-4way.h
@@ -1,11 +1,3 @@
-/* hash.h     Aug 2011
- *
- * Groestl implementation for different versions.
- * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
- *
- * This code is placed in the public domain
- */
-
 #if !defined(GROESTL512_HASH_4WAY_H__)
 #define GROESTL512_HASH_4WAY_H__ 1

@@ -18,11 +10,9 @@
 #endif
 #include <stdlib.h>

-#define LENGTH (512)
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-//#include "brg_endian.h"
-//#define NEED_UINT_64T
-//#include "algo/sha/brg_types.h"
+#define LENGTH (512)

 /* some sizes (number of bytes) */
 #define ROWS (8)
@@ -44,34 +34,11 @@
 #define ROUNDS (ROUNDS1024)
 //#endif

-/*
-#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
-
-#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
-#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
-#define U64BIG(a) (a)
-#endif // IS_BIG_ENDIAN 
-
-#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
-#define U64BIG(a) \
-  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
-   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
-   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
-   (ROTL64(a,56) & li_64(FF000000FF000000)))
-#endif // IS_LITTLE_ENDIAN 
-
-typedef unsigned char BitSequence_gr;
-typedef unsigned long long DataLength_gr;
-typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
-*/
-
 #define SIZE512 (SIZE_1024/16)

 typedef struct {
  __attribute__ ((aligned (128))) __m512i chaining[SIZE512];
  __attribute__ ((aligned (64))) __m512i buffer[SIZE512];
-  int hashlen;       // byte
  int blk_count;     // SIZE_m128i
  int buf_ptr;       // __m128i offset
  int rem_ptr;
@@ -85,10 +52,11 @@ int groestl512_4way_init( groestl512_4way_context*, uint64_t );

 int groestl512_4way_update( groestl512_4way_context*, const void*,
                              uint64_t );
-
 int groestl512_4way_close( groestl512_4way_context*, void* );
-
 int groestl512_4way_update_close( groestl512_4way_context*,  void*,
                                        const void*, uint64_t );
+int groestl512_4way_full( groestl512_4way_context*,  void*,
+                          const void*, uint64_t );

-#endif /* __hash_h */
+#endif   // VAES
+#endif   // GROESTL512_HASH_4WAY_H__
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -161,7 +161,7 @@ bool register_hodl_algo( algo_gate_t* gate )
 //     return false;
 //  }
  pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
-  gate->optimizations         = AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations         = SSE42_OPT | AES_OPT | AVX2_OPT;
  gate->scanhash              = (void*)&hodl_scanhash;
  gate->get_new_work          = (void*)&hodl_get_new_work;
  gate->longpoll_rpc_call     = (void*)&hodl_longpoll_rpc_call;
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -41,57 +41,10 @@
 extern "C"{
 #endif

-
-#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
-#define SPH_SMALL_FOOTPRINT_JH   1
-#endif
-
-#if !defined SPH_JH_64 && SPH_64_TRUE
-#define SPH_JH_64   1
-#endif
-
-#if !SPH_64
-#undef SPH_JH_64
-#endif
-
 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif

-/*
- * The internal bitslice representation may use either big-endian or
- * little-endian (true bitslice operations do not care about the bit
- * ordering, and the bit-swapping linear operations in JH happen to
- * be invariant through endianness-swapping). The constants must be
- * defined according to the chosen endianness; we use some
- * byte-swapping macros for that.
- */
-
-#if SPH_LITTLE_ENDIAN
-
-#if SPH_64
-#define C64e(x)     ((SPH_C64(x) >> 56) \
-                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
-                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
-                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
-                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
-                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
-                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
-                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
-#define dec64e_aligned   sph_dec64le_aligned
-#define enc64e           sph_enc64le
-#endif
-
-#else
-
-#if SPH_64
-#define C64e(x)     SPH_C64(x)
-#define dec64e_aligned   sph_dec64be_aligned
-#define enc64e           sph_enc64be
-#endif
-
-#endif
-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define Sb_8W(x0, x1, x2, x3, c) \
@@ -152,8 +105,97 @@ do { \
    x3 = _mm256_xor_si256( x3, x4 ); \
 } while (0)

-#if SPH_JH_64
+static const uint64_t C[] =
+{
+   0x67f815dfa2ded572, 0x571523b70a15847b,
+   0xf6875a4d90d6ab81, 0x402bd1c3c54f9f4e,
+   0x9cfa455ce03a98ea, 0x9a99b26699d2c503,
+   0x8a53bbf2b4960266, 0x31a2db881a1456b5,
+   0xdb0e199a5c5aa303, 0x1044c1870ab23f40,
+   0x1d959e848019051c, 0xdccde75eadeb336f,
+   0x416bbf029213ba10, 0xd027bbf7156578dc,
+   0x5078aa3739812c0a, 0xd3910041d2bf1a3f,
+   0x907eccf60d5a2d42, 0xce97c0929c9f62dd,
+   0xac442bc70ba75c18, 0x23fcc663d665dfd1,
+   0x1ab8e09e036c6e97, 0xa8ec6c447e450521,
+   0xfa618e5dbb03f1ee, 0x97818394b29796fd,
+   0x2f3003db37858e4a, 0x956a9ffb2d8d672a,
+   0x6c69b8f88173fe8a, 0x14427fc04672c78a,
+   0xc45ec7bd8f15f4c5, 0x80bb118fa76f4475,
+   0xbc88e4aeb775de52, 0xf4a3a6981e00b882,
+   0x1563a3a9338ff48e, 0x89f9b7d524565faa,
+   0xfde05a7c20edf1b6, 0x362c42065ae9ca36,
+   0x3d98fe4e433529ce, 0xa74b9a7374f93a53,
+   0x86814e6f591ff5d0, 0x9f5ad8af81ad9d0e,
+   0x6a6234ee670605a7, 0x2717b96ebe280b8b,
+   0x3f1080c626077447, 0x7b487ec66f7ea0e0,
+   0xc0a4f84aa50a550d, 0x9ef18e979fe7e391,
+   0xd48d605081727686, 0x62b0e5f3415a9e7e,
+   0x7a205440ec1f9ffc, 0x84c9f4ce001ae4e3,
+   0xd895fa9df594d74f, 0xa554c324117e2e55,
+   0x286efebd2872df5b, 0xb2c4a50fe27ff578,
+   0x2ed349eeef7c8905, 0x7f5928eb85937e44,
+   0x4a3124b337695f70, 0x65e4d61df128865e,
+   0xe720b95104771bc7, 0x8a87d423e843fe74,
+   0xf2947692a3e8297d, 0xc1d9309b097acbdd,
+   0xe01bdc5bfb301b1d, 0xbf829cf24f4924da,
+   0xffbf70b431bae7a4, 0x48bcf8de0544320d,
+   0x39d3bb5332fcae3b, 0xa08b29e0c1c39f45,
+   0x0f09aef7fd05c9e5, 0x34f1904212347094,
+   0x95ed44e301b771a2, 0x4a982f4f368e3be9,
+   0x15f66ca0631d4088, 0xffaf52874b44c147,
+   0x30c60ae2f14abb7e, 0xe68c6eccc5b67046,
+   0x00ca4fbd56a4d5a4, 0xae183ec84b849dda,
+   0xadd1643045ce5773, 0x67255c1468cea6e8,
+   0x16e10ecbf28cdaa3, 0x9a99949a5806e933,
+   0x7b846fc220b2601f, 0x1885d1a07facced1,
+   0xd319dd8da15b5932, 0x46b4a5aac01c9a50,
+   0xba6b04e467633d9f, 0x7eee560bab19caf6,
+   0x742128a9ea79b11f, 0xee51363b35f7bde9,
+   0x76d350755aac571d, 0x01707da3fec2463a,
+   0x42d8a498afc135f7, 0x79676b9e20eced78,
+   0xa8db3aea15638341, 0x832c83324d3bc3fa,
+   0xf347271c1f3b40a7, 0x9a762db734f04059,
+   0xfd4f21d26c4e3ee7, 0xef5957dc398dfdb8,
+   0xdaeb492b490c9b8d, 0x0d70f36849d7a25b,
+   0x84558d7ad0ae3b7d, 0x658ef8e4f0e9a5f5,
+   0x533b1036f4a2b8a0, 0x5aec3e759e07a80c,
+   0x4f88e85692946891, 0x4cbcbaf8555cb05b,
+   0x7b9487f3993bbbe3, 0x5d1c6b72d6f4da75,
+   0x6db334dc28acae64, 0x71db28b850a5346c,
+   0x2a518d10f2e261f8, 0xfc75dd593364dbe3,
+   0xa23fce43f1bcac1c, 0xb043e8023cd1bb67,
+   0x75a12988ca5b0a33, 0x5c5316b44d19347f,
+   0x1e4d790ec3943b92, 0x3fafeeb6d7757479,
+   0x21391abef7d4a8ea, 0x5127234c097ef45c,
+   0xd23c32ba5324a326, 0xadd5a66d4a17a344,
+   0x08c9f2afa63e1db5, 0x563c6b91983d5983,
+   0x4d608672a17cf84c, 0xf6c76e08cc3ee246,
+   0x5e76bcb1b333982f, 0x2ae6c4efa566d62b,
+   0x36d4c1bee8b6f406, 0x6321efbc1582ee74,
+   0x69c953f40d4ec1fd, 0x26585806c45a7da7,
+   0x16fae0061614c17e, 0x3f9d63283daf907e,
+   0x0cd29b00e3f2c9d2, 0x300cd4b730ceaa5f,
+   0x9832e0f216512a74, 0x9af8cee3d830eb0d,
+   0x9279f1b57b9ec54b, 0xd36886046ee651ff,
+   0x316796e6574d239b, 0x05750a17f3a6e6cc,
+   0xce6c3213d98176b1, 0x62a205f88452173c,
+   0x47154778b3cb2bf4, 0x486a9323825446ff,
+   0x65655e4e0758df38, 0x8e5086fc897cfcf2,
+   0x86ca0bd0442e7031, 0x4e477830a20940f0,
+   0x8338f7d139eea065, 0xbd3a2ce437e95ef7,
+   0x6ff8130126b29721, 0xe7de9fefd1ed44a3,
+   0xd992257615dfa08b, 0xbe42dc12f6f7853c,
+   0x7eb027ab7ceca7d8, 0xdea83eaada7d8d53,
+   0xd86902bd93ce25aa, 0xf908731afd43f65a,
+   0xa5194a17daef5fc0, 0x6a21fd4c33664d97,
+   0x701541db3198b435, 0x9b54cdedbb0f1eea,
+   0x72409751a163d09a, 0xe26f4791bf9d75f6
+};

+// Big endian version
+
+/*
 static const sph_u64 C[] = {
 	C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
 	C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
@@ -240,6 +282,7 @@ static const sph_u64 C[] = {
 	C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
 	C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
 };
+*/

 #define Ceven_hi(r)   (C[((r) << 2) + 0])
 #define Ceven_lo(r)   (C[((r) << 2) + 1])
@@ -427,7 +470,7 @@ do { \
   h7h = _mm256_xor_si256( h7h, m3h ); \
   h7l = _mm256_xor_si256( h7l, m3l ); \

-
+/*
 static const sph_u64 IV256[] = {
 	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
 	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
@@ -450,11 +493,8 @@ static const sph_u64 IV512[] = {
 	C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
 	C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
 };
+*/

-#else
-
-
-#endif

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

@@ -484,57 +524,6 @@ static const sph_u64 IV512[] = {
 		W ## ro(h7); \
 	} while (0)

-#if SPH_SMALL_FOOTPRINT_JH
-
-#if SPH_JH_64
-
-/*
- * The "small footprint" 64-bit version just uses a partially unrolled
- * loop.
- */
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-#define E8_8W   do { \
-      unsigned r; \
-      for (r = 0; r < 42; r += 7) { \
-         SL_8W(0); \
-         SL_8W(1); \
-         SL_8W(2); \
-         SL_8W(3); \
-         SL_8W(4); \
-         SL_8W(5); \
-         SL_8W(6); \
-      } \
-   } while (0)
-
-#endif
-
-#define E8   do { \
-		unsigned r; \
-		for (r = 0; r < 42; r += 7) { \
-			SL(0); \
-			SL(1); \
-			SL(2); \
-			SL(3); \
-			SL(4); \
-			SL(5); \
-			SL(6); \
-		} \
-	} while (0)
-
-#else
-
-
-#endif
-
-#else
-
-#if SPH_JH_64
-
-/*
- * On a "true 64-bit" architecture, we can unroll at will.
- */

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

@@ -585,6 +574,7 @@ static const sph_u64 IV512[] = {

 #endif  // AVX512

+
 #define E8   do { \
      SLu( 0, 0); \
      SLu( 1, 1); \
@@ -630,13 +620,6 @@ static const sph_u64 IV512[] = {
      SLu(41, 6); \
   } while (0)

-#else
-
-
-#endif
-
-#endif
-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 void jh256_8way_init( jh_8way_context *sc )
@@ -732,12 +715,12 @@ jh_8way_core( jh_8way_context *sc, const void *data, size_t len )

 static void
 jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
-               size_t out_size_w32, const void *iv )
+               size_t out_size_w32 )
 {
   __m512i buf[16*4];
   __m512i *dst512 = (__m512i*)dst;
   size_t numz, u;
-   sph_u64 l0, l1, l0e, l1e;
+   uint64_t l0, l1;

   buf[0] = m512_const1_64( 0x80ULL );

@@ -748,12 +731,10 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,

   memset_zero_512( buf+1, (numz>>3) - 1 );

-   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
-   l1 = SPH_T64(sc->block_count >> 55);
-   sph_enc64be( &l0e, l0 );
-   sph_enc64be( &l1e, l1 );
-   *(buf + (numz>>3)    ) = _mm512_set1_epi64( l1e );
-   *(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );
+   l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
+   l1 = ( sc->block_count >> 55 );
+   *(buf + (numz>>3)    ) = _mm512_set1_epi64( bswap_64( l1 ) );
+   *(buf + (numz>>3) + 1) = _mm512_set1_epi64( bswap_64( l0 ) );

   jh_8way_core( sc, buf, numz + 16 );

@@ -772,7 +753,7 @@ jh256_8way_update(void *cc, const void *data, size_t len)
 void
 jh256_8way_close(void *cc, void *dst)
 {
-   jh_8way_close(cc, 0, 0, dst, 8, IV256);
+   jh_8way_close(cc, 0, 0, dst, 8);
 }

 void
@@ -784,7 +765,7 @@ jh512_8way_update(void *cc, const void *data, size_t len)
 void
 jh512_8way_close(void *cc, void *dst)
 {
-   jh_8way_close(cc, 0, 0, dst, 16, IV512);
+   jh_8way_close(cc, 0, 0, dst, 16);
 }

 #endif
@@ -882,12 +863,12 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )

 static void
 jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
-               size_t out_size_w32, const void *iv )
+               size_t out_size_w32 )
 {
   __m256i buf[16*4];
   __m256i *dst256 = (__m256i*)dst;
   size_t numz, u;
-   sph_u64 l0, l1, l0e, l1e;
+   uint64_t l0, l1;

   buf[0] = m256_const1_64( 0x80ULL );

@@ -898,12 +879,10 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,

   memset_zero_256( buf+1, (numz>>3) - 1 );   

-   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
-   l1 = SPH_T64(sc->block_count >> 55);
-   sph_enc64be( &l0e, l0 );
-   sph_enc64be( &l1e, l1 );
-   *(buf + (numz>>3)    ) = _mm256_set1_epi64x( l1e );
-   *(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e ); 
+   l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
+   l1 = ( sc->block_count >> 55 );
+   *(buf + (numz>>3)    ) = _mm256_set1_epi64x( bswap_64( l1 ) );
+   *(buf + (numz>>3) + 1) = _mm256_set1_epi64x( bswap_64( l0 ) );

   jh_4way_core( sc, buf, numz + 16 );

@@ -922,7 +901,7 @@ jh256_4way_update(void *cc, const void *data, size_t len)
 void
 jh256_4way_close(void *cc, void *dst)
 {
-	jh_4way_close(cc, 0, 0, dst, 8, IV256);
+	jh_4way_close(cc, 0, 0, dst, 8 );
 }

 void
@@ -934,7 +913,7 @@ jh512_4way_update(void *cc, const void *data, size_t len)
 void
 jh512_4way_close(void *cc, void *dst)
 {
-	jh_4way_close(cc, 0, 0, dst, 16, IV512);
+	jh_4way_close(cc, 0, 0, dst, 16 );
 }


--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -43,7 +43,6 @@ extern "C"{
 #endif

 #include <stddef.h>
-#include "algo/sha/sph_types.h"
 #include "simd-utils.h"

 #define SPH_SIZE_jh256   256
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -65,7 +65,7 @@ void jha_hash_4way( void *out, const void *input )
          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );

       blake512_4way_init( &ctx_blake );
-       blake512_4way( &ctx_blake, vhash, 64 );
+       blake512_4way_update( &ctx_blake, vhash, 64 );
       blake512_4way_close( &ctx_blake, vhashA );

       jh512_4way_init( &ctx_jh );
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -163,7 +163,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
    unsigned eb;
    union {
       __m512i tmp[lim + 1];
-       sph_u64 dummy;   /* for alignment */
+       uint64_t dummy;   /* for alignment */
    } u;
    size_t j;
    size_t m512_len = byte_len >> 3;
@@ -344,7 +344,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    unsigned eb;
    union {
       __m256i tmp[lim + 1];
-       sph_u64 dummy;   /* for alignment */
+       uint64_t dummy;   /* for alignment */
    } u;
    size_t j;
    size_t m256_len = byte_len >> 3;
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -43,16 +43,8 @@ extern "C"{
 #ifdef  __AVX2__

 #include <stddef.h>
-#include "algo/sha/sph_types.h"
 #include "simd-utils.h"

-#define SPH_SIZE_keccak256   256
-
-/**
- * Output size (in bits) for Keccak-512.
- */
-#define SPH_SIZE_keccak512   512
-
 /**
 * This structure is a context for Keccak computations: it contains the
 * intermediate values and some data from the last entered block. Once a
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -459,6 +459,11 @@ int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
    return 0;
 }

+int luffa512_4way_init( luffa_4way_context *state )
+{   // Convenience wrapper: luffa_4way_init with hashbitlen fixed at 512.
+   return luffa_4way_init( state, 512 );
+}
+   
 // Do not call luffa_update_close after having called luffa_update.
 // Once luffa_update has been called only call luffa_update or luffa_close.
 int luffa_4way_update( luffa_4way_context *state, const void *data,
@@ -496,6 +501,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
    return 0;
 }

+/*
+int luffa512_4way_update( luffa_4way_context *state, const void *data,
+                       size_t len )
+{
+   return luffa_4way_update( state, data, len );
+}
+*/
+
 int luffa_4way_close( luffa_4way_context *state, void *hashval )
 {
    __m512i *buffer = (__m512i*)state->buffer;
@@ -518,6 +531,77 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval )
    return 0;
 }

+/*
+int luffa512_4way_close( luffa_4way_context *state, void *hashval )
+{
+   return luffa_4way_close( state, hashval );
+}
+*/
+
+int luffa512_4way_full( luffa_4way_context *state, void *output,
+                        const void *data, size_t inlen )
+{   // One-shot hash: resets *state, processes data, pads, finalizes to output.
+    state->hashbitlen = 512;
+    __m128i *iv = (__m128i*)IV;
+    // Reload all 10 chaining values from IV, one 128-bit entry each.
+    state->chainv[0] = m512_const1_128( iv[0] );
+    state->chainv[1] = m512_const1_128( iv[1] );
+    state->chainv[2] = m512_const1_128( iv[2] );
+    state->chainv[3] = m512_const1_128( iv[3] );
+    state->chainv[4] = m512_const1_128( iv[4] );
+    state->chainv[5] = m512_const1_128( iv[5] );
+    state->chainv[6] = m512_const1_128( iv[6] );
+    state->chainv[7] = m512_const1_128( iv[7] );
+    state->chainv[8] = m512_const1_128( iv[8] );
+    state->chainv[9] = m512_const1_128( iv[9] );
+
+    ((__m512i*)state->buffer)[0] = m512_zero;
+    ((__m512i*)state->buffer)[1] = m512_zero;
+
+    const __m512i *vdata  = (__m512i*)data;
+    __m512i msg[2];
+    int i;
+    const int blocks = (int)( inlen >> 5 );   // whole 32-byte blocks in inlen
+    const __m512i shuff_bswap32 = m512_const_64(   // byte-swaps each 32-bit word
+                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
+                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
+                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+    state->rembytes = inlen & 0x1F;   // inlen modulo 32
+
+    // Full blocks: two 512-bit vectors of data are consumed per 32 bytes of inlen.
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       rnd512_4way( state, msg );
+    }
+
+    // Any non-zero remainder is read as exactly one vector (16 bytes of inlen,
+    if ( state->rembytes  )   // as with an 80 byte len); other sizes: TODO confirm
+    {
+       // Remaining data fills msg[0]; the pad constant fills msg[1].
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = m512_const2_64( 0, 0x0000000080000000 );
+       rnd512_4way( state, msg );
+    }
+    else
+    {
+       // No remainder: the block holds only the pad constant, rest is zero.
+       msg[0] = m512_const2_64( 0, 0x0000000080000000 );
+       msg[1] = m512_zero;
+       rnd512_4way( state, msg );
+    }
+
+    finalization512_4way( state, (uint32*)output );
+
+    if ( state->hashbitlen > 512 )   // NOTE(review): hashbitlen was set to 512 above
+        finalization512_4way( state, (uint32*)( output+64 ) );
+
+    return 0;   // always 0; no error paths in this function
+}
+
 int luffa_4way_update_close( luffa_4way_context *state,
                 void *output, const void *data, size_t inlen )
 {
@@ -1031,6 +1115,69 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
    return 0;
 }

+int luffa512_2way_full( luffa_2way_context *state, void *output,
+                        const void *data, size_t inlen )
+{   // One-shot hash: resets *state, processes data, pads, finalizes to output.
+    state->hashbitlen = 512;
+    __m128i *iv = (__m128i*)IV;
+    // Reload all 10 chaining values from IV, one 128-bit entry each.
+    state->chainv[0] = m256_const1_128( iv[0] );
+    state->chainv[1] = m256_const1_128( iv[1] );
+    state->chainv[2] = m256_const1_128( iv[2] );
+    state->chainv[3] = m256_const1_128( iv[3] );
+    state->chainv[4] = m256_const1_128( iv[4] );
+    state->chainv[5] = m256_const1_128( iv[5] );
+    state->chainv[6] = m256_const1_128( iv[6] );
+    state->chainv[7] = m256_const1_128( iv[7] );
+    state->chainv[8] = m256_const1_128( iv[8] );
+    state->chainv[9] = m256_const1_128( iv[9] );
+
+    ((__m256i*)state->buffer)[0] = m256_zero;
+    ((__m256i*)state->buffer)[1] = m256_zero;
+
+    const __m256i *vdata  = (__m256i*)data;
+    __m256i msg[2];
+    int i;
+    const int blocks = (int)( inlen >> 5 );   // whole 32-byte blocks in inlen
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );
+
+    state->rembytes = inlen & 0x1F;   // inlen modulo 32
+
+    // Full blocks: two 256-bit vectors of data are consumed per 32 bytes of inlen.
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );   // byte-swap u32s
+       msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       rnd512_2way( state, msg );
+    }
+
+    // Any non-zero remainder is read as exactly one vector (16 bytes of inlen,
+    if ( state->rembytes  )   // as with an 80 byte len); other sizes: TODO confirm
+    {
+       // Remaining data fills msg[0]; the pad constant fills msg[1].
+       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = m256_const2_64( 0, 0x0000000080000000 );
+       rnd512_2way( state, msg );
+    }
+    else
+    {
+       // No remainder: the block holds only the pad constant, rest is zero.
+       msg[0] = m256_const2_64( 0, 0x0000000080000000 );
+       msg[1] = m256_zero;
+       rnd512_2way( state, msg );
+    }
+
+    finalization512_2way( state, (uint32*)output );
+
+    if ( state->hashbitlen > 512 )   // NOTE(review): hashbitlen was set to 512 above
+        finalization512_2way( state, (uint32*)( output+32 ) );
+
+    return 0;   // always 0; no error paths in this function
+}
+
 int luffa_2way_update_close( luffa_2way_context *state,
                 void *output, const void *data, size_t inlen )
 {
--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -61,11 +61,23 @@ typedef struct {
 } luffa_4way_context __attribute((aligned(128)));

 int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
-int luffa_4way_update( luffa_4way_context *state, const void *data,
-                       size_t len );
-int luffa_4way_close( luffa_4way_context *state, void *hashval );
+//int luffa_4way_update( luffa_4way_context *state, const void *data,
+//                       size_t len );
+//int luffa_4way_close( luffa_4way_context *state, void *hashval );
 int luffa_4way_update_close( luffa_4way_context *state, void *output,
                                   const void *data, size_t inlen );
+int luffa512_4way_full( luffa_4way_context *state, void *output,
+                         const void *data, size_t inlen );
+int luffa512_4way_init( luffa_4way_context *state );
+int luffa512_4way_update( luffa_4way_context *state, const void *data,
+                       size_t len );
+int luffa512_4way_close( luffa_4way_context *state, void *hashval );
+int luffa512_4way_update_close( luffa_4way_context *state, void *output,
+                                const void *data, size_t inlen );
+
+#define luffa_4way_update       luffa512_4way_update
+#define luffa_4way_close        luffa512_4way_close
+#define luffa_4way_update_close luffa512_4way_update_close

 #endif

@@ -82,6 +94,8 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
 int luffa_2way_close( luffa_2way_context *state, void *hashval );
 int luffa_2way_update_close( luffa_2way_context *state, void *output,
                                   const void *data, size_t inlen );
+int luffa512_2way_full( luffa_2way_context *state, void *output,
+                         const void *data, size_t inlen );

 #endif
 #endif
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -262,7 +262,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   const uint32_t last_nonce = max_nonce - 8;
+   const uint32_t last_nonce = max_nonce - 16;
   const uint32_t Htarg = ptarget[7];
   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated
@@ -320,18 +320,19 @@ bool init_allium_8way_ctx()
   return true;
 }

-void allium_8way_hash( void *state, const void *input )
+void allium_8way_hash( void *hash, const void *input )
 {
-   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
-   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
-   uint32_t hash0[8] __attribute__ ((aligned (32)));
-   uint32_t hash1[8] __attribute__ ((aligned (32)));
-   uint32_t hash2[8] __attribute__ ((aligned (32)));
-   uint32_t hash3[8] __attribute__ ((aligned (32)));
-   uint32_t hash4[8] __attribute__ ((aligned (64)));
-   uint32_t hash5[8] __attribute__ ((aligned (32)));
-   uint32_t hash6[8] __attribute__ ((aligned (32)));
-   uint32_t hash7[8] __attribute__ ((aligned (32)));
+   uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
+   uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
+//   uint64_t hash[4*8] __attribute__ ((aligned (64)));
+   uint64_t *hash0 = (uint64_t*)hash;
+   uint64_t *hash1 = (uint64_t*)hash+ 4;
+   uint64_t *hash2 = (uint64_t*)hash+ 8;
+   uint64_t *hash3 = (uint64_t*)hash+12;
+   uint64_t *hash4 = (uint64_t*)hash+16;
+   uint64_t *hash5 = (uint64_t*)hash+20;
+   uint64_t *hash6 = (uint64_t*)hash+24;
+   uint64_t *hash7 = (uint64_t*)hash+28;
   allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); 

   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
@@ -398,69 +399,74 @@ void allium_8way_hash( void *state, const void *input )
   dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );

-   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
+   update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
+   update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
+   update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
+   update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
+   update_and_final_groestl256( &ctx.groestl, hash4, hash4, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
+   update_and_final_groestl256( &ctx.groestl, hash5, hash5, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
+   update_and_final_groestl256( &ctx.groestl, hash6, hash6, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
+   update_and_final_groestl256( &ctx.groestl, hash7, hash7, 256 );
 }

 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*8] __attribute__ ((aligned (64)));
+   uint64_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
-   const uint32_t Htarg = ptarget[7];
+   const uint64_t Htarg = ptarget[3];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  
+   const int thr_id = mythr->id;  
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )
+   if unlikely( bench )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
+
   blake256_8way_init( &allium_8way_ctx.blake );
   blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );

   do {
-     *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
-                                                 n+3, n+2, n+1, n ) );
-
     allium_8way_hash( hash, vdata );
-     pdata[19] = n;

-     for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
+     for ( int lane = 0; lane < 8; lane++ )
     {
-        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
+        const uint64_t *lane_hash = hash + (lane<<2);
+        if unlikely( lane_hash[3] <= Htarg )
        {
-           pdata[19] = n + lane;
-           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
+        if likely( ( lane_hash[3] < Htarg && !bench )
+            || valid_hash( lane_hash, ptarget ) )
+        {
+           pdata[19] = bswap_32( n + lane );
+           submit_lane_solution( work, lane_hash, mythr, lane );
         }
+        }
     }
     n += 8;
-   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
-
+     *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+   } while likely( (n <= last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -33,7 +33,7 @@ void lyra2h_4way_hash( void *state, const void *input )
     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

     memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
-     blake256_4way( &ctx_blake, input + (64*4), 16 );
+     blake256_4way_update( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -260,8 +260,8 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
 // Overlap has 2 in Nrows chance reduced to 1 in Nrows because if both
 // overlap it's unified.
 // As a result normal is Nrows-2 / Nrows.
-// for 4 rows: 1 unified, 1 overlap, 2 normal.
-// for 8 rows: 1 unified, 1 overlap, 6 normal.
+// for 4 rows: 1 unified, 2 overlap, 1 normal.
+// for 8 rows: 1 unified, 2 overlap, 56 normal.

 static inline void reducedDuplexRow_2way_normal( uint64_t *State,
                   uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
@@ -338,21 +338,18 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
   _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

-
-
-// rowInOut0 ! = rowInOut1 != rowOut
 static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
                   uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
                            uint64_t *rowOut, uint64_t nCols)
 {
-
   int i;
   register __m512i state0, state1, state2, state3;
   __m512i *in = (__m512i*)rowIn;
   __m512i *inout0 = (__m512i*)rowInOut0;
   __m512i *inout1 = (__m512i*)rowInOut1;
   __m512i *out = (__m512i*)rowOut;
-   inout_ovly io;
+//   inout_ovly io;
+   ovly_512 io0, io1, io2;

   state0 = _mm512_load_si512( (__m512i*)State     );
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -362,6 +359,21 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
   for ( i = 0; i < nCols; i++ )
   {
     //Absorbing "M[prev] [+] M[row*]"
+     io0.v512 = _mm512_mask_blend_epi64( 0xf0,
+                                  _mm512_load_si512( (__m512i*)inout0 ),
+                                  _mm512_load_si512( (__m512i*)inout1 ) );
+     io1.v512 = _mm512_mask_blend_epi64( 0xf0,
+                                  _mm512_load_si512( (__m512i*)inout0 +1 ),
+                                  _mm512_load_si512( (__m512i*)inout1 +1 ) );
+     io2.v512 = _mm512_mask_blend_epi64( 0xf0,
+                                  _mm512_load_si512( (__m512i*)inout0 +2 ),
+                                  _mm512_load_si512( (__m512i*)inout1 +2 ) );
+
+     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
+     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
+     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
+     
+/* 
     io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
                                  _mm512_load_si512( (__m512i*)inout0 ),
                                  _mm512_load_si512( (__m512i*)inout1 ) );
@@ -375,6 +387,7 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
+*/

     //Applies the reduced-round transformation f to the sponge's state
     LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -388,6 +401,21 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
       out[2] = _mm512_xor_si512( out[2], state2 );

       // if out is the same row as inout, update with new data.
+       if ( rowOut == rowInOut0 )
+       {
+          io0.v512 = _mm512_mask_blend_epi64( 0x0f, io0.v512, out[0] );
+          io1.v512 = _mm512_mask_blend_epi64( 0x0f, io1.v512, out[1] );
+          io2.v512 = _mm512_mask_blend_epi64( 0x0f, io2.v512, out[2] );
+
+       }
+       if ( rowOut == rowInOut1 )
+       {
+          io0.v512 = _mm512_mask_blend_epi64( 0xf0, io0.v512, out[0] );
+          io1.v512 = _mm512_mask_blend_epi64( 0xf0, io1.v512, out[1] );
+          io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
+       }
+
+/*
       if ( rowOut == rowInOut0 )
       {
          io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
@@ -401,27 +429,35 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
          io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
          io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
       }
+*/

       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
       t0 = _mm512_permutex_epi64( state0, 0x93 );
       t1 = _mm512_permutex_epi64( state1, 0x93 );
       t2 = _mm512_permutex_epi64( state2, 0x93 );

-       io.v512[0] = _mm512_xor_si512( io.v512[0],
+       io0.v512 = _mm512_xor_si512( io0.v512,
                                 _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
-       io.v512[1] = _mm512_xor_si512( io.v512[1],
+       io1.v512 = _mm512_xor_si512( io1.v512,
                                 _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
-       io.v512[2] = _mm512_xor_si512( io.v512[2],
+       io2.v512 = _mm512_xor_si512( io2.v512,
                                 _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
     }

+      casti_m256i( inout0, 0 ) = io0.v256lo;
+      casti_m256i( inout1, 1 ) = io0.v256hi;
+      casti_m256i( inout0, 2 ) = io1.v256lo;
+      casti_m256i( inout1, 3 ) = io1.v256hi;
+      casti_m256i( inout0, 4 ) = io2.v256lo;
+      casti_m256i( inout1, 5 ) = io2.v256hi;
+/*     
     _mm512_mask_store_epi64( inout0,    0x0f, io.v512[0] );
     _mm512_mask_store_epi64( inout1,    0xf0, io.v512[0] );
     _mm512_mask_store_epi64( inout0 +1, 0x0f, io.v512[1] );
     _mm512_mask_store_epi64( inout1 +1, 0xf0, io.v512[1] );
     _mm512_mask_store_epi64( inout0 +2, 0x0f, io.v512[2] );
     _mm512_mask_store_epi64( inout1 +2, 0xf0, io.v512[2] );
-
+*/
      //Goes to next block
      in     += BLOCK_LEN_M256I;
      inout0 += BLOCK_LEN_M256I;
@@ -566,7 +602,7 @@ static inline void reducedDuplexRow_2way_unified( uint64_t *State,
       inout[1] = _mm512_xor_si512( inout[1],
                                    _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
       inout[2] = _mm512_xor_si512( inout[2],
-                                     _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+                                    _mm512_mask_blend_epi64( 0x11, t2, t1 ) );

       out[0] = _mm512_xor_si512( out[0], state0 );
       out[1] = _mm512_xor_si512( out[1], state1 );
@@ -575,9 +611,9 @@ static inline void reducedDuplexRow_2way_unified( uint64_t *State,
     }

     //Goes to next block
-     in     += BLOCK_LEN_M256I;
+     in    += BLOCK_LEN_M256I;
     inout += BLOCK_LEN_M256I;
-     out    += BLOCK_LEN_M256I;
+     out   += BLOCK_LEN_M256I;
   }

   _mm512_store_si512( (__m512i*)State,     state0 );
@@ -600,8 +636,8 @@ static inline void reducedDuplexRow_2way_unified( uint64_t *State,
 
 //  Wrapper
 inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
-                            uint64_t *rowInOut0, uint64_t *rowInOut1,
-                            uint64_t *rowOut, uint64_t nCols )
+                                   uint64_t *rowInOut0, uint64_t *rowInOut1,
+                                   uint64_t *rowOut, uint64_t nCols )
 {
  if ( rowInOut0 == rowInOut1 )
     reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
@@ -614,18 +650,18 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
 }

 inline void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn,
-                            uint64_t *rowInOut0, uint64_t *rowInOut1,
-                            uint64_t *rowOut, uint64_t nCols )
+                                     uint64_t *rowInOut0, uint64_t *rowInOut1,
+                                     uint64_t *rowOut, uint64_t nCols )
 {
-    if ( rowInOut0 == rowInOut1 )
+   if ( rowInOut0 == rowInOut1 )
      reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
-    else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
-    {
-       asm ( "nop" );  // This prevents GCC from merging with previous function
-       reducedDuplexRow_2way_overlap_X( State, rowIn, rowInOut0, rowInOut1,
-                                      rowOut, nCols );
-    }
-    else
+   else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
+   {
+      asm volatile ( "nop" );  // Prevent GCC from optimizing
+      reducedDuplexRow_2way_overlap_X( State, rowIn, rowInOut0, rowInOut1,
+                                       rowOut, nCols );
+   }
+   else
      reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
                                    rowOut, nCols );
 }
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -203,6 +203,18 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+union _ovly_512   // one 512-bit vector, viewable whole or as two 256-bit halves
+{
+  __m512i v512;        // the full vector
+  struct
+  {
+     __m256i v256lo;   // overlays the first 32 bytes of v512
+     __m256i v256hi;   // overlays the second 32 bytes of v512
+  };
+};
+typedef union _ovly_512 ovly_512;
+
+
 union _inout_ovly
 {
   __m512i v512[3];
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -149,7 +149,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
    char data_str[161], hash_str[65], target_str[65];
    //uint8_t *bdata = 0;
    uint8_t bdata[8192] __attribute__ ((aligned (64)));
-    int rc = 0, i, digits;
+    int i, digits;
    int bytes;
    size_t p = sizeof(unsigned long), a = 64/p, b = 32/p;

@@ -267,48 +267,41 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
            SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
        }

-// rewrite to use 64 bit test.        
-        const unsigned char *hash_ = (const unsigned char *)hash;
-        const unsigned char *target_ = (const unsigned char *)ptarget;
-        for ( i = 31; i >= 0; i-- )
+
+        if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget ) 
+             && !opt_benchmark ) )
+
+
+//        if ( unlikely( hash[7] <= ptarget[7] ) )
+//        if ( likely( fulltest( hash, ptarget ) && !opt_benchmark ) )        
        {
-	        if ( hash_[i] != target_[i] )
+           if ( opt_debug )
           {
-		        rc = hash_[i] < target_[i];
-		        break;
-	        }
-        }
-        if ( unlikely(rc) )
-        {
-            if ( opt_debug )
-            {
-                bin2hex(hash_str, (unsigned char *)hash, 32);
-                bin2hex(target_str, (unsigned char *)ptarget, 32);
-                bin2hex(data_str, (unsigned char *)data, 80);
-                applog(LOG_DEBUG, "DEBUG: [%d thread] Found share!\ndata   %s\nhash   %s\ntarget %s", thr_id, 
-                    data_str,
-                    hash_str,
-                    target_str);
+                bin2hex( hash_str, (unsigned char *)hash, 32 );
+                bin2hex( target_str, (unsigned char *)ptarget, 32 );
+                bin2hex( data_str, (unsigned char *)data, 80 );
+                applog( LOG_DEBUG, "DEBUG: [%d thread] Found share!\ndata   %s\nhash   %s\ntarget %s",
+                      thr_id, data_str, hash_str, target_str );
            }
            pdata[19] = data[19];
            submit_solution( work, hash, mythr );
        }
-    } while (n < max_nonce && !work_restart[thr_id].restart);
+    } while ( n < max_nonce && !work_restart[thr_id].restart );

     pdata[19] = n;

-     mpf_set_prec_raw(magifpi, prec0);
-     mpf_set_prec_raw(magifpi0, prec0);
-     mpf_set_prec_raw(mptmp, prec0);
-     mpf_set_prec_raw(mpt1, prec0);
-     mpf_set_prec_raw(mpt2, prec0);
-     mpf_clear(magifpi);
-     mpf_clear(magifpi0);
-     mpf_clear(mpten);
-     mpf_clear(mptmp);
-     mpf_clear(mpt1);
-     mpf_clear(mpt2);
-     mpz_clears(magipi, magisw, product, bns0, bns1, NULL);
+     mpf_set_prec_raw( magifpi, prec0 );
+     mpf_set_prec_raw( magifpi0, prec0 );
+     mpf_set_prec_raw( mptmp, prec0 );
+     mpf_set_prec_raw( mpt1, prec0 );
+     mpf_set_prec_raw( mpt2, prec0 );
+     mpf_clear( magifpi );
+     mpf_clear( magifpi0 );
+     mpf_clear( mpten );
+     mpf_clear( mptmp );
+     mpf_clear( mpt1 );
+     mpf_clear( mpt2 );
+     mpz_clears( magipi, magisw, product, bns0, bns1, NULL );

    *hashes_done = n - first_nonce + 1;
    return 0;
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -3,11 +3,9 @@

 #include <stdio.h>

-// This implementation is deprecated, superseded by VAES in Icelake
-// which provides HW based 4 way aes.
-// It was created for AVX2 to eliminate interleaving between the 
-// preceding and following function.
-// This code can be removed when current users have reverted to one way.
+// This is not truly parallel: it does not perform parallel AES, which
+// requires VAES. It is intended only for use when the preceding and
+// following functions use the same 2x128 interleave.

 #if defined(__AVX2__)

@@ -410,4 +408,94 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
   casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
 }

+void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
+                           const void *data, size_t len )
+{  // One-shot hash: resets ctx (h, ptr, count0..3), absorbs data, pads, writes dst.
+    __m256i *h = (__m256i*)ctx->h;
+    __m128i *iv = (__m128i*)IV512;   // IV512 viewed as four 128-bit words
+
+   h[0] = m256_const1_128( iv[0] );  // presumably the same IV word in both lanes - confirm in simd-utils
+   h[1] = m256_const1_128( iv[1] );
+   h[2] = m256_const1_128( iv[2] );
+   h[3] = m256_const1_128( iv[3] );
+
+   ctx->ptr    =   // chained assignment: buffer position and all four counter words start at 0
+   ctx->count0 =
+   ctx->count1 =
+   ctx->count2 =
+   ctx->count3 = 0;
+
+   unsigned char *buf = ctx->buf;
+   size_t         ptr = ctx->ptr;   // always 0 here, ctx->ptr was reset just above
+
+   // Process full blocks and load buf with the remainder.
+   // len counts bytes per lane; data supplies len*2 bytes in total.
+   while ( len > 0 )
+   {
+      size_t clen;
+
+      clen = (sizeof ctx->buf) - ptr;   // room left in buf, in bytes
+      if ( clen > len << 1 )
+         clen = len << 1;               // clamp to the remaining input for both lanes
+      memcpy( buf + ptr, data, clen );
+      data = (const unsigned char *)data + clen;
+      ptr += clen;
+      len -= (clen >> 1);               // bytes copied, converted back to per-lane bytes
+      if ( ptr == sizeof ctx->buf )     // buffer full: bump the counter and compress
+      {
+         if ( ( ctx->count0 = ctx->count0 + 1024 )  == 0 )   // +1024 per full buffer, carry on wrap
+         {
+             ctx->count1 = ctx->count1 + 1;
+             if ( ctx->count1 == 0 )
+             {
+                ctx->count2 = ctx->count2 + 1;
+                if ( ctx->count2 == 0 )
+                   ctx->count3 = ctx->count3 + 1;
+             }
+         }
+         c512_2way( ctx, buf );
+         ptr = 0;
+      }
+   }
+
+   uint32_t vp = ptr>>5;   // index of the first unused 32-byte vector in buf
+   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
+   // Count is misaligned to 16 bits and straddles 2 vectors.
+   // Use u32 overlay to stage then u16 to load buf.
+   union
+   {
+      uint32_t u32[4];
+      uint16_t u16[8];
+   } count;
+
+   count.u32[0] = ctx->count0 += (ptr << 2);  // ptr/2 * 8
+   count.u32[1] = ctx->count1;
+   count.u32[2] = ctx->count2;
+   count.u32[3] = ctx->count3;
+
+   if ( vp == 0 )    // empty buf, xevan.
+   {
+      casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 );  // 0x80 marker, presumably the padding byte
+      memset_zero_256( (__m256i*)buf + 1, 5 );   // zero vectors 1..5
+      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;  // only ctx is cleared; the staged count union keeps its values
+   }
+   else     // half full buf, everyone else.
+   {
+    casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );  // marker goes right after the data
+      memset_zero_256( (__m256i*)buf + vp, 6 - vp );  // NOTE(review): assumes vp <= 6 here, else 6 - vp wraps - confirm callers
+   }
+
+    casti_m256i( buf, 6 ) = m256_const1_128(   // count.u16[0] in the top 16-bit element, rest zero
+                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
+    casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(   // count.u16[1..7] low to high, 0x0200 on top
+                  0x0200,       count.u16[7], count.u16[6], count.u16[5],
+                  count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
+
+   c512_2way( ctx, buf);   // compress the final padded block
+
+   casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );   // output is the four state vectors h[0..3]
+   casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
+   casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
+   casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
+}
+   
 #endif // AVX2
--- a/algo/shavite/shavite-hash-2way.h
+++ b/algo/shavite/shavite-hash-2way.h
@@ -18,6 +18,8 @@ void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
 void shavite512_2way_close( shavite512_2way_context *ctx, void *dst );
 void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
 		                   const void *data, size_t len );
+void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
+                           const void *data, size_t len );

 #endif // AVX2

--- a/algo/shavite/shavite-hash-4way.c
+++ b/algo/shavite/shavite-hash-4way.c
@@ -396,4 +396,96 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
   casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
 }

+
+void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
+                           const void *data, size_t len )
+{  // One-shot hash: resets ctx (h, ptr, count0..3), absorbs data, pads, writes dst.
+    __m512i *h = (__m512i*)ctx->h;
+    __m128i *iv = (__m128i*)IV512;   // IV512 viewed as four 128-bit words
+
+   h[0] = m512_const1_128( iv[0] );  // presumably the same IV word in all four lanes - confirm in simd-utils
+   h[1] = m512_const1_128( iv[1] );
+   h[2] = m512_const1_128( iv[2] );
+   h[3] = m512_const1_128( iv[3] );
+
+   ctx->ptr    = 
+   ctx->count0 = 
+   ctx->count1 =
+   ctx->count2 =
+   ctx->count3 = 0;   // chained assignment: buffer position and all four counter words start at 0
+
+   unsigned char *buf = ctx->buf;
+   size_t         ptr = ctx->ptr;   // always 0 here, ctx->ptr was reset just above
+
+   // Process full blocks and load buf with the remainder.
+   // len counts bytes per lane; data supplies len*4 bytes in total.
+   while ( len > 0 )
+   {
+      size_t clen;
+
+      clen = (sizeof ctx->buf) - ptr;   // room left in buf, in bytes
+      if ( clen > len << 2 )
+         clen = len << 2;               // clamp to the remaining input for all four lanes
+      memcpy( buf + ptr, data, clen );
+      data = (const unsigned char *)data + clen;
+      ptr += clen;
+      len -= (clen >> 2);               // bytes copied, converted back to per-lane bytes
+      if ( ptr == sizeof ctx->buf )     // buffer full: bump the counter and compress
+      {
+         if ( ( ctx->count0 = ctx->count0 + 1024 )  == 0 )   // +1024 per full buffer, carry on wrap
+         {
+             ctx->count1 = ctx->count1 + 1;
+             if ( ctx->count1 == 0 )
+             {
+                ctx->count2 = ctx->count2 + 1;
+                if ( ctx->count2 == 0 )
+                   ctx->count3 = ctx->count3 + 1;
+             }
+         }
+         c512_4way( ctx, buf );
+         ptr = 0;
+      }
+   }
+
+   uint32_t vp = ptr>>6;   // index of the first unused 64-byte vector in buf
+   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
+   // Count is misaligned to 16 bits and straddles 2 vectors.
+   // Use u32 overlay to stage then u16 to load buf.
+   union
+   {
+      uint32_t u32[4];
+      uint16_t u16[8];
+   } count;
+
+   count.u32[0] = ctx->count0 += (ptr << 1);  // ptr/4 * 8
+   count.u32[1] = ctx->count1;
+   count.u32[2] = ctx->count2;
+   count.u32[3] = ctx->count3;
+
+   if ( vp == 0 )    // empty buf, xevan.
+   {
+      casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 );  // 0x80 marker, presumably the padding byte
+      memset_zero_512( (__m512i*)buf + 1, 5 );   // zero vectors 1..5
+      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;  // only ctx is cleared; the staged count union keeps its values
+   }
+   else     // half full buf, everyone else.
+   {
+    casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 );  // marker goes right after the data
+      memset_zero_512( (__m512i*)buf + vp, 6 - vp );  // NOTE(review): assumes vp <= 6 here, else 6 - vp wraps - confirm callers
+   }
+
+    casti_m512i( buf, 6 ) = m512_const1_128(   // count.u16[0] in the top 16-bit element, rest zero
+                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
+    casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(   // count.u16[1..7] low to high, 0x0200 on top
+                  0x0200,       count.u16[7], count.u16[6], count.u16[5],
+                  count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
+
+   c512_4way( ctx, buf);   // compress the final padded block
+
+   casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 );   // output is the four state vectors h[0..3]
+   casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 );
+   casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 );
+   casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
+}
+
+
 #endif // VAES
--- a/algo/shavite/shavite-hash-4way.h
+++ b/algo/shavite/shavite-hash-4way.h
@@ -18,6 +18,8 @@ void shavite512_4way_update( shavite512_4way_context *ctx, const void *data,
 void shavite512_4way_close( shavite512_4way_context *ctx, void *dst );
 void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
 		                   const void *data, size_t len );
+void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
+                           const void *data, size_t len );

 #endif // VAES

--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -1173,6 +1173,91 @@ int simd_4way_update_close( simd_4way_context *state, void *hashval,
  return 0;
 }

+int simd512_4way_full( simd_4way_context *state, void *hashval,
+                    const void *data, int datalen )
+{  // One-shot SIMD-512 over 4 lanes: init state, absorb datalen bytes per lane, finalize.
+  __m512i *A = (__m512i*)state->A;
+
+  state->hashbitlen = 512;
+  state->n_feistels = 8;
+  state->blocksize = 128*8;   // block size in bits
+  state->count = 0;           // bits absorbed so far, per lane
+
+  for ( int i = 0; i < 8; i++ )   // load the IV, four 32-bit words per state vector
+       A[i] = _mm512_set4_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
+                                 SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] );
+
+  int current, i;
+  int bs = state->blocksize;  // bits in one lane
+  int isshort = 1;            // final-compress flag for SIMD_4way_Compress; meaning is defined there
+  uint64_t l;
+  int databitlen = datalen * 8;   // datalen is in bytes per lane
+
+  current = state->count & (bs - 1);   // always 0 here, count was just reset
+
+  while ( databitlen > 0 )
+  {
+    if ( current == 0 && databitlen >= bs )
+    {
+      // We can hash the data directly from the input buffer.
+      SIMD_4way_Compress( state, data, 0 );
+      databitlen -= bs;
+      data += 4*( bs/8 );   // one block for each of the 4 lanes (void* arithmetic, GNU extension)
+      state->count += bs;
+    }
+    else
+    {
+      // Copy a chunk of data to the buffer
+      int len = bs - current;   // bits still free in the buffer, per lane
+      if ( databitlen < len )
+      {
+        memcpy( state->buffer + 4*( current/8 ), data, 4*( (databitlen)/8 ) );  // databitlen is a multiple of 8
+        state->count += databitlen;
+        break;   // partial block stays buffered for the padding step below
+      }
+      else
+      {
+        memcpy( state->buffer + 4*(current/8), data, 4*(len/8) );
+        state->count += len;
+        databitlen -= len;
+        data += 4*( len/8 );
+        current = 0;
+        SIMD_4way_Compress( state, state->buffer, 0 );
+      }
+    }
+  }
+
+  current = state->count & (state->blocksize - 1);   // bits left in a partial block
+
+  // If there is still some data in the buffer, hash it
+  if ( current )
+  {
+    current = current / 8;   // bits to bytes
+    memset( state->buffer + 4*current, 0, 4*( state->blocksize/8 - current) );  // zero-pad the rest of the block
+    SIMD_4way_Compress( state, state->buffer, 0 );
+  }
+
+  // Input the message length as the last block.
+  memset( state->buffer, 0, 4*( state->blocksize/8 ) );
+  l = state->count;
+  for ( i = 0; i < 8; i++ )   // store the 64-bit bit count, low byte first
+  {
+    state->buffer[ i    ] = l & 0xff;   // same value at byte offsets 0, 16, 32, 48:
+    state->buffer[ i+16 ] = l & 0xff;   // presumably one copy per lane of the
+    state->buffer[ i+32 ] = l & 0xff;   // 4x128 interleave - confirm against
+    state->buffer[ i+48 ] = l & 0xff;   // SIMD_4way_Compress
+    l >>= 8;
+  }
+  if ( state->count < 16384 )   // messages under 16384 bits use flag value 2
+    isshort = 2;
+
+  SIMD_4way_Compress( state, state->buffer, isshort );
+  memcpy( hashval, state->A, 4*( state->hashbitlen / 8 ) );   // 4 lanes x 64 bytes of output
+  return 0;   // always 0
+}
+
+
+
 #endif // AVX512

 ////////////////////////////////////
@@ -1929,4 +2014,90 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval,
  return 0;
 }

+int simd512_2way_full( simd_2way_context *state, void *hashval,
+                    const void *data, int datalen )
+{  // One-shot SIMD-512 over 2 lanes: init state, absorb datalen bytes per lane, finalize.
+  __m256i *A = (__m256i*)state->A;
+
+  state->hashbitlen = 512;
+  state->n_feistels = 8;
+  state->blocksize = 128*8;   // block size in bits
+  state->count = 0;           // bits absorbed so far, per lane
+
+  for ( int i = 0; i < 8; i++ )   // load the IV: the same four words in both 128-bit halves
+       A[i] = _mm256_set_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
+                                SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0],
+                                SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
+                                SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] );
+
+  int current, i;
+  int bs = state->blocksize;  // bits in one lane
+  int isshort = 1;            // final-compress flag for SIMD_2way_Compress; meaning is defined there
+  uint64_t l;
+  int databitlen = datalen * 8;   // datalen is in bytes per lane
+
+  current = state->count & (bs - 1);   // always 0 here, count was just reset
+
+  while ( databitlen > 0 )
+  {
+    if ( current == 0 && databitlen >= bs )
+    {
+      // We can hash the data directly from the input buffer.
+      SIMD_2way_Compress( state, data, 0 );
+
+      databitlen -= bs;
+      data += 2*( bs/8 );   // one block for each of the 2 lanes (void* arithmetic, GNU extension)
+      state->count += bs;
+    }
+    else
+    {
+      // Copy a chunk of data to the buffer
+      int len = bs - current;   // bits still free in the buffer, per lane
+      if ( databitlen < len )
+      {
+
+         memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );  // round bits up to whole bytes
+        state->count += databitlen;
+        break;   // partial block stays buffered for the padding step below
+      }
+      else
+      {
+        memcpy( state->buffer + 2*(current/8), data, 2*(len/8) );
+        state->count += len;
+        databitlen -= len;
+        data += 2*( len/8 );
+        current = 0;
+        SIMD_2way_Compress( state, state->buffer, 0 );
+      }
+    }
+  }
+
+  current = state->count & (state->blocksize - 1);   // bits left in a partial block
+
+  // If there is still some data in the buffer, hash it
+  if ( current )
+  {
+    current = ( current+7 ) / 8;   // bits to bytes, rounded up
+    memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) );  // zero-pad the rest of the block
+    SIMD_2way_Compress( state, state->buffer, 0 );
+  }
+
+  // Input the message length as the last block.
+  memset( state->buffer, 0, 2*( state->blocksize/8 ) );
+  l = state->count;
+  for ( i = 0; i < 8; i++ )   // store the 64-bit bit count, low byte first
+  {
+    state->buffer[ i    ] = l & 0xff;   // same value at byte offsets 0 and 16: presumably one
+    state->buffer[ i+16 ] = l & 0xff;   // copy per lane of the 2x128 interleave - confirm
+    l >>= 8;
+  }
+  if ( state->count < 16384 )   // messages under 16384 bits use flag value 2
+    isshort = 2;
+
+  SIMD_2way_Compress( state, state->buffer, isshort );
+  memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) );   // 2 lanes x 64 bytes of output
+  return 0;   // always 0
+}
+
+
 #endif
--- a/algo/simd/simd-hash-2way.h
+++ b/algo/simd/simd-hash-2way.h
@@ -26,6 +26,8 @@ int simd_4way_update( simd_4way_context *state, const void *data,
 int simd_4way_close( simd_4way_context *state, void *hashval );
 int simd_4way_update_close( simd_4way_context *state, void *hashval,
                            const void *data, int databitlen );
+int simd512_4way_full( simd_4way_context *state, void *hashval,
+                    const void *data, int datalen );

 #endif

@@ -45,5 +47,8 @@ int simd_2way_update( simd_2way_context *state, const void *data,
 int simd_2way_close( simd_2way_context *state, void *hashval );
 int simd_2way_update_close( simd_2way_context *state, void *hashval,
                            const void *data, int databitlen );
+int simd512_2way_full( simd_2way_context *state, void *hashval,
+                    const void *data, int datalen );
+
 #endif
 #endif
--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -45,18 +45,18 @@ extern "C"{
 #endif

 /*
-static const sph_u64 IV256[] = {
-   SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
-   SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
-   SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
-   SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
+static const uint64_t IV256[] = {
+   0xCCD044A12FDB3E13, 0xE83590301A79A9EB,
+   0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB,
+   0xEC06025E74DD7683, 0xE7A436CDC4746251,
+   0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13
 };

-static const sph_u64 IV512[] = {
-   SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
-   SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
-   SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
-   SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
+static const uint64_t IV512[] = {
+   0x4903ADFF749C51CE, 0x0D95DE399746DF03,
+   0x8FD1934127C79BCE, 0x9A255629FF352CB1,
+   0x5DB62599DF6CA7B0, 0xEABE394CA9D5C3F4,
+   0x991112C71A75B523, 0xAE18A40B660FCC33
 };
 */
   
@@ -372,7 +372,7 @@ do { \

 #define UBI_BIG_8WAY(etype, extra) \
 do { \
-  sph_u64 t0, t1, t2; \
+  uint64_t t0, t1, t2; \
  __m512i h8; \
  __m512i m0 =  buf[0]; \
  __m512i m1 =  buf[1]; \
@@ -391,8 +391,8 @@ do { \
  __m512i p5 = m5; \
  __m512i p6 = m6; \
  __m512i p7 = m7; \
-  t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
-  t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
+  t0 = (uint64_t)(bcount << 6) + (uint64_t)(extra); \
+  t1 = (bcount >> 58) + ((uint64_t)(etype) << 55); \
  TFBIG_KINIT_8WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
  TFBIG_8WAY_4e(0); \
  TFBIG_8WAY_4o(1); \
@@ -425,7 +425,7 @@ do { \

 #define DECL_STATE_BIG_8WAY \
  __m512i h0, h1, h2, h3, h4, h5, h6, h7; \
-  sph_u64 bcount;
+  uint64_t bcount;


 #endif // AVX512
@@ -488,7 +488,7 @@ do { \
 // scale buf offset by 4
 #define UBI_BIG_4WAY(etype, extra) \
 do { \
-  sph_u64 t0, t1, t2; \
+  uint64_t t0, t1, t2; \
  __m256i h8; \
  __m256i m0 =  buf[0]; \
  __m256i m1 =  buf[1]; \
@@ -507,8 +507,8 @@ do { \
  __m256i p5 = m5; \
  __m256i p6 = m6; \
  __m256i p7 = m7; \
-  t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
-  t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
+  t0 = (uint64_t)(bcount << 6) + (uint64_t)(extra); \
+  t1 = (bcount >> 58) + ((uint64_t)(etype) << 55); \
  TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
  TFBIG_4WAY_4e(0); \
  TFBIG_4WAY_4o(1); \
@@ -542,7 +542,7 @@ do { \

 #define DECL_STATE_BIG_4WAY \
  __m256i h0, h1, h2, h3, h4, h5, h6, h7; \
-  sph_u64 bcount;
+  uint64_t bcount;

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

--- a/algo/skein/skein-hash-4way.h
+++ b/algo/skein/skein-hash-4way.h
@@ -48,14 +48,8 @@ extern "C"{
 #endif

 #include <stddef.h>
-#include "algo/sha/sph_types.h"
 #include "simd-utils.h"

-// Output size in bits
-#define SPH_SIZE_skein256   256
-#define SPH_SIZE_skein512   512
-
-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 typedef struct
@@ -63,11 +57,11 @@ typedef struct
   __m512i buf[8];
   __m512i h0, h1, h2, h3, h4, h5, h6, h7;
   size_t ptr;
-   sph_u64 bcount;
-} sph_skein_8way_big_context __attribute__ ((aligned (128)));
+   uint64_t bcount;
+} skein_8way_big_context __attribute__ ((aligned (128)));

-typedef sph_skein_8way_big_context skein512_8way_context;
-typedef sph_skein_8way_big_context skein256_8way_context;
+typedef skein_8way_big_context skein512_8way_context;
+typedef skein_8way_big_context skein256_8way_context;

 void skein512_8way_init( skein512_8way_context *sc );
 void skein512_8way_update( void *cc, const void *data, size_t len );
@@ -84,21 +78,19 @@ typedef struct
   __m256i buf[8];
   __m256i h0, h1, h2, h3, h4, h5, h6, h7;
   size_t ptr;
-	sph_u64 bcount;
-} sph_skein_4way_big_context __attribute__ ((aligned (128)));
+	uint64_t bcount;
+} skein_4way_big_context __attribute__ ((aligned (128)));

-typedef sph_skein_4way_big_context skein512_4way_context;
-typedef sph_skein_4way_big_context skein256_4way_context;
+typedef skein_4way_big_context skein512_4way_context;
+typedef skein_4way_big_context skein256_4way_context;

 void skein512_4way_init( skein512_4way_context *sc );
 void skein512_4way_update( void *cc, const void *data, size_t len );
 void skein512_4way_close( void *cc, void *dst );
-//#define skein512_4way skein512_4way_update

 void skein256_4way_init( skein256_4way_context *sc );
 void skein256_4way_update( void *cc, const void *data, size_t len );
 void skein256_4way_close( void *cc, void *dst );
-//#define skein256_4way skein256_4way_update

 #ifdef __cplusplus
 }
--- a/algo/x13/x13bcd-4way.c
+++ b/algo/x13/x13bcd-4way.c
@@ -527,7 +527,7 @@ int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
     mm256_bswap32_intrlv80_4x64( vdata, pdata );

     blake512_4way_init( &x13bcd_ctx_mid );
-     blake512_4way( &x13bcd_ctx_mid, vdata, 64 );
+     blake512_4way_update( &x13bcd_ctx_mid, vdata, 64 );
     do
     {
        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
--- a/algo/x13/x13sm3-4way.c
+++ b/algo/x13/x13sm3-4way.c
@@ -227,7 +227,7 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
     mm256_bswap32_intrlv80_4x64( vdata, pdata );

     blake512_4way_init( &x13sm3_ctx_mid );
-     blake512_4way( &x13sm3_ctx_mid, vdata, 64 );
+     blake512_4way_update( &x13sm3_ctx_mid, vdata, 64 );

     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -65,6 +65,7 @@ union _x16r_8way_context_overlay

 typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;

+
 void x16r_8way_hash( void* output, const void* input )
 {
   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
@@ -98,18 +99,16 @@ void x16r_8way_hash( void* output, const void* input )
      switch ( algo )
      {
         case BLAKE:
-            blake512_8way_init( &ctx.blake );
            if ( i == 0 )
-               blake512_8way_update( &ctx.blake, input, size );
+               blake512_8way_full( &ctx.blake, vhash, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
                            size<<3 );
-               blake512_8way_update( &ctx.blake, vhash, size );
+               blake512_8way_full( &ctx.blake, vhash, vhash, size );
            }
-            blake512_8way_close( &ctx.blake, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5,
+                                 hash6, hash7, vhash );
         break;
         case BMW:
            bmw512_8way_init( &ctx.bmw );
@@ -128,40 +127,22 @@ void x16r_8way_hash( void* output, const void* input )
         case GROESTL:
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            groestl512_4way_init( &ctx.groestl, 64 );
-            groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 );
+            groestl512_4way_full( &ctx.groestl, vhash, vhash, size );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            groestl512_4way_init( &ctx.groestl, 64 );
-            groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 );
+            groestl512_4way_full( &ctx.groestl, vhash, vhash, size );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                                (const char*)in0, size<<3 );
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                                (const char*)in1, size<<3 );
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                                (const char*)in2, size<<3 );
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                                (const char*)in3, size<<3 );
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash4,
-                                                (const char*)in4, size<<3 );
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash5,
-                                                (const char*)in5, size<<3 );
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash6,
-                                                (const char*)in6, size<<3 );
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash7,
-                                                (const char*)in7, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash5, (char*)in5, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 );
 #endif
-            break;
+         break;
         case SKEIN:
            skein512_8way_init( &ctx.skein );
            if ( i == 0 )
@@ -206,33 +187,27 @@ void x16r_8way_hash( void* output, const void* input )
         break;
         case LUFFA:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            luffa_4way_init( &ctx.luffa, 512 );
-            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
+            luffa512_4way_full( &ctx.luffa, vhash, vhash, size );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            luffa_4way_init( &ctx.luffa, 512 );
-            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
+            luffa512_4way_full( &ctx.luffa, vhash, vhash, size );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case CUBEHASH:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_full( &ctx.cube, vhash, 512, vhash, size );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_full( &ctx.cube, vhash, 512, vhash, size );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case SHAVITE:
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_full( &ctx.shavite, vhash, vhash, size );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_full( &ctx.shavite, vhash, vhash, size );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
            sph_shavite512_init( &ctx.shavite );
@@ -260,54 +235,42 @@ void x16r_8way_hash( void* output, const void* input )
            sph_shavite512( &ctx.shavite, in7, size );
            sph_shavite512_close( &ctx.shavite, hash7 );
 #endif
-            break;
+         break;
         case SIMD:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            simd_4way_init( &ctx.simd, 512 );
-            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            simd512_4way_full( &ctx.simd, vhash, vhash, size );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            simd_4way_init( &ctx.simd, 512 );
-            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            simd512_4way_full( &ctx.simd, vhash, vhash, size );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case ECHO:
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_full( &ctx.echo, vhash, 512, vhash, size );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_full( &ctx.echo, vhash, 512, vhash, size );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
-            init_echo( &ctx.echo, 512 );
-            update_final_echo ( &ctx.echo, (BitSequence *)hash0,
-                               (const BitSequence*)in0, size<<3 );
-            init_echo( &ctx.echo, 512 );
-            update_final_echo ( &ctx.echo, (BitSequence *)hash1,
-                               (const BitSequence*)in1, size<<3 );
-            init_echo( &ctx.echo, 512 );
-            update_final_echo ( &ctx.echo, (BitSequence *)hash2,
-                               (const BitSequence*)in2, size<<3 );
-            init_echo( &ctx.echo, 512 );
-            update_final_echo ( &ctx.echo, (BitSequence *)hash3,
-                               (const BitSequence*)in3, size<<3 );
-            init_echo( &ctx.echo, 512 );
-            update_final_echo ( &ctx.echo, (BitSequence *)hash4,
-                               (const BitSequence*)in4, size<<3 );
-            init_echo( &ctx.echo, 512 );
-            update_final_echo ( &ctx.echo, (BitSequence *)hash5,
-                               (const BitSequence*)in5, size<<3 );
-            init_echo( &ctx.echo, 512 );
-            update_final_echo ( &ctx.echo, (BitSequence *)hash6,
-                               (const BitSequence*)in6, size<<3 );
-            init_echo( &ctx.echo, 512 );
-            update_final_echo ( &ctx.echo, (BitSequence *)hash7,
-                               (const BitSequence*)in7, size<<3 );
+            echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                              (const BitSequence *)in0, size );
+            echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                              (const BitSequence *)in1, size );
+            echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                              (const BitSequence *)in2, size );
+            echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                              (const BitSequence *)in3, size );
+            echo_full( &ctx.echo, (BitSequence *)hash4, 512,
+                              (const BitSequence *)in4, size );
+            echo_full( &ctx.echo, (BitSequence *)hash5, 512,
+                              (const BitSequence *)in5, size );
+            echo_full( &ctx.echo, (BitSequence *)hash6, 512,
+                              (const BitSequence *)in6, size );
+            echo_full( &ctx.echo, (BitSequence *)hash7, 512,
+                              (const BitSequence *)in7, size );
 #endif
-             break;
+         break;
         case HAMSI:
             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
@@ -317,7 +280,7 @@ void x16r_8way_hash( void* output, const void* input )
             hamsi512_8way_close( &ctx.hamsi, vhash );
             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
-             break;
+         break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in0, size );
@@ -380,13 +343,18 @@ void x16r_8way_hash( void* output, const void* input )
             sph_whirlpool_close( &ctx.whirlpool, hash7 );
         break;
         case SHA_512:
-             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
             sha512_8way_init( &ctx.sha512 );
-             sha512_8way_update( &ctx.sha512, vhash, size );
+             if ( i == 0 )
+                sha512_8way_update( &ctx.sha512, input, size );
+             else
+             {
+                intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                             size<<3 );
+                sha512_8way_update( &ctx.sha512, vhash, size );
+             }
             sha512_8way_close( &ctx.sha512, vhash );
             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
+                               hash7, vhash );
         break;
      }
      size = 64;
@@ -431,7 +399,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+              applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime );
   }

   do
@@ -505,15 +473,13 @@ void x16r_4way_hash( void* output, const void* input )
      switch ( algo )
      {
         case BLAKE:
-            blake512_4way_init( &ctx.blake );
            if ( i == 0 )
-               blake512_4way_update( &ctx.blake, input, size );
+               blake512_4way_full( &ctx.blake, vhash, input, size );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               blake512_4way_update( &ctx.blake, vhash, size );
+               blake512_4way_full( &ctx.blake, vhash, vhash, size );
            }
-            blake512_4way_close( &ctx.blake, vhash );
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case BMW:
@@ -529,18 +495,10 @@ void x16r_4way_hash( void* output, const void* input )
            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case GROESTL:
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                                 (const char*)in0, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                                 (const char*)in1, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                                 (const char*)in2, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                                 (const char*)in3, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
         break;
         case SKEIN:
            skein512_4way_init( &ctx.skein );
@@ -580,12 +538,10 @@ void x16r_4way_hash( void* output, const void* input )
         break;
         case LUFFA:
            intrlv_2x128( vhash, in0, in1, size<<3 );
-            luffa_2way_init( &ctx.luffa, 512 );
-            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
+            luffa512_2way_full( &ctx.luffa, vhash, vhash, size );
            dintrlv_2x128_512( hash0, hash1, vhash );
            intrlv_2x128( vhash, in2, in3, size<<3 );
-            luffa_2way_init( &ctx.luffa, 512 );
-            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
+            luffa512_2way_full( &ctx.luffa, vhash, vhash, size );
            dintrlv_2x128_512( hash2, hash3, vhash );
         break;
         case CUBEHASH:
@@ -618,27 +574,21 @@ void x16r_4way_hash( void* output, const void* input )
         break;
         case SIMD:
            intrlv_2x128( vhash, in0, in1, size<<3 );
-            simd_2way_init( &ctx.simd, 512 );
-            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            simd512_2way_full( &ctx.simd, vhash, vhash, size );
            dintrlv_2x128_512( hash0, hash1, vhash );
            intrlv_2x128( vhash, in2, in3, size<<3 );
-            simd_2way_init( &ctx.simd, 512 );
-            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            simd512_2way_full( &ctx.simd, vhash, vhash, size );
            dintrlv_2x128_512( hash2, hash3, vhash );
         break;
         case ECHO:
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
-                                (const BitSequence*)in0, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
-                                (const BitSequence*)in1, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
-                                (const BitSequence*)in2, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
-                                (const BitSequence*)in3, size<<3 );
+            echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                              (const BitSequence *)in0, size );
+            echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                              (const BitSequence *)in1, size );
+            echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                              (const BitSequence *)in2, size );
+            echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                              (const BitSequence *)in3, size );
         break;
         case HAMSI:
             intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -727,7 +677,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+              applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime );
   }

   do
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -39,9 +39,13 @@
 #include <openssl/sha.h>
 #endif

+#if defined(X21S_8WAY) || defined(X21S_4WAY)
+
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

+#endif
+
 #if defined (X21S_8WAY)

 static __thread uint64_t* x21s_8way_matrix;
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -72,27 +72,19 @@ void x17_8way_hash( void *state, const void *input )
     uint64_t hash7[8] __attribute__ ((aligned (64)));
     x17_8way_context_overlay ctx;

-     // 1 Blake
-     blake512_8way_init( &ctx.blake );
-     blake512_8way_update( &ctx.blake, input, 80 );
-     blake512_8way_close( &ctx.blake, vhash );
+     blake512_8way_full( &ctx.blake, vhash, input, 80 );

-     // 2 Bmw
     bmw512_8way_init( &ctx.bmw );
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );

-     // 3 Groestl
-
 #if defined(__VAES__)

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
-
+     groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
+     groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
+     
     rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
     
 #else
@@ -100,65 +92,44 @@ void x17_8way_hash( void *state, const void *input )
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );

-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7 );

 #endif

-     // 4 Skein parallel 4 way 64 bit 
     skein512_8way_init( &ctx.skein );
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );

-     // 5 JH
     jh512_8way_init( &ctx.jh );
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );

-     // 6 Keccak
     keccak512_8way_init( &ctx.keccak );
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-     // 7 Luffa  
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
+     luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
+     luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );

-     // 8 Cubehash
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
-
-     // 9 Shavite
+     cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
+     cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );

 #if defined(__VAES__)

-     shavite512_4way_init( &ctx.shavite );
-     shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
-     shavite512_4way_init( &ctx.shavite );
-     shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
+     shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
+     shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 );

 #else

@@ -195,20 +166,13 @@ void x17_8way_hash( void *state, const void *input )

 #endif

-     // 10 Simd
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
-
-     // 11 Echo
+     simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 );
+     simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );

 #if defined(__VAES__)

-     echo_4way_init( &ctx.echo, 512 );
-     echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
-     echo_4way_init( &ctx.echo, 512 );
-     echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
+     echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
+     echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 );

     rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

@@ -217,36 +181,27 @@ void x17_8way_hash( void *state, const void *input )
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                            (const BitSequence *) hash0, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                            (const BitSequence *) hash1, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                            (const BitSequence *) hash2, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                            (const BitSequence *) hash3, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash4,
-                            (const BitSequence *) hash4, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash5,
-                            (const BitSequence *) hash5, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash6,
-                            (const BitSequence *) hash6, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash7,
-                            (const BitSequence *) hash7, 512 );
-
+     echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                     (const BitSequence *)hash0, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                     (const BitSequence *)hash1, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                     (const BitSequence *)hash2, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                     (const BitSequence *)hash3, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash4, 512,
+                     (const BitSequence *)hash4, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash5, 512,
+                     (const BitSequence *)hash5, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash6, 512,
+                     (const BitSequence *)hash6, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash7, 512,
+                     (const BitSequence *)hash7, 64 );
+     
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
-#endif

-     // 12 Hamsi
+#endif

     hamsi512_8way_init( &ctx.hamsi );
     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
@@ -255,7 +210,6 @@ void x17_8way_hash( void *state, const void *input )
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );

-     // 13 Fugue serial
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
@@ -281,7 +235,6 @@ void x17_8way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash7, 64 );
     sph_fugue512_close( &ctx.fugue, hash7 );

-     // 14 Shabal, parallel 8 way 32 bit
     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );

@@ -292,7 +245,6 @@ void x17_8way_hash( void *state, const void *input )
     dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );

-     // 15 Whirlpool serial
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
@@ -318,7 +270,6 @@ void x17_8way_hash( void *state, const void *input )
     sph_whirlpool( &ctx.whirlpool, hash7, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash7 );

-     // 16 SHA512 parallel 64 bit 
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );

@@ -326,7 +277,6 @@ void x17_8way_hash( void *state, const void *input )
     sha512_8way_update( &ctx.sha512, vhash, 64 );
     sha512_8way_close( &ctx.sha512, vhash );

-     // 17 Haval parallel 32 bit
     rintrlv_8x64_8x32( vhashA, vhash,  512 );

     haval256_5_8way_init( &ctx.haval );
@@ -401,7 +351,7 @@ typedef union _x17_4way_context_overlay x17_4way_context_overlay;

 void x17_4way_hash( void *state, const void *input )
 {
-     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -410,91 +360,59 @@ void x17_4way_hash( void *state, const void *input )
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     x17_4way_context_overlay ctx;

-     // 1 Blake parallel 4 way 64 bit
-     blake512_4way_init( &ctx.blake );
-     blake512_4way_update( &ctx.blake, input, 80 );
-     blake512_4way_close( &ctx.blake, vhash );
+     blake512_4way_full( &ctx.blake, vhash, input, 80 );

-     // 2 Bmw
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way_update( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );

-     // Serialize
     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

-     // 3 Groestl
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     // Parallellize
     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

-     // 4 Skein parallel 4 way 64 bit 
     skein512_4way_init( &ctx.skein );
     skein512_4way_update( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );

-     // 5 JH
     jh512_4way_init( &ctx.jh );
     jh512_4way_update( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );

-     // 6 Keccak
     keccak512_4way_init( &ctx.keccak );
     keccak512_4way_update( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );

-     // 7 Luffa  parallel 2 way 128 bit
     rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

-     luffa_2way_init( &ctx.luffa, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
-     luffa_2way_init( &ctx.luffa, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
+     luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
+     luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );

-     // 8 Cubehash
-     cube_2way_init( &ctx.cube, 512, 16, 32 );
-     cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
-     cube_2way_init( &ctx.cube, 512, 16, 32 );
-     cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
+     cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
+     cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );

-     // 9 Shavite
-     shavite512_2way_init( &ctx.shavite );
-     shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
-     shavite512_2way_init( &ctx.shavite );
-     shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
+     shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
+     shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );

-     // 10 Simd
-     simd_2way_init( &ctx.simd, 512 );
-     simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
-     simd_2way_init( &ctx.simd, 512 );
-     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
+     simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
+     simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );

     dintrlv_2x128_512( hash0, hash1, vhashA );
     dintrlv_2x128_512( hash2, hash3, vhashB );

-     // 11 Echo serial
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                       (const BitSequence *) hash0, 512 );
-     init_echo( &ctx.echo, 512 );     
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                       (const BitSequence *) hash1, 512 );
-     init_echo( &ctx.echo, 512 );     
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                       (const BitSequence *) hash2, 512 );
-     init_echo( &ctx.echo, 512 );     
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                       (const BitSequence *) hash3, 512 );
+     echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                     (const BitSequence *)hash0, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                     (const BitSequence *)hash1, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                     (const BitSequence *)hash2, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                     (const BitSequence *)hash3, 64 );

-     // 12 Hamsi parallel 4 way 64 bit
     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     hamsi512_4way_init( &ctx.hamsi );
@@ -503,7 +421,6 @@ void x17_4way_hash( void *state, const void *input )

     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );

-     // 13 Fugue serial
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
@@ -517,7 +434,6 @@ void x17_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     // 14 Shabal, parallel 4 way 32 bit
     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );

     shabal512_4way_init( &ctx.shabal );
@@ -526,7 +442,6 @@ void x17_4way_hash( void *state, const void *input )

     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
       
-     // 15 Whirlpool serial
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
@@ -540,14 +455,12 @@ void x17_4way_hash( void *state, const void *input )
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );

-     // 16 SHA512 parallel 64 bit 
     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

     sha512_4way_init( &ctx.sha512 );
     sha512_4way_update( &ctx.sha512, vhash, 64 );
     sha512_4way_close( &ctx.sha512, vhash );     

-     // 17 Haval parallel 32 bit
     rintrlv_4x64_4x32( vhashB, vhash,  512 );

     haval256_5_4way_init( &ctx.haval );
@@ -558,8 +471,8 @@ void x17_4way_hash( void *state, const void *input )
 int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[16*4] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t hash[16*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
@@ -570,27 +483,30 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const uint32_t Htarg = ptarget[7];
+   const bool bench = opt_benchmark;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32(
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
-      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      x17_4way_hash( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if unlikely( ( hash7[ lane ] <= Htarg ) )
-      {
+      if ( unlikely( hash7[ lane ] <= Htarg && !bench ) )
+      {  
         extr_lane_4x32( lane_hash, hash, lane, 256 );
-         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         if ( ( hash7[ lane ] < Htarg ) || valid_hash( lane_hash, ptarget ) )
         {
-            pdata[19] = n + lane;
+            pdata[19] = bswap_32( n + lane );
            submit_lane_solution( work, lane_hash, mythr, lane );
-         }
+         }            
      }
+      *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
      n += 4;
-   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
-
+   } while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -11,9 +11,8 @@
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
-#include "algo/shavite/shavite-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/shavite/shavite-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -74,9 +73,7 @@ void xevan_8way_hash( void *output, const void *input )
     const int dataLen = 128;
     xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));

-     blake512_8way_init( &ctx.blake );
-     blake512_8way_update( &ctx.blake, input, 80 );
-     blake512_8way_close( &ctx.blake, vhash );
+     blake512_8way_full( &ctx.blake, vhash, input, 80 );
     memset( &vhash[8<<3], 0, 64<<3 );

     bmw512_8way_init( &ctx.bmw );
@@ -87,10 +84,8 @@ void xevan_8way_hash( void *output, const void *input )

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );

-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 );
-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 );
+     groestl512_4way_full( &ctx.groestl, vhashA, vhashA, dataLen );
+     groestl512_4way_full( &ctx.groestl, vhashB, vhashB, dataLen );

     rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 );

@@ -99,30 +94,14 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash, dataLen<<3 );

-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
-                               dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, dataLen<<3 );

     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7, dataLen<<3 );
@@ -143,22 +122,16 @@ void xevan_8way_hash( void *output, const void *input )

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );

-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
+     luffa512_4way_full( &ctx.luffa, vhashA, vhashA, dataLen );
+     luffa512_4way_full( &ctx.luffa, vhashB, vhashB, dataLen );

-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
+     cube_4way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
+     cube_4way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );

 #if defined(__VAES__)

-     shavite512_4way_init( &ctx.shavite );
-     shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
-     shavite512_4way_init( &ctx.shavite );
-     shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
+     shavite512_4way_full( &ctx.shavite, vhashA, vhashA, dataLen );
+     shavite512_4way_full( &ctx.shavite, vhashB, vhashB, dataLen );

 #else

@@ -195,17 +168,13 @@ void xevan_8way_hash( void *output, const void *input )

 #endif

-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
+     simd512_4way_full( &ctx.simd, vhashA, vhashA, dataLen );
+     simd512_4way_full( &ctx.simd, vhashB, vhashB, dataLen );

 #if defined(__VAES__)

-     echo_4way_init( &ctx.echo, 512 );
-     echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 );
-     echo_4way_init( &ctx.echo, 512 );
-     echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 );
+     echo_4way_full( &ctx.echo, vhashA, 512, vhashA, dataLen );
+     echo_4way_full( &ctx.echo, vhashB, 512, vhashB, dataLen );

     rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 );

@@ -214,31 +183,23 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );

-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                       (const BitSequence *) hash0, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                       (const BitSequence *) hash1, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                       (const BitSequence *) hash2, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                       (const BitSequence *) hash3, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash4,
-                       (const BitSequence *) hash4, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash5,
-                       (const BitSequence *) hash5, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash6,
-                       (const BitSequence *) hash6, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash7,
-                       (const BitSequence *) hash7, dataLen<<3 );
-
+     echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                     (const BitSequence *)hash0, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                     (const BitSequence *)hash1, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                     (const BitSequence *)hash2, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                     (const BitSequence *)hash3, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash4, 512,
+                     (const BitSequence *)hash4, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash5, 512,
+                     (const BitSequence *)hash5, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash6, 512,
+                     (const BitSequence *)hash6, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash7, 512,
+                     (const BitSequence *)hash7, dataLen );
+     
     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7, dataLen<<3 );

@@ -328,9 +289,7 @@ void xevan_8way_hash( void *output, const void *input )

     memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );

-     blake512_8way_init( &ctx.blake );
-     blake512_8way_update( &ctx.blake, vhash, dataLen );
-     blake512_8way_close(&ctx.blake, vhash);
+     blake512_8way_full( &ctx.blake, vhash, vhash, dataLen );

     bmw512_8way_init( &ctx.bmw );
     bmw512_8way_update( &ctx.bmw, vhash, dataLen );
@@ -340,10 +299,8 @@ void xevan_8way_hash( void *output, const void *input )

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );

-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 );
-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 );
+     groestl512_4way_full( &ctx.groestl, vhashA, vhashA, dataLen );
+     groestl512_4way_full( &ctx.groestl, vhashB, vhashB, dataLen );

     rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 );

@@ -352,30 +309,14 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash, dataLen<<3 );

-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
-                               dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, dataLen<<3 );

     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7, dataLen<<3 );
@@ -396,22 +337,16 @@ void xevan_8way_hash( void *output, const void *input )

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );

-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
+     luffa512_4way_full( &ctx.luffa, vhashA, vhashA, dataLen );
+     luffa512_4way_full( &ctx.luffa, vhashB, vhashB, dataLen );

-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
+     cube_4way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
+     cube_4way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );

 #if defined(__VAES__)

-     shavite512_4way_init( &ctx.shavite );
-     shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
-     shavite512_4way_init( &ctx.shavite );
-     shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
+     shavite512_4way_full( &ctx.shavite, vhashA, vhashA, dataLen );
+     shavite512_4way_full( &ctx.shavite, vhashB, vhashB, dataLen );

 #else

@@ -448,17 +383,13 @@ void xevan_8way_hash( void *output, const void *input )

 #endif

-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
+     simd512_4way_full( &ctx.simd, vhashA, vhashA, dataLen );
+     simd512_4way_full( &ctx.simd, vhashB, vhashB, dataLen );

 #if defined(__VAES__)

-     echo_4way_init( &ctx.echo, 512 );
-     echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 );
-     echo_4way_init( &ctx.echo, 512 );
-     echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 );
+     echo_4way_full( &ctx.echo, vhashA, 512, vhashA, dataLen );
+     echo_4way_full( &ctx.echo, vhashB, 512, vhashB, dataLen );

     rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 );

@@ -467,30 +398,22 @@ void xevan_8way_hash( void *output, const void *input )
     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );

-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                       (const BitSequence *) hash0, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                       (const BitSequence *) hash1, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                       (const BitSequence *) hash2, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                       (const BitSequence *) hash3, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash4,
-                       (const BitSequence *) hash4, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash5,
-                       (const BitSequence *) hash5, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash6,
-                       (const BitSequence *) hash6, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash7,
-                       (const BitSequence *) hash7, dataLen<<3 );
+     echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                     (const BitSequence *)hash0, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                     (const BitSequence *)hash1, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                     (const BitSequence *)hash2, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                     (const BitSequence *)hash3, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash4, 512,
+                     (const BitSequence *)hash4, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash5, 512,
+                     (const BitSequence *)hash5, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash6, 512,
+                     (const BitSequence *)hash6, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash7, 512,
+                     (const BitSequence *)hash7, dataLen );

     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7, dataLen<<3 );
@@ -657,9 +580,7 @@ void xevan_4way_hash( void *output, const void *input )

     // parallel 4 way

-     blake512_4way_init( &ctx.blake );
-     blake512_4way_update( &ctx.blake, input, 80 );
-     blake512_4way_close(&ctx.blake, vhash);
+     blake512_4way_full( &ctx.blake, vhash, input, 80 );
     memset( &vhash[8<<2], 0, 64<<2 );

     bmw512_4way_init( &ctx.bmw );
@@ -669,18 +590,10 @@ void xevan_4way_hash( void *output, const void *input )
     // Serial
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
-                               dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );

     // Parallel 4way
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -699,15 +612,11 @@ void xevan_4way_hash( void *output, const void *input )

     rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );

-     luffa_2way_init( &ctx.luffa, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
-     luffa_2way_init( &ctx.luffa, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
+     luffa512_2way_full( &ctx.luffa, vhashA, vhashA, dataLen );
+     luffa512_2way_full( &ctx.luffa, vhashB, vhashB, dataLen );

-     cube_2way_init( &ctx.cube, 512, 16, 32 );
-     cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
-     cube_2way_init( &ctx.cube, 512, 16, 32 );
-     cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
+     cube_2way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
+     cube_2way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );

     shavite512_2way_init( &ctx.shavite );
     shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
@@ -722,18 +631,15 @@ void xevan_4way_hash( void *output, const void *input )
     dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
     dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );

-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                       (const BitSequence *) hash0, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                       (const BitSequence *) hash1, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                       (const BitSequence *) hash2, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                       (const BitSequence *) hash3, dataLen<<3 );
+     echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                     (const BitSequence *)hash0, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                     (const BitSequence *)hash1, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                     (const BitSequence *)hash2, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                     (const BitSequence *)hash3, dataLen );
+
     // Parallel
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

@@ -805,18 +711,10 @@ void xevan_4way_hash( void *output, const void *input )

     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
-                               dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
+     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );

     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

@@ -834,15 +732,11 @@ void xevan_4way_hash( void *output, const void *input )

     rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );

-     luffa_2way_init( &ctx.luffa, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
-     luffa_2way_init( &ctx.luffa, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
+     luffa512_2way_full( &ctx.luffa, vhashA, vhashA, dataLen );
+     luffa512_2way_full( &ctx.luffa, vhashB, vhashB, dataLen );

-     cube_2way_init( &ctx.cube, 512, 16, 32 );
-     cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
-     cube_2way_init( &ctx.cube, 512, 16, 32 );
-     cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
+     cube_2way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
+     cube_2way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );

     shavite512_2way_init( &ctx.shavite );
     shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
@@ -857,18 +751,14 @@ void xevan_4way_hash( void *output, const void *input )
     dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
     dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );

-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                       (const BitSequence *) hash0, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                       (const BitSequence *) hash1, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                       (const BitSequence *) hash2, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                       (const BitSequence *) hash3, dataLen<<3 );
+     echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                     (const BitSequence *)hash0, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                     (const BitSequence *)hash1, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                     (const BitSequence *)hash2, dataLen );
+     echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                     (const BitSequence *)hash3, dataLen );

     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

@@ -934,7 +824,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned

   const uint32_t Htarg = ptarget[7];
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -167,10 +167,10 @@ void x22i_8way_hash( void *output, const void *input )

 #if defined(__VAES__)

-     shavite512_4way_init( &ctx.shavite );
-     shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
-     shavite512_4way_init( &ctx.shavite );
-     shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
+   shavite512_4way_init( &ctx.shavite );
+   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
+   shavite512_4way_init( &ctx.shavite );
+   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

 #else

@@ -214,12 +214,12 @@ void x22i_8way_hash( void *output, const void *input )

 #if defined(__VAES__)

-     echo_4way_init( &ctx.echo, 512 );
-     echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
-     echo_4way_init( &ctx.echo, 512 );
-     echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
+   echo_4way_init( &ctx.echo, 512 );
+   echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
+   echo_4way_init( &ctx.echo, 512 );
+   echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );

-     rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
+   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

 #else

--- a/algo/yespower/yespower-gate.c
+++ b/algo/yespower/yespower-gate.c
@@ -135,7 +135,7 @@ bool register_yespower_algo( algo_gate_t* gate )
  if ( yespower_params.pers )
     applog( LOG_NOTICE,"Key= \"%s\"\n", yespower_params.pers );

-  gate->optimizations = SSE2_OPT;
+  gate->optimizations = SSE2_OPT | SHA_OPT;
  gate->scanhash      = (void*)&scanhash_yespower;
  gate->hash          = (void*)&yespower_hash;
  opt_target_factor = 65536.0;
@@ -149,7 +149,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
  yespower_params.r       = 16;
  yespower_params.pers    = NULL;
  yespower_params.perslen = 0;
-  gate->optimizations = SSE2_OPT;
+  gate->optimizations = SSE2_OPT | SHA_OPT;
  gate->scanhash      = (void*)&scanhash_yespower;
  gate->hash          = (void*)&yespower_hash;
  opt_target_factor = 65536.0;
@@ -223,7 +223,7 @@ bool register_power2b_algo( algo_gate_t* gate )
  applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers );
  applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen );

-  gate->optimizations = SSE2_OPT;
+  gate->optimizations = SSE2_OPT | SHA_OPT;
  gate->scanhash      = (void*)&scanhash_yespower_b2b;
  gate->hash          = (void*)&yespower_b2b_hash;
  opt_target_factor = 65536.0;
--- a/algo/yespower/yespower-opt.c
+++ b/algo/yespower/yespower-opt.c
@@ -529,7 +529,7 @@ static volatile uint64_t Smask2var = Smask2;
 /* 64-bit without AVX.  This relies on out-of-order execution and register
 * renaming.  It may actually be fastest on CPUs with AVX(2) as well - e.g.,
 * it runs great on Haswell. */
-#warning "Note: using x86-64 inline assembly for pwxform.  That's great."
+//#warning "Note: using x86-64 inline assembly for pwxform.  That's great."
 #undef MAYBE_MEMORY_BARRIER
 #define MAYBE_MEMORY_BARRIER \
 	__asm__("" : : : "memory");
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -44,23 +44,23 @@ mv cpuminer.exe cpuminer-aes-sse42.exe
 strip -s cpuminer
 mv cpuminer cpuminer-aes-sse42

-make clean || echo clean
-rm -f config.status
-CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
-make -j 16
-strip -s cpuminer.exe
-mv cpuminer.exe cpuminer-sse42.exe
-strip -s cpuminer
-mv cpuminer cpuminer-sse42
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
+#make -j 16
+#strip -s cpuminer.exe
+#mv cpuminer.exe cpuminer-sse42.exe
+#strip -s cpuminer
+#mv cpuminer cpuminer-sse42

-make clean || echo clean
-rm -f config.status
-CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
-make -j 16
-strip -s cpuminer.exe
-mv cpuminer.exe cpuminer-ssse3.exe
-strip -s cpuminer
-mv cpuminer cpuminer-ssse3
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
+#make -j 16
+#strip -s cpuminer.exe
+#mv cpuminer.exe cpuminer-ssse3.exe
+#strip -s cpuminer
+#mv cpuminer cpuminer-ssse3

 make clean || echo clean
 rm -f config.status
--- a/clean-all.sh
+++ b/clean-all.sh
@@ -3,8 +3,8 @@
 # imake clean and rm all the targetted executables.
 # tips to users.

-rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen  > /dev/null
+rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen  > /dev/null

-rm cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-aes-avx.exe cpuminer-aes-sse42.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-sse2.exe cpuminer-zen.exe  > /dev/null
+rm cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-aes-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe  > /dev/null

 make distclean > /dev/null
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.3.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.6.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.11.3'
-PACKAGE_STRING='cpuminer-opt 3.11.3'
+PACKAGE_VERSION='3.11.6'
+PACKAGE_STRING='cpuminer-opt 3.11.6'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.11.3 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.11.6 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.11.3:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.11.6:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.11.3
+cpuminer-opt configure 3.11.6
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.11.3, which was
+It was created by cpuminer-opt $as_me 3.11.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.11.3'
+ VERSION='3.11.6'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.11.3, which was
+This file was extended by cpuminer-opt $as_me 3.11.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.11.3
+cpuminer-opt config.status 3.11.6
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.11.3])
+AC_INIT([cpuminer-opt], [3.11.6])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -157,6 +157,7 @@ bool opt_hash_meter = false;
 uint32_t submitted_share_count= 0;
 uint32_t accepted_share_count = 0;
 uint32_t rejected_share_count = 0;
+uint32_t stale_share_count = 0;
 uint32_t solved_block_count = 0;
 double *thr_hashrates;
 double global_hashrate = 0;
@@ -819,20 +820,15 @@ out:
 // returns the unit prefix and the hashrate appropriately scaled.
 void scale_hash_for_display ( double* hashrate, char* prefix )
 {
-     if ( *hashrate < 1e4 )            //  0  H/s to 9999  h/s
-        *prefix =  0;
-     else if ( *hashrate < 1e7 )       // 10 kH/s to 9999 kh/s
-     {  *prefix = 'k';  *hashrate /= 1e3;   }
-     else if ( *hashrate < 1e10 )      // 10 Mh/s to 9999 Mh/s
-     {  *prefix = 'M';  *hashrate /= 1e6;   }
-     else if ( *hashrate < 1e13 )      // 10 Gh/s to 9999 Gh/s
-     {  *prefix = 'G';  *hashrate /= 1e9;   }
-     else if ( *hashrate < 1e16 )      // 10 Th/s to 9999 Th/s
-     {  *prefix = 'T';  *hashrate /= 1e12;  }
-     else if ( *hashrate < 1e19 )      // 10 Ph/s to 9999 Ph
-     {  *prefix = 'P';  *hashrate /= 1e15;  }
-     else                              // 10 Eh/s and higher
-     {  *prefix = 'E';  *hashrate /= 1e18;  }
+       if ( *hashrate < 1e4  )    *prefix =  0;                         // below 1e4: unscaled, no prefix
+  else if ( *hashrate < 1e7  )  { *prefix = 'k';  *hashrate /= 1e3;  }  // kilo:  10 k to 9999 k
+  else if ( *hashrate < 1e10 )  { *prefix = 'M';  *hashrate /= 1e6;  }  // mega:  10 M to 9999 M
+  else if ( *hashrate < 1e13 )  { *prefix = 'G';  *hashrate /= 1e9;  }  // giga:  10 G to 9999 G
+  else if ( *hashrate < 1e16 )  { *prefix = 'T';  *hashrate /= 1e12; }  // tera:  10 T to 9999 T
+  else if ( *hashrate < 1e19 )  { *prefix = 'P';  *hashrate /= 1e15; }  // peta:  10 P to 9999 P
+  else if ( *hashrate < 1e22 )  { *prefix = 'E';  *hashrate /= 1e18; }  // exa:   10 E to 9999 E
+  else if ( *hashrate < 1e25 )  { *prefix = 'Z';  *hashrate /= 1e21; }  // zetta: 10 Z to 9999 Z
+  else                          { *prefix = 'Y';  *hashrate /= 1e24; }  // yotta: 10 Y and higher
 }

 static inline void sprintf_et( char *str, int seconds )
@@ -841,10 +837,13 @@ static inline void sprintf_et( char *str, int seconds )
   unsigned int min = seconds / 60;
   unsigned int sec = seconds % 60;
   unsigned int hrs = min / 60;
-   if ( hrs )   
+   if ( unlikely( hrs ) )   
   {
+      unsigned int years = hrs / (24*365);
      unsigned int days = hrs / 24;
-      if ( days )  //0d00h
+      if ( years )      // 0y0d: whole years, then the days left over
+         sprintf( str, "%uy%ud", years, days % 365 );
+      else if ( days )  //0d00h
         sprintf( str, "%ud%02uh", days, hrs % 24 );
      else         // 0h00m  
         sprintf( str, "%uh%02um", hrs, min % 60 );
@@ -866,13 +865,15 @@ const double diff_to_hash = 4294967296.;

 static struct   timeval session_start;
 static struct   timeval five_min_start;
+static uint64_t session_first_block = 0;
 static double   latency_sum = 0.;
 static uint64_t submit_sum  = 0;
 static uint64_t accept_sum  = 0;
+static uint64_t stale_sum  = 0;
 static uint64_t reject_sum  = 0;
 static double   norm_diff_sum = 0.;
 static uint32_t last_block_height = 0;
-static bool     new_job = false;
+//static bool     new_job = false;
 static double   last_targetdiff = 0.;
 static double   ref_rate_hi = 0.;
 static double   ref_rate_lo = 1e100;
@@ -883,6 +884,7 @@ static uint32_t hi_temp = 0;

 struct share_stats_t
 {
+   int share_count;
   struct timeval submit_time;
   double net_diff;
   double share_diff;
@@ -892,7 +894,7 @@ struct share_stats_t
 };

 #define s_stats_size 8
-static struct share_stats_t share_stats[ s_stats_size ] = {0};
+static struct share_stats_t share_stats[ s_stats_size ] = {{0}};
 static int s_get_ptr = 0, s_put_ptr = 0;
 static struct timeval last_submit_time = {0};

@@ -921,6 +923,7 @@ void report_summary_log( bool force )
   uint64_t submits = submit_sum;  submit_sum = 0;
   uint64_t accepts = accept_sum;  accept_sum = 0;
   uint64_t rejects = reject_sum;  reject_sum = 0;
+   uint64_t stales  = stale_sum;   stale_sum  = 0;
 //   int      latency  = latency_sum; latency_sum = 0;
   memcpy( &start_time, &five_min_start, sizeof start_time );
   memcpy( &five_min_start, &now, sizeof now );
@@ -976,7 +979,11 @@ void report_summary_log( bool force )
                       submits, submitted_share_count );
   applog2( LOG_INFO,"Accepted         %6d       %6d",
                       accepts, accepted_share_count );
-   applog2( LOG_INFO,"Rejected         %6d       %6d",
+   if ( stale_share_count )
+      applog2( LOG_INFO,"Stale            %6d       %6d",
+                       stales, stale_share_count );
+   if ( rejected_share_count )
+      applog2( LOG_INFO,"Rejected         %6d       %6d",
                       rejects, rejected_share_count );
   if ( solved_block_count )
      applog2( LOG_INFO,"Blocks solved                 %6d",
@@ -1011,13 +1018,18 @@ static int share_result( int result, struct work *null_work,
   int latency = 0;
   struct share_stats_t my_stats = {0};
   struct timeval ack_time, latency_tv, et;
-   const char *sres = NULL;
+   char ares[48];
+   char sres[48];
+   char rres[48];
+   char bres[48];
+//   char job_id[48];
   bool solved = false; 
-
+   bool stale = false;
+   char *acol = NULL, *bcol = NULL, *scol = NULL, *rcol = NULL;
   // Mutex while we grab a snapshot of the stats.
   pthread_mutex_lock( &stats_lock );

-   if ( share_stats[ s_get_ptr ].submit_time.tv_sec )
+   if ( likely( share_stats[ s_get_ptr ].submit_time.tv_sec ) )
   {
      memcpy( &my_stats, &share_stats[ s_get_ptr], sizeof my_stats );
      memset( &share_stats[ s_get_ptr ], 0, sizeof my_stats );
@@ -1047,7 +1059,7 @@ static int share_result( int result, struct work *null_work,
                                                my_stats.net_diff * 100.;

   // check result
-   if ( result )
+   if ( likely( result ) )
   {
      accepted_share_count++;
      if ( ( my_stats.net_diff > 0. ) && ( my_stats.share_diff >= net_diff ) )
@@ -1057,13 +1069,16 @@ static int share_result( int result, struct work *null_work,
      }
   }
   else
-      rejected_share_count++;
-/*
-   result ? accepted_share_count++ : rejected_share_count++;
-   solved = result && (my_stats.net_diff > 0.0 )
-            && ( my_stats.share_diff >= net_diff );
-   solved_block_count += solved ? 1 : 0 ;
-*/
+   {
+     if ( reason && strstr( reason, "Invalid job id" ) )
+     {
+        stale = true;
+        stale_share_count++;
+     }
+     else
+        rejected_share_count++;
+   }
+
   // update global counters for summary report
   pthread_mutex_lock( &stats_lock );

@@ -1071,37 +1086,88 @@ static int share_result( int result, struct work *null_work,
       hashrate += thr_hashrates[i];
   global_hashrate = hashrate;
   
-   if ( result ) 
+   if ( likely( result ) )
   {
      accept_sum++;
      norm_diff_sum += my_stats.target_diff;
   }
   else
-      reject_sum++;
+   {
+      if ( stale )
+         stale_sum++;
+      else
+         reject_sum++;
+   }
   submit_sum++;
   latency_sum += latency;

   pthread_mutex_unlock( &stats_lock );

-   if ( use_colors )
-      sres = solved ? ( CL_MAG "BLOCK SOLVED" CL_WHT )
-                    : ( result ? ( CL_GRN "Accepted" CL_WHT )
-                             : ( CL_RED "Rejected" CL_WHT ) );
-   else   // monochrome
-      sres = solved ? "BLOCK SOLVED" : ( result ? "Accepted" : "Rejected" );
+   bcol = acol = scol = rcol = "\0";
+   if ( likely( result ) )
+   {
+     if ( unlikely( solved ) )
+     {
+       sprintf( bres, "BLOCK SOLVED %d", solved_block_count );
+       sprintf( ares, "A%d", accepted_share_count );
+     }
+     else
+     {
+       sprintf( bres, "B%d", solved_block_count );
+       sprintf( ares, "Accepted %d", accepted_share_count );
+     }
+     sprintf( sres, "S%d", stale_share_count );
+     sprintf( rres, "R%d", rejected_share_count );
+   }
+   else  
+   {
+     sprintf( ares, "A%d", accepted_share_count );
+     sprintf( bres, "B%d", solved_block_count );
+     if ( stale )
+     {
+        sprintf( sres, "Stale job %d", stale_share_count );
+        sprintf( rres, "R%d", rejected_share_count );
+     }
+     else
+     {
+        sprintf( sres, "S%d", stale_share_count ); 
+        sprintf( rres, "Rejected %d" , rejected_share_count );
+     }
+   } 

-   applog( LOG_NOTICE, "%s, %.3f secs (%dms), A/R/B: %d/%d/%d",
-                       sres, share_time, latency, accepted_share_count,
-                       rejected_share_count, solved_block_count );
+   bcol = acol = scol = rcol = CL_WHT;
+
+   if ( use_colors )
+   {
+     if ( likely( result ) )
+     {
+       if ( unlikely( solved ) )
+       {
+         bcol = CL_MAG;
+         acol = CL_GRN;
+       }
+       else
+         acol = CL_GRN; 
+     }        
+     else if ( stale )
+       scol = CL_YL2;
+     else
+       rcol = CL_RED;
+   }
+
+   applog( LOG_NOTICE, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
+           my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
+           bres, share_time, latency );

   if ( have_stratum && !opt_quiet )
-      applog2( LOG_INFO, "Share diff %.3g (%5f%%), block %d, job %s",
-               my_stats.share_diff, share_ratio, stratum.block_height,
-               my_stats.job_id );
+      applog2( LOG_NOTICE, "Diff %.3g (%.3g%%), %sBlock %d, %sJob %s" CL_WHT,
+               my_stats.share_diff, share_ratio, bcol, stratum.block_height,
+               scol, my_stats.job_id );  // "%%" prints a literal percent sign

-   if ( reason )
+   if ( unlikely( reason && !result ) )
   {
-      applog( LOG_WARNING, "Reject reason: %s", reason );
+      if ( !( opt_quiet || stale ) )
+         applog( LOG_WARNING, "Reject reason: %s", reason );
      
      if ( opt_debug )
      {
@@ -1122,7 +1188,7 @@ static int share_result( int result, struct work *null_work,
         applog2( LOG_INFO, "Target: %s...", str3 );
      }

-      if ( opt_reset_on_stale && strstr( reason, "Invalid job id" ) )
+      if ( unlikely( opt_reset_on_stale && stale ) )
         stratum_need_reset = true;
   }

@@ -1635,7 +1701,7 @@ static void *workio_thread(void *userdata)
 	if ( jsonrpc_2 && !have_stratum )
 		ok = rpc2_workio_login( curl );

-   while (ok)
+   while ( likely(ok) )
   {
 		struct workio_cmd *wc;

@@ -1711,7 +1777,8 @@ static bool get_work(struct thr_info *thr, struct work *work)
 	return true;
 }

-static bool submit_work( struct thr_info *thr, const struct work *work_in )
+static bool submit_work( const struct thr_info *thr,
+                         const struct work *work_in )
 {
 	struct workio_cmd *wc;

@@ -1735,20 +1802,22 @@ err_out:
 	return false;
 }

-// Convert little endian 256 bit unsigned integer to
-// double precision floating point.
-static inline double u256_to_double( const uint64_t* u )
+// __float128? (author's open question; only double is used below)
+// Convert a little endian 256 bit unsigned integer, held as four 64 bit words
+// with u[0] the least significant, to double precision floating point.
+// Returns the value of u rounded to double precision (about 15 decimal digits).
+static inline double u256_to_double( const uint64_t *u )
 {
-   const double f = 4294967296.0 * 4294967296.0;  // 2**64
-   return ( ( u[3] * f + u[2] ) * f + u[1] ) * f + u[0];
+   const double exp64 = 4294967296.0 * 4294967296.0;  // 2**64
+   return ( ( u[3] * exp64 + u[2] ) * exp64 + u[1] ) * exp64 + u[0];
 }

-void work_set_target_ratio( struct work* work, uint32_t* hash )
+void work_set_target_ratio( struct work* work, const void *hash )
 {
   double dhash;

   dhash = u256_to_double( (const uint64_t*)hash );
-   if ( dhash > 0. )
+   if ( likely( dhash > 0. ) )
      work->sharediff = work->targetdiff *
             u256_to_double( (const uint64_t*)( work->target ) ) / dhash;
   else
@@ -1760,6 +1829,7 @@ void work_set_target_ratio( struct work* work, uint32_t* hash )
   // it can overflow the queue and overwrite stats for a share.
   pthread_mutex_lock( &stats_lock );

+   share_stats[ s_put_ptr ].share_count = submitted_share_count;
   gettimeofday( &share_stats[ s_put_ptr ].submit_time, NULL );
   share_stats[ s_put_ptr ].share_diff = work->sharediff;
   share_stats[ s_put_ptr ].net_diff = net_diff;
@@ -1772,37 +1842,39 @@ void work_set_target_ratio( struct work* work, uint32_t* hash )
   pthread_mutex_unlock( &stats_lock );
 }

-bool submit_solution( struct work *work, void *hash,
-                      struct thr_info *thr )
+bool submit_solution( struct work *work, const void *hash,   // true if submit_work() succeeded
+                      const struct thr_info *thr )
 {
-  if ( submit_work( thr, work ) )
+  if ( likely( submit_work( thr, work ) ) )
  {
     submitted_share_count++;
     work_set_target_ratio( work, hash );
     if ( !opt_quiet )
-        applog( LOG_BLUE, "Share %d submitted by thread %d",
-            submitted_share_count, thr->id );
+        applog( LOG_NOTICE, "%d submitted by thread %d, job %s",
+            submitted_share_count, thr->id, work->job_id );   // counter was incremented above
     return true;
  }
  else
-     applog( LOG_WARNING, "Failed to submit share." );
+     applog( LOG_WARNING, "%d failed to submit share.",
+             submitted_share_count );   // NOTE(review): not incremented on this path, so this is the previous count
  return false;
 }

-bool submit_lane_solution( struct work *work, void *hash,
-                           struct thr_info *thr, int lane )
+bool submit_lane_solution( struct work *work, const void *hash,   // true if submit_work() succeeded
+                           const struct thr_info *thr, const int lane )   // lane is only used in the log line
 {
-  if ( submit_work( thr, work ) )
+  if ( likely( submit_work( thr, work ) ) )
  {
     submitted_share_count++;
     work_set_target_ratio( work, hash );
     if ( !opt_quiet )
-        applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d",
-            submitted_share_count, thr->id, lane );
+        applog( LOG_NOTICE, "%d submitted by thread %d, lane %d, job %s",
+            submitted_share_count, thr->id, lane, work->job_id );   // counter was incremented above
     return true;
  }
  else
-     applog( LOG_WARNING, "Failed to submit share." );
+     applog( LOG_WARNING, "%d failed to submit share.",
+          submitted_share_count );   // NOTE(review): not incremented on this path, so this is the previous count
  return false;
 }

@@ -1908,35 +1980,29 @@ double std_calc_network_diff( struct work* work )
   return d;
 }

-uint32_t* std_get_nonceptr( uint32_t *work_data )
+uint32_t *std_get_nonceptr( uint32_t *work_data )
 {
   return work_data + algo_gate.nonce_index;
 }

-uint32_t* jr2_get_nonceptr( uint32_t *work_data )
+uint32_t *jr2_get_nonceptr( uint32_t *work_data )
 {
   // nonce is misaligned, use byte offset
   return (uint32_t*) ( ((uint8_t*) work_data) + algo_gate.nonce_index );
 }

-
 void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
                     uint32_t *end_nonce_ptr, bool clean_job )
 {
   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );

-// the job_id check doesn't work as intended, it's a char pointer!
-// For stratum the pointers can be dereferenced and the strings compared,
-// benchmark not, getwork & gbt unsure.
-//    || ( have_straum && strcmp( work->job_id, g_work->job_id ) ) ) )
-// or
-//    || ( !benchmark && strcmp( work->job_id, g_work->job_id ) ) ) )
-// For now leave it as is, it seems stable.
-// strtoul seems to work.
-   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
-     && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
-      || strtoul( work->job_id, NULL, 16 )
-          != strtoul( g_work->job_id, NULL, 16 ) ) )
+   bool force_new_work = work->job_id ? strtoul(   work->job_id, NULL, 16 ) !=
+                                        strtoul( g_work->job_id, NULL, 16 )
+                                      : true;
+
+   if ( force_new_work || *nonceptr >= *end_nonce_ptr
+   || ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
+        && clean_job ) )
   {
     work_free( work );
     work_copy( work, g_work );
@@ -1958,7 +2024,7 @@ void jr2_get_new_work( struct work* work, struct work* g_work, int thr_id,
   if ( memcmp( work->data, g_work->data, algo_gate.nonce_index )
     || memcmp( ((uint8_t*) work->data)   + JR2_WORK_CMP_INDEX_2,
                ((uint8_t*) g_work->data) + JR2_WORK_CMP_INDEX_2,
-                                                    JR2_WORK_CMP_SIZE_2 ) )
+                                             JR2_WORK_CMP_SIZE_2 ) )
   {
      work_free( work );
      work_copy( work, g_work );
@@ -2007,39 +2073,30 @@ static void *miner_thread( void *userdata )
    * error if it fails */
   if (!opt_benchmark && opt_priority == 0)
   {
-	setpriority(PRIO_PROCESS, 0, 19);
-	drop_policy();
+      setpriority(PRIO_PROCESS, 0, 19);
+      drop_policy();
   }
   else
   {
-	int prio = 0;
+      int prio = 0;
 #ifndef WIN32
-	prio = 18;
-	// note: different behavior on linux (-19 to 19)
-	switch (opt_priority)
-        {
-	   case 1:
-		prio = 5;
-		break;
-	   case 2:
-		prio = 0;
-		break;
-	   case 3:
-		prio = -5;
-		break;
-	   case 4:
-		prio = -10;
-		break;
-	   case 5:
-		prio = -15;
-	}
-	if (opt_debug)
-	   applog(LOG_DEBUG, "Thread %d priority %d (nice %d)", thr_id,
+      prio = 18;
+      // note: different behavior on linux (-19 to 19)
+	   switch ( opt_priority )
+      {
+	      case 1:   prio =   5;   break;
+	      case 2:   prio =   0;   break;
+	      case 3:   prio =  -5;   break;
+	      case 4:   prio = -10;   break;
+	      case 5:   prio = -15;
+      }
+	   if (opt_debug)
+	      applog(LOG_DEBUG, "Thread %d priority %d (nice %d)", thr_id,
                              opt_priority, prio );
 #endif
-	setpriority(PRIO_PROCESS, 0, prio);
-	if (opt_priority == 0)
-	   drop_policy();
+      setpriority(PRIO_PROCESS, 0, prio);
+	   if ( opt_priority == 0 )
+	      drop_policy();
   }
   // CPU thread affinity
   if ( num_cpus > 1 )
@@ -2092,7 +2149,7 @@ static void *miner_thread( void *userdata )
   }

   // wait for stratum to send first job
-   if ( have_stratum ) while ( !stratum.job.job_id ) sleep(1);
+   if ( have_stratum ) while ( unlikely( !g_work.job_id ) ) sleep(1);

   while (1)
   {
@@ -2101,7 +2158,7 @@ static void *miner_thread( void *userdata )
       int64_t max64 = 1000;
       int nonce_found = 0;

-       if ( algo_gate.do_this_thread( thr_id ) )
+       if ( likely( algo_gate.do_this_thread( thr_id ) ) )
       {
          if ( have_stratum )
          {
@@ -2136,10 +2193,10 @@ static void *miner_thread( void *userdata )
       } // do_this_thread
       algo_gate.resync_threads( &work );

-       if ( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) )
+       if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
          continue;
       // conditional mining
-       if (!wanna_mine(thr_id))
+       if ( unlikely( !wanna_mine( thr_id ) ) )
       {
          sleep(5);
 	       continue;
@@ -2151,7 +2208,7 @@ static void *miner_thread( void *userdata )
          max64 = g_work_time + ( have_longpoll ? LP_SCANTIME : opt_scantime )
 	                      - time(NULL);
       // time limit
-       if ( opt_time_limit && firstwork_time )
+       if ( unlikely( opt_time_limit && firstwork_time ) )
       {
          int passed = (int)( time(NULL) - firstwork_time );
          int remain = (int)( opt_time_limit - passed );
@@ -2209,15 +2266,17 @@ static void *miner_thread( void *userdata )
          pthread_mutex_unlock( &stats_lock );
       }
       // If unsubmiited nonce(s) found, submit now. 
-       if ( nonce_found && !opt_benchmark )
+       if ( unlikely( nonce_found && !opt_benchmark ) )
       {  
+          applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
+                               algo_names[ opt_algo ] );
          if ( !submit_work( mythr, &work ) )
          {
             applog( LOG_WARNING, "Failed to submit share." );
             break;
          }
          if ( !opt_quiet )
-              applog( LOG_BLUE, "Share %d submitted by thread %d.",
+              applog( LOG_NOTICE, "%d: submitted by thread %d.",
                      accepted_share_count + rejected_share_count + 1,
                      mythr->id );

@@ -2232,7 +2291,7 @@ static void *miner_thread( void *userdata )
          }
       }
       // display hashrate
-       if ( opt_hash_meter )
+       if ( unlikely( opt_hash_meter ) )
       {
          char hr[16];
          char hr_units[2] = {0,0};
@@ -2250,8 +2309,8 @@ static void *miner_thread( void *userdata )

       // Display benchmark total
       // Update hashrate for API if no shares accepted yet.
-       if ( ( opt_benchmark || !accepted_share_count ) 
-            && thr_id == opt_n_threads - 1 )
+       if ( unlikely( ( opt_benchmark || !accepted_share_count ) 
+            && thr_id == opt_n_threads - 1 ) )
       {
          double hashrate  = 0.;
          for ( i = 0; i < opt_n_threads; i++ )
@@ -2570,12 +2629,11 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   algo_gate.build_extraheader( g_work, sctx );
   net_diff = algo_gate.calc_network_diff( g_work );
   algo_gate.set_work_data_endian( g_work );
-
-   pthread_mutex_unlock( &sctx->work_lock );
-
   work_set_target( g_work, sctx->job.diff
                                  / ( opt_target_factor * opt_diff_factor ) );

+   pthread_mutex_unlock( &sctx->work_lock );
+
   if ( opt_debug )
   {
      unsigned char *xnonce2str = abin2hex( g_work->xnonce2,
@@ -2585,60 +2643,72 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      free( xnonce2str );
   }

-   // Log new block and/or stratum difficulty change.
+   double hr = 0.;
+   pthread_mutex_lock( &stats_lock );
+
+   for ( int i = 0; i < opt_n_threads; i++ )
+      hr += thr_hashrates[i];
+   global_hashrate = hr;
+   pthread_mutex_unlock( &stats_lock );
+
+   if ( stratum_diff != sctx->job.diff )
+      applog( LOG_BLUE, "New stratum diff %g, block %d, job %s",
+                        sctx->job.diff, sctx->block_height, g_work->job_id );
+   else if ( last_block_height != sctx->block_height )
+      applog( LOG_BLUE, "New block %d, job %s",
+                         sctx->block_height, g_work->job_id );
+   else
+      applog( LOG_BLUE,"New job %s", g_work->job_id );
+
+   // Update data and calculate new estimates.
   if ( ( stratum_diff != sctx->job.diff )
-     || ( last_block_height != sctx->block_height ) )
+   || ( last_block_height != sctx->block_height ) )
   {
-       double hr = 0.;
-       new_job = false;
-       pthread_mutex_lock( &stats_lock );
+      static bool multipool = false;
+      if ( stratum.block_height < last_block_height ) multipool = true;
+      if ( unlikely( !session_first_block ) )
+         session_first_block = stratum.block_height;
+      last_block_height = stratum.block_height;
+      stratum_diff      = sctx->job.diff;
+      last_targetdiff   = g_work->targetdiff;

-       for ( int i = 0; i < opt_n_threads; i++ )
-          hr += thr_hashrates[i];
-       global_hashrate = hr;
-       pthread_mutex_unlock( &stats_lock );
+      if ( !opt_quiet )
+      {
+         applog2( LOG_INFO, "%s: %s", algo_names[opt_algo], short_url );
+         applog2( LOG_INFO, "Diff: Net %.3g, Stratum %.3g, Target %.3g",
+                            net_diff, stratum_diff, last_targetdiff );
+         if ( likely( hr > 0. ) )
+         {
+            char hr_units[4] = {0};
+            char block_ttf[32];
+            char share_ttf[32];

-       if ( !opt_quiet )
-       {
-          if ( stratum_diff != sctx->job.diff )
-             applog( LOG_BLUE, "New stratum diff %g, block %d, job %s",
-                   sctx->job.diff, sctx->block_height, g_work->job_id );
-          else if ( last_block_height != sctx->block_height )
-             applog( LOG_BLUE, "New block %d, job %s", sctx->block_height,
-                                                        g_work->job_id );
-          else
-             applog( LOG_BLUE,"New job %s.", g_work->job_id );
-       }
+            sprintf_et( block_ttf, net_diff * diff_to_hash / hr );
+            sprintf_et( share_ttf, last_targetdiff * diff_to_hash / hr );
+            scale_hash_for_display ( &hr, hr_units );
+ 
+            applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
+                               hr, hr_units, block_ttf, share_ttf );
+            if ( !multipool && net_diff > 0. )
+            {
+               struct timeval now, et;
+               gettimeofday( &now, NULL );
+               timeval_subtract( &et, &now, &session_start );
+               double net_hr = net_diff * diff_to_hash;
+               char net_ttf[32];
+               char net_hr_units[4] = {0};

-       // Update data and calculate new estimates.
-       stratum_diff = sctx->job.diff;
-       last_block_height = stratum.block_height;
-       last_targetdiff = g_work->targetdiff;
+               sprintf_et( net_ttf,
+                   ( last_block_height - session_first_block ) == 0 ? 0 :
+                     et.tv_sec / ( last_block_height - session_first_block ) );

-       if ( !opt_quiet )
-       {
-          applog2( LOG_INFO, "%s %s block %d", short_url,
-                             algo_names[opt_algo], stratum.block_height );
-          applog2( LOG_INFO, "Diff: net %g, stratum %g, target %g",
-                             net_diff, stratum_diff, last_targetdiff );
-       }
+               scale_hash_for_display ( &net_hr, net_hr_units );

-       if ( hr > 0. )
-       {
-          char hr_units[4] = {0};
-          char block_ttf[32];
-          char share_ttf[32];
-
-          sprintf_et( block_ttf, net_diff * diff_to_hash / hr );
-          sprintf_et( share_ttf, last_targetdiff * diff_to_hash / hr );
-          scale_hash_for_display ( &hr, hr_units );
-
-          if ( !opt_quiet )
-          {   
-             applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
-                                hr, hr_units, block_ttf, share_ttf );
-          }
-       }
+               applog2( LOG_INFO, "TTF @ %.2f %sh/s: %s",
+                                  net_hr, net_hr_units, net_ttf );
+            }
+         }  // hr > 0
+      } // !quiet
   }  // new diff/block   
 }

@@ -2666,7 +2736,7 @@ static void *stratum_thread(void *userdata )
   {
      int failures = 0;

-      if ( stratum_need_reset )
+      if ( unlikely( stratum_need_reset ) )
      {
          stratum_need_reset = false;
          stratum_disconnect( &stratum );
@@ -2703,8 +2773,7 @@ static void *stratum_thread(void *userdata )
                applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
            sleep(opt_fail_pause);
         }
-
-         if (jsonrpc_2)
+         if ( unlikely( jsonrpc_2 ) )
         {
             work_free(&g_work);
             work_copy(&g_work, &stratum.work);
@@ -2717,28 +2786,12 @@ static void *stratum_thread(void *userdata )
      if ( stratum.job.job_id
          && ( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) )
      {
-         new_job = true;
         pthread_mutex_lock(&g_work_lock);
         algo_gate.stratum_gen_work( &stratum, &g_work );
         time(&g_work_time);
         pthread_mutex_unlock(&g_work_lock);
         restart_threads();
-
-         if ( stratum.job.clean || jsonrpc_2 )
-         {
-            if ( !opt_quiet && last_block_height && new_job
-               &&  ( last_block_height == stratum.block_height ) )
-            {  
-               new_job = false;
-               applog( LOG_BLUE,"New job %s", g_work.job_id );
-            }
-         }
-         else if (opt_debug && !opt_quiet)
-         {
-            applog( LOG_BLUE, "%s asks job %d for block %d", short_url,
-                strtoul( stratum.job.job_id, NULL, 16 ), stratum.block_height );
-         }
-      }  // stratum.job.job_id
+      } 

     if ( stratum_socket_full( &stratum, opt_timeout ) )
     {
@@ -2764,25 +2817,6 @@ static void *stratum_thread(void *userdata )
        // check if this redundant
        stratum_disconnect( &stratum );
     }   
-/*
-     if ( !stratum_socket_full( &stratum, opt_timeout ) )
-     {
-        stratum_errors++;
-        applog(LOG_ERR, "Stratum connection timeout");
-        s = NULL;
-     }
-     else
-        s = stratum_recv_line(&stratum);
-     if ( !s )
-     {
-        stratum_disconnect(&stratum);
-        applog(LOG_WARNING, "Stratum connection interrupted");
-        continue;
-     }
-     if (!stratum_handle_method(&stratum, s))
-          stratum_handle_response(s);
-     free(s);
-*/
   }  // loop
 out:
  return NULL;
@@ -3710,6 +3744,7 @@ int main(int argc, char *argv[])
          applog(LOG_WARNING,"available on Linux. Using default affinity.");
          opt_affinity = -1;
      }
+/*
      else	
      {
         affine_to_cpu_mask( -1, opt_affinity );
@@ -3728,6 +3763,7 @@ int main(int argc, char *argv[])
 #endif
         }
      }
+*/
   }

 #ifdef HAVE_SYSLOG_H
@@ -3759,7 +3795,7 @@ int main(int argc, char *argv[])

 	/* start work I/O thread */
 	if (thread_create(thr, workio_thread))
-        {
+   {
 		applog(LOG_ERR, "work thread create failed");
 		return 1;
 	}
@@ -3825,7 +3861,7 @@ int main(int argc, char *argv[])
 		thr->q = tq_new();
 		if (!thr->q)
 			return 1;
-		err = thread_create(thr, miner_thread);
+         err = thread_create(thr, miner_thread);
 		if (err) {
 			applog(LOG_ERR, "thread %d create failed", i);
 			return 1;
--- a/miner.h
+++ b/miner.h
@@ -313,12 +313,14 @@ size_t address_to_script( unsigned char *out, size_t outsz, const char *addr );
 int    timeval_subtract( struct timeval *result, struct timeval *x,
                           struct timeval *y);
 bool   fulltest( const uint32_t *hash, const uint32_t *target );
+bool   valid_hash( const void*, const void* );
+
 void   work_set_target( struct work* work, double diff );
 double target_to_diff( uint32_t* target );
 extern void diff_to_target(uint32_t *target, double diff);

 double hash_target_ratio( uint32_t* hash, uint32_t* target );
-void   work_set_target_ratio( struct work* work, uint32_t* hash );
+void   work_set_target_ratio( struct work* work, const void *hash );

 struct thr_info {
        int id;
@@ -330,10 +332,10 @@ struct thr_info {

 //struct thr_info *thr_info;

-bool   submit_solution( struct work *work, void *hash,
-                        struct thr_info *thr );
-bool   submit_lane_solution( struct work *work, void *hash,
-                             struct thr_info *thr, int lane );
+bool   submit_solution( struct work *work, const void *hash,
+                        const struct thr_info *thr );
+bool   submit_lane_solution( struct work *work, const void *hash,
+                             const struct thr_info *thr, const int lane );


 //bool submit_work( struct thr_info *thr, const struct work *work_in );
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -129,8 +129,8 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
 }

 // Equivalent of set1, broadcast 64 bit constant to all 64 bit elements.
-#define m512_const1_256( i )   _mm512_broadcast_i64x4( i )
-#define m512_const1_128( i )   _mm512_broadcast_i64x2( i )
+#define m512_const1_256( v )   _mm512_broadcast_i64x4( v )
+#define m512_const1_128( v )   _mm512_broadcast_i64x2( v )
 #define m512_const1_64( i )    _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
 #define m512_const1_32( i )    _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
 #define m512_const1_16( i )    _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
@@ -547,8 +547,6 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 //  Rotate elements from 2 512 bit vectors in place, source arguments
 //  are overwritten.
-//  These can all be done with 2 permutex2var instructions but they are
-//  slower than either xor or alignr and require AVX512VBMI.

 #define mm512_swap1024_512(v1, v2) \
   v1 = _mm512_xor_si512(v1, v2); \
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -41,15 +41,16 @@
   "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"

 #define HWMON_PATH3 \
-   "/sys/class/hwmon/hwmon0/temp1_input"
+   "/sys/devices/platform/coretemp.0/hwmon/hwmon2/temp1_input"

 #define HWMON_PATH \
 "/sys/class/hwmon/hwmon2/temp1_input"

-/*
+// need this for Ryzen
 #define HWMON_ALT \
 "/sys/class/hwmon/hwmon0/temp1_input"

+/*
 #define HWMON_ALT1 \
 "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
 */
@@ -84,21 +85,9 @@ static inline float linux_cputemp(int core)
   if (!fd)
      fd = fopen(HWMON_PATH, "r");

-	if (!fd)
-//		fd = fopen(HWMON_ALT1, "r");
-
-//	if (!fd)
-		fd = fopen(HWMON_ALT2, "r");
-
-	if (!fd)
-		fd = fopen(HWMON_ALT3, "r");
-
-	if (!fd)
-		fd = fopen(HWMON_ALT4, "r");
-
-	if (!fd)
-      fd = fopen(HWMON_ALT5, "r");
-
+   if (!fd)
+      fd = fopen(HWMON_ALT, "r");
+   
 	if (!fd)
 		return tc;

--- a/util.c
+++ b/util.c
@@ -982,40 +982,59 @@ int timeval_subtract(struct timeval *result, struct timeval *x,
 	return x->tv_sec < y->tv_sec;
 }

-bool fulltest(const uint32_t *hash, const uint32_t *target)
+// Use this when deinterleaved. Returns true if hash <= target, comparing
+// four 64 bit words from most significant [3] down to least significant [0].
+inline bool valid_hash( const void *hash, const void *target )
+{
+   const uint64_t *h = (const uint64_t*)hash;     // words [0..3] are read: up to 32 bytes
+   const uint64_t *t = (const uint64_t*)target;   // NOTE(review): cast assumes 8 byte alignment - confirm
+   if ( h[3] > t[3] ) return false;
+   if ( h[3] < t[3] ) return true;
+   if ( h[2] > t[2] ) return false;
+   if ( h[2] < t[2] ) return true;
+   if ( h[1] > t[1] ) return false;
+   if ( h[1] < t[1] ) return true;
+   if ( h[0] > t[0] ) return false;
+   return true;   // upper three words equal and h[0] <= t[0]
+}
+
+bool fulltest( const uint32_t *hash, const uint32_t *target )
 {
 	int i;
 	bool rc = true;
 	
-	for (i = 7; i >= 0; i--) {
-		if (hash[i] > target[i]) {
+	for ( i = 7; i >= 0; i-- )
+   {
+		if ( hash[i] > target[i] )
+      {
 			rc = false;
 			break;
 		}
-		if (hash[i] < target[i]) {
+		if ( hash[i] < target[i] )
+      {
 			rc = true;
 			break;
 		}
 	}

-	if (opt_debug) {
+	if ( opt_debug )
+   {
 		uint32_t hash_be[8], target_be[8];
 		char hash_str[65], target_str[65];
 		
-		for (i = 0; i < 8; i++) {
-			be32enc(hash_be + i, hash[7 - i]);
-			be32enc(target_be + i, target[7 - i]);
+		for ( i = 0; i < 8; i++ )
+      {
+			be32enc( hash_be + i, hash[7 - i] );
+			be32enc( target_be + i, target[7 - i] );
 		}
-		bin2hex(hash_str, (unsigned char *)hash_be, 32);
-		bin2hex(target_str, (unsigned char *)target_be, 32);
+		bin2hex( hash_str, (unsigned char *)hash_be, 32 );
+		bin2hex( target_str, (unsigned char *)target_be, 32 );

-		applog(LOG_DEBUG, "DEBUG: %s\nHash:   %s\nTarget: %s",
-			rc ? "hash <= target"
-			   : "hash > target (false positive)",
-			hash_str,
-			target_str);
+		applog( LOG_DEBUG, "DEBUG: %s\nHash:   %s\nTarget: %s",
+                         rc ? "hash <= target"
+			                   : "hash > target (false positive)",
+	                      hash_str, target_str );
 	}
-
 	return rc;
 }
Author	SHA1	Message	Date
Jay D Dee	103e6ad36c	v3.11.6	2020-01-23 00:11:08 -05:00
Jay D Dee	1a7a573675	v3.11.5	2020-01-18 15:14:27 -05:00