v3.8.3

2025-09-17 23:44:27 +00:00 · 2018-02-23 12:39:15 -05:00
70 changed files with 3871 additions and 1848 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -45,7 +45,10 @@ cpuminer_SOURCES = \
  algo/blake/sph_blake2b.c \
  algo/blake/blake2b.c \
  algo/blake/sph-blake2s.c \
+  algo/blake/blake2s-hash-4way.c \
  algo/blake/blake2s.c \
+  algo/blake/blake2s-gate.c \
+  algo/blake/blake2s-4way.c \
  algo/blake/blakecoin-gate.c \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
--- a/10
+++ b/10
@@ -134,7 +134,7 @@ cd /c/path/to/cpuminer-opt
 Run build.sh to build on Windows or execute the following commands.

 ./autogen.sh
-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make

 Start mining
@@ -159,6 +159,14 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.8.3
+
+More restoration of lost lyra2 hash.
+8 way AVX2 and 4 way AVX optimizations for blakecoin, vanilla & blake2s.
+8 way AVX2 for lbry.
+Scaled hashrate for API output.
+A couple of GBT fixes.
+
 v3.8.2.1

 Fixed low difficulty rejects with allium.
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -1,19 +1,18 @@
 #include "blake-gate.h"
-
-#if defined (BLAKE_4WAY)
-
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
 #include <memory.h>

-blake256r14_4way_context blake_ctx;
+#if defined (BLAKE_4WAY)
+
+blake256r14_4way_context blake_4w_ctx;

 void blakehash_4way(void *state, const void *input)
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
     blake256r14_4way_context ctx;
-     memcpy( &ctx, &blake_ctx, sizeof ctx );
+     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
     blake256r14_4way( &ctx, input + (64<<2), 16 );
     blake256r14_4way_close( &ctx, vhash );
     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
@@ -31,7 +30,6 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;

   if (opt_benchmark)
@@ -39,15 +37,12 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

   // we need big endian data...
   swab32_array( edata, pdata, 20 );
-
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
-   blake256r14_4way_init( &blake_ctx );
-   blake256r14_4way( &blake_ctx, vdata, 64 );
+   blake256r14_4way_init( &blake_4w_ctx );
+   blake256r14_4way( &blake_4w_ctx, vdata, 64 );

   uint32_t *noncep = vdata + 76;   // 19*4
   do {
-      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep,    n   );
      be32enc( noncep +1, n+1 );
      be32enc( noncep +2, n+2 );
@@ -55,34 +50,11 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

      blakehash_4way( hash, vdata );

-      if (  hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
-      {
-           found[2] = true;
-           num_found++;
-           nonces[2] = n+2;
-           work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
-      {
-           found[3] = true;
-           num_found++;
-           nonces[3] = n+3;
-           work_set_target_ratio( work, hash+24 );
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;

@@ -95,3 +67,77 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

 #endif

+#if defined(BLAKE_8WAY)
+
+blake256r14_8way_context blake_8w_ctx;
+
+void blakehash_8way( void *state, const void *input )
+{  // Resume from the shared midstate and hash 16 more bytes of each lane.
+   uint32_t lanes[8*8] __attribute__ ((aligned (64)));
+   blake256r14_8way_context work_ctx = blake_8w_ctx;
+   blake256r14_8way( &work_ctx, input + (64<<3), 16 );
+   blake256r14_8way_close( &work_ctx, lanes );
+   // Split the interleaved digests into 8 results, 32 bytes apart in state.
+   mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
+                            state+128, state+160, state+192, state+224,
+                            lanes, 256 );
+}
+
+// Scan nonces 8 at a time with 8 way Blake-256 (14 rounds). The first 64
+// header bytes are hashed once into blake_8w_ctx; each pass rewrites only the
+// 8 interleaved nonce words. Winners are packed into work->nonces[0..] and
+// counted in the return value; *hashes_done is set to n - first_nonce + 1.
+int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+
+   if (opt_benchmark)
+      HTarget = 0x7f;
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+   blake256r14_8way_init( &blake_8w_ctx );
+   blake256r14_8way( &blake_8w_ctx, vdata, 64 );
+
+   uint32_t *noncep = vdata + 152;   // 19*8
+   do {
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      be32enc( noncep +4, n+4 );
+      be32enc( noncep +5, n+5 );
+      be32enc( noncep +6, n+6 );
+      be32enc( noncep +7, n+7 );
+      pdata[19] = n;
+
+      blakehash_8way( hash, vdata );
+      // Lane i's 8 word digest starts at hash + (i<<3), not hash + i.
+      for ( int i = 0; i < 8; i++ )
+      if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      {
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
@@ -58,6 +58,8 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

+// Blake-256
+
 static const sph_u32 IV256[8] = {
 	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
 	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
@@ -67,6 +69,8 @@ static const sph_u32 IV256[8] = {

 #if defined (__AVX2__)

+// Blake-512
+
 static const sph_u64 IV512[8] = {
 	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
 	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
@@ -78,7 +82,7 @@ static const sph_u64 IV512[8] = {

 #if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64

-// Blake-256 4 & 8 way, Blake-512 4way
+// Blake-256 4 & 8 way, Blake-512 4 way

 static const unsigned sigma[16][16] = {
 	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
@@ -371,6 +375,8 @@ do { \

 #if SPH_COMPACT_BLAKE_32

+// Blake-256 4 way
+
 #define ROUND_S_4WAY(r)   do { \
 	GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
 		CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
@@ -407,7 +413,7 @@ do { \

 #if defined (__AVX2__)

-// BLAKE256 8 WAY
+// Blake-256 8 way

 #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
@@ -487,6 +493,8 @@ do { \

 #endif

+// Blake-256 4 way
+
 #define DECL_STATE32_4WAY \
 	__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
 	__m128i S0, S1, S2, S3; \
@@ -527,6 +535,7 @@ do { \
 	} while (0)

 #if SPH_COMPACT_BLAKE_32
+// not used

 #define COMPRESS32_4WAY( rounds )   do { \
 	__m128i M[16]; \
@@ -778,7 +787,6 @@ do { \
                                                              S3 ), H7 ); \
 } while (0)

-
 // Blake-512 4 way

 #define DECL_STATE64_4WAY \
@@ -967,6 +975,8 @@ do { \

 #endif

+// Blake-256 4 way
+
 static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 };

 static void
@@ -988,52 +998,51 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
 {
   __m128i *vdata = (__m128i*)data;
   __m128i *buf;
-	size_t ptr;
-        const int buf_size = 64;   // number of elements, sizeof/4
-	DECL_STATE32_4WAY
+   size_t ptr;
+   const int buf_size = 64;   // number of elements, sizeof/4
+   DECL_STATE32_4WAY
+   buf = sc->buf;
+   ptr = sc->ptr;
+   if ( len < buf_size - ptr )
+   {
+      memcpy_128( buf + (ptr>>2), vdata, len>>2 );
+      ptr += len;
+      sc->ptr = ptr;
+      return;
+   }

-	buf = sc->buf;
-	ptr = sc->ptr;
-	if ( len < buf_size - ptr )
-        {
-		memcpy_128( buf + (ptr>>2), vdata, len>>2 );
-		ptr += len;
-		sc->ptr = ptr;
-		return;
-	}
+   READ_STATE32_4WAY(sc);
+   while ( len > 0 )
+   {
+      size_t clen;

-	READ_STATE32_4WAY(sc);
-	while ( len > 0 )
-        {
-           size_t clen;
-
-	   clen = buf_size - ptr;
-	   if (clen > len)
-		clen = len;
-	   memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
-	   ptr += clen;
-           vdata += (clen>>2);
-	   len -= clen;
-	   if ( ptr == buf_size )
-           {
-		if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
-			T1 = SPH_T32(T1 + 1);
-                COMPRESS32_4WAY( sc->rounds );
-		ptr = 0;
-	   }
-	}
-	WRITE_STATE32_4WAY(sc);
-	sc->ptr = ptr;
+      clen = buf_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
+      ptr += clen;
+      vdata += (clen>>2);
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+         if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
+            T1 = SPH_T32(T1 + 1);
+         COMPRESS32_4WAY( sc->rounds );
+         ptr = 0;
+      }
+   }
+   WRITE_STATE32_4WAY(sc);
+   sc->ptr = ptr;
 }

 static void
 blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
               void *dst, size_t out_size_w32 )
 {
-   union {
+//   union {
 	__m128i buf[16];
-	sph_u32 dummy;
-   } u;
+//	sph_u32 dummy;
+//   } u;
   size_t ptr, k;
   unsigned bit_len;
   sph_u32 th, tl;
@@ -1041,7 +1050,7 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
+   buf[ptr>>2] = _mm_set1_epi32( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -1060,26 +1069,26 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

   if ( ptr <= 52 )
   {
-       memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       memset_zero_128( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
+           buf[52>>2] = _mm_or_si128( buf[52>>2],
                                        _mm_set1_epi32( 0x01000000UL ) );
-       *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
-       *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
-       blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
+       *(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
+       *(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
+       blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
   }
   else
   {
-	memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
-	blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
+	memset_zero_128( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+	blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
 	sc->T0 = SPH_C32(0xFFFFFE00UL);
 	sc->T1 = SPH_C32(0xFFFFFFFFUL);
-	memset_zero_128( u.buf, 56>>2 );
+	memset_zero_128( buf, 56>>2 );
       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
-        *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
-        *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
-	blake32_4way( sc, u.buf, 64 );
+           buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
+        *(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
+        *(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
+	blake32_4way( sc, buf, 64 );
   }
   out = (__m128i*)dst;
   for ( k = 0; k < out_size_w32; k++ )
@@ -1114,7 +1123,6 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
   size_t ptr;
   const int buf_size = 64;   // number of elements, sizeof/4
   DECL_STATE32_8WAY
-
   buf = sc->buf;
   ptr = sc->ptr;
   if ( len < buf_size - ptr )
@@ -1153,10 +1161,10 @@ static void
 blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
-   union {
+//   union {
        __m256i buf[16];
-        sph_u32 dummy;
-   } u;
+//        sph_u32 dummy;
+//   } u;
   size_t ptr, k;
   unsigned bit_len;
   sph_u32 th, tl;
@@ -1164,7 +1172,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   u.buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
+   buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -1183,26 +1191,26 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

   if ( ptr <= 52 )
   {
-       memset_zero_256( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
-       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
+       memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = _mm256_or_si256( buf[52>>2],
                                           _mm256_set1_epi32( 0x01000000UL ) );
-       *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
-       *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
-       blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
+       *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
+       *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
+       blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
   }
   else
   {
-        memset_zero_256( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
-        blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
+        memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+        blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
        sc->T0 = SPH_C32(0xFFFFFE00UL);
        sc->T1 = SPH_C32(0xFFFFFFFFUL);
-        memset_zero_256( u.buf, 56>>2 );
-       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
-        *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
-        *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
-        blake32_8way( sc, u.buf, 64 );
+        memset_zero_256( buf, 56>>2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
+        *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
+        *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
+        blake32_8way( sc, buf, 64 );
   }
   out = (__m256i*)dst;
   for ( k = 0; k < out_size_w32; k++ )
@@ -1274,10 +1282,10 @@ static void
 blake64_4way_close( blake_4way_big_context *sc,
 	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
 {
-   union {
+//   union {
      __m256i buf[16];
-      sph_u64 dummy;
-   } u;
+//      sph_u64 dummy;
+//   } u;
   size_t ptr, k;
   unsigned bit_len;
   uint64_t z, zz;
@@ -1288,7 +1296,7 @@ blake64_4way_close( blake_4way_big_context *sc,
   bit_len = ((unsigned)ptr << 3);
   z = 0x80 >> n;
   zz = ((ub & -z) | z) & 0xFF;
-   u.buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
+   buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
@@ -1307,33 +1315,33 @@ blake64_4way_close( blake_4way_big_context *sc,
   }
   if ( ptr <= 104 )
   {
-       memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
+       memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       if ( out_size_w64 == 8 )
-          u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
+          buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
                                 _mm256_set1_epi64x( 0x0100000000000000ULL ) );
-       *(u.buf+(112>>3)) = mm256_bswap_64(
+       *(buf+(112>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(u.buf+(120>>3)) = mm256_bswap_64(
+       *(buf+(120>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );

-       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
+       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
   }
   else
  {
-       memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
+       memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

-       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
+       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
-       memset_zero_256( u.buf, 112>>3 ); 
+       memset_zero_256( buf, 112>>3 ); 
       if ( out_size_w64 == 8 )
-           u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
-       *(u.buf+(112>>3)) = mm256_bswap_64(
+           buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
+       *(buf+(112>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(u.buf+(120>>3)) = mm256_bswap_64(
+       *(buf+(120>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );

-       blake64_4way( sc, u.buf, 128 );
+       blake64_4way( sc, buf, 128 );
   }
   out = (__m256i*)dst;
   for ( k = 0; k < out_size_w64; k++ )
@@ -1342,7 +1350,7 @@ blake64_4way_close( blake_4way_big_context *sc,

 #endif

-// Blake-256 4 way & 8 way
+// Blake-256 4 way

 // default 14 rounds, backward copatibility
 void
@@ -1364,6 +1372,9 @@ blake256_4way_close(void *cc, void *dst)
 }

 #if defined(__AVX2__)
+
+// Blake-256 8way
+
 void
 blake256_8way_init(void *cc)
 {
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -35,7 +35,7 @@
 */

 #ifndef __BLAKE_HASH_4WAY__
-#define __BLAKE_HASH_4WAY__
+#define __BLAKE_HASH_4WAY__ 1

 #ifdef __AVX__

@@ -117,11 +117,11 @@ void blake256r8_8way_close(void *cc, void *dst);
 // Blake-512 4 way

 typedef struct {
-        __m256i buf[16] __attribute__ ((aligned (64)));
-        __m256i H[8];
-        __m256i S[4];   
-        size_t ptr;
-	sph_u64 T0, T1;
+   __m256i buf[16] __attribute__ ((aligned (64)));
+   __m256i H[8];
+   __m256i S[4];   
+   size_t ptr;
+   sph_u64 T0, T1;
 } blake_4way_big_context;

 typedef blake_4way_big_context blake512_4way_context;
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -0,0 +1,134 @@
+#include "blake2s-gate.h"
+#include "blake2s-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+
+#if defined(BLAKE2S_8WAY)
+
+static __thread blake2s_8way_state blake2s_8w_ctx;
+
+// Finish 8 interleaved BLAKE2s hashes from the midstate in blake2s_8w_ctx:
+// hash 16 more bytes per lane from input, then write the 8 digests 32 bytes
+// apart in output.
+void blake2s_8way_hash( void *output, const void *input )
+{
+   uint32_t lanes[8*8] __attribute__ ((aligned (64)));
+   blake2s_8way_state st = blake2s_8w_ctx;   // private copy of the midstate
+   blake2s_8way_update( &st, input + (64<<3), 16 );
+   blake2s_8way_final( &st, lanes, BLAKE2S_OUTBYTES );
+   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
+                            output+128, output+160, output+192, output+224,
+                            lanes, 256 );
+}
+
+// Scan nonces 8 at a time with 8 way BLAKE2s. The first 64 header bytes are
+// hashed once into the thread local blake2s_8w_ctx; every pass afterwards
+// only rewrites the 8 interleaved nonce words (word 19 of each lane).
+// Returns the number of nonces stored in work->nonces.
+int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t *winners = work->nonces;
+   uint32_t *noncep = vdata + 152;   // 19*8
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t n = first_nonce;
+   int hits = 0;
+
+   // Byte swapped copy of the header, replicated into all 8 lanes.
+   swab32_array( edata, pdata, 20 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
+
+   do {
+      for ( int lane = 0; lane < 8; lane++ )
+         be32enc( noncep + lane, n + lane );
+      pdata[19] = n;
+
+      blake2s_8way_hash( hash, vdata );
+
+      // Lane i's 8 word digest starts at hash + (i<<3).
+      for ( int i = 0; i < 8; i++ )
+      {
+         uint32_t *lane_hash = hash + (i<<3);
+         if ( lane_hash[7] > Htarg || !fulltest( lane_hash, ptarget ) )
+            continue;
+         winners[ hits++ ] = n+i;
+         work_set_target_ratio( work, lane_hash );
+      }
+      n += 8;
+   } while ( (hits == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return hits;
+}
+
+#elif defined(BLAKE2S_4WAY)
+
+static __thread blake2s_4way_state blake2s_4w_ctx;
+
+// Finish 4 interleaved BLAKE2s hashes from the midstate in blake2s_4w_ctx:
+// hash 16 more bytes per lane from input, then write the 4 digests 32 bytes
+// apart in output.
+void blake2s_4way_hash( void *output, const void *input )
+{
+   uint32_t lanes[8*4] __attribute__ ((aligned (64)));
+   blake2s_4way_state st = blake2s_4w_ctx;   // private copy of the midstate
+   blake2s_4way_update( &st, input + (64<<2), 16 );
+   blake2s_4way_final( &st, lanes, BLAKE2S_OUTBYTES );
+   mm_deinterleave_4x32( output, output+32, output+64, output+96, lanes, 256 );
+}
+
+// Scan nonces 4 at a time with 4 way BLAKE2s. The first 64 header bytes are
+// hashed once into the thread local blake2s_4w_ctx; each pass only rewrites
+// the 4 interleaved nonce words. Returns the count stored in work->nonces.
+int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t *winners = work->nonces;
+   uint32_t *noncep = vdata + 76;   // 19*4
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t n = first_nonce;
+   int hits = 0;
+
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
+
+   do {
+      for ( int lane = 0; lane < 4; lane++ )
+         be32enc( noncep + lane, n + lane );
+      pdata[19] = n;
+      blake2s_4way_hash( hash, vdata );
+      for ( int i = 0; i < 4; i++ )
+      {
+         uint32_t *lane_hash = hash + (i<<3);   // 8 words per lane digest
+         if ( lane_hash[7] > Htarg || !fulltest( lane_hash, ptarget ) )
+            continue;
+         winners[ hits++ ] = n+i;
+         work_set_target_ratio( work, lane_hash );
+      }
+      n += 4;
+   } while ( (hits == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return hits;
+}
+
+#endif
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -0,0 +1,27 @@
+#include "blake2s-gate.h"
+
+
+// gate->get_max64 for blake2s; cpuminer-multi-decred uses get_max64_0x3fffffLL
+int64_t blake2s_get_max64( void )
+{
+   return 0x7ffffLL;
+}
+
+bool register_blake2s_algo( algo_gate_t* gate )
+{ // Install the widest blake2s implementation this build was compiled for.
+#if defined(BLAKE2S_8WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_8way;
+  gate->hash      = (void*)&blake2s_8way_hash;
+#elif defined(BLAKE2S_4WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_4way;
+  gate->hash      = (void*)&blake2s_4way_hash;
+#else
+  gate->scanhash  = (void*)&scanhash_blake2s;
+  gate->hash      = (void*)&blake2s_hash;
+#endif
+  gate->get_max64 = (void*)&blake2s_get_max64;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
+  return true;   // always succeeds
+}
+
+
--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -0,0 +1,35 @@
+#ifndef __BLAKE2S_GATE_H__
+#define __BLAKE2S_GATE_H__ 1
+
+#include <stdint.h>
+#include "algo-gate-api.h"
+
+#if defined(__AVX__)
+  #define BLAKE2S_4WAY
+#endif
+#if defined(__AVX2__)
+  #define BLAKE2S_8WAY
+#endif
+
+bool register_blake2s_algo( algo_gate_t* gate );
+
+#if defined(BLAKE2S_8WAY)
+
+void blake2s_8way_hash( void *state, const void *input );
+int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#elif defined (BLAKE2S_4WAY)
+
+void blake2s_4way_hash( void *state, const void *input );
+int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#else
+
+void blake2s_hash( void *state, const void *input );
+int scanhash_blake2s( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif
+
+#endif
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -0,0 +1,362 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include "blake2s-hash-4way.h"
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(__AVX__)
+
+static const uint32_t blake2s_IV[8] =
+{
+	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static const uint8_t blake2s_sigma[10][16] =
+{
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
+};
+
+// define a constant for initial param.
+
+// Set up S for an unkeyed 4 way BLAKE2s hash producing outlen bytes per
+// lane: every lane gets h[i] = IV[i] ^ word i of the parameter block, and
+// the counters, flags and buffer are cleared. Always returns 0.
+int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
+{
+   blake2s_nway_param P[1];
+   uint32_t p[8];
+
+   // Clear the whole block with memset: node_offset is only 6 bytes wide,
+   // so a 64 bit store into it would run past the field.
+   memset( P, 0, sizeof( P ) );
+   P->digest_length = outlen;
+   P->fanout        = 1;
+   P->depth         = 1;
+
+   // Read the packed 32 byte block as 8 words without an aliasing cast.
+   memcpy( p, P, sizeof( p ) );
+
+   memset( S, 0, sizeof( blake2s_4way_state ) );
+   /* IV XOR ParamBlock */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm_xor_si128( _mm_set1_epi32( blake2s_IV[i] ),
+                               _mm_set1_epi32( p[i] ) );
+
+   return 0;
+}
+
+int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
+{  // One BLAKE2s compression of a 64 byte block on 4 interleaved lanes.
+   __m128i m[16];
+   __m128i v[16];
+   // m = the 16 message vectors of block, v[0..7] = current state S->h.
+   memcpy_128( m, block, 16 );
+   memcpy_128( v, S->h, 8 );
+   // v[8..15] = IV, with counter t and flags f XORed into v[12..15].
+   v[ 8] = _mm_set1_epi32( blake2s_IV[0] );
+   v[ 9] = _mm_set1_epi32( blake2s_IV[1] );
+   v[10] = _mm_set1_epi32( blake2s_IV[2] );
+   v[11] = _mm_set1_epi32( blake2s_IV[3] );
+   v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
+                          _mm_set1_epi32( blake2s_IV[4] ) );
+   v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
+                          _mm_set1_epi32( blake2s_IV[5] ) );
+   v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
+                          _mm_set1_epi32( blake2s_IV[6] ) );
+   v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
+                          _mm_set1_epi32( blake2s_IV[7] ) );
+// G mixing step applied to all lanes; rotation amounts are 16, 12, 8, 7.
+#define G4W(r,i,a,b,c,d) \
+do { \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
+   d = mm_rotr_32( _mm_xor_si128( d, a ), 16 ); \
+   c = _mm_add_epi32( c, d ); \
+   b = mm_rotr_32( _mm_xor_si128( b, c ), 12 ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
+   d = mm_rotr_32( _mm_xor_si128( d, a ),  8 ); \
+   c = _mm_add_epi32( c, d ); \
+   b = mm_rotr_32( _mm_xor_si128( b, c ),  7 ); \
+} while(0)
+// One round: G over the four columns, then over the four diagonals.
+#define ROUND4W(r)  \
+do { \
+   G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
+   G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
+   G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
+   G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
+   G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
+   G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
+   G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
+   G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+   // All 10 rounds, unrolled; round r uses row r of blake2s_sigma.
+   ROUND4W( 0 );
+   ROUND4W( 1 );
+   ROUND4W( 2 );
+   ROUND4W( 3 );
+   ROUND4W( 4 );
+   ROUND4W( 5 );
+   ROUND4W( 6 );
+   ROUND4W( 7 );
+   ROUND4W( 8 );
+   ROUND4W( 9 );
+   // Feed forward: h[i] ^= v[i] ^ v[i+8].
+   for( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );
+
+#undef G4W
+#undef ROUND4W
+   return 0;
+}
+
+// Absorb inlen bytes per lane of 4 way interleaved input, compressing each
+// time the 64 byte lane buffer fills. Copies are word granular, so inlen is
+// assumed to be a multiple of 4. Always returns 0.
+int blake2s_4way_update( blake2s_4way_state *S, const void *in,
+                         uint64_t inlen )
+{
+   __m128i *input = (__m128i*)in;
+   __m128i *buf = (__m128i*)S->buf;
+   const size_t bsize = BLAKE2S_BLOCKBYTES;
+   while( inlen > 0 )
+   {
+      const size_t left = S->buflen;
+      const size_t fill = bsize - left;   // bytes per lane still free in buf
+      if( inlen >= fill )
+      {
+         memcpy_128( buf + (left>>2), input, fill >> 2 );
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
+         blake2s_4way_compress( S, buf );
+         S->buflen = 0;
+         input += ( fill >> 2 );   // only fill bytes were consumed,
+         inlen -= fill;            // not a whole block
+      }
+      else
+      {
+         memcpy_128( buf + ( left>>2 ), input, inlen>>2 );
+         S->buflen += (size_t) inlen;
+         inlen = 0;
+      }
+   }
+   return 0; }
+
+// Finish the hash: add the buffered bytes to the counter, set the final
+// block flag(s), zero pad the buffer, compress it, and store the 8 state
+// vectors (4 interleaved digests) to out. outlen is not used. Returns 0.
+int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
+{
+   __m128i *block = (__m128i*)S->buf;
+   const size_t used = S->buflen;
+   S->t[0] += used;
+   S->t[1] += ( S->t[0] < used );
+   S->f[0] = ~0U;
+   if ( S->last_node )
+      S->f[1] = ~0U;
+   memset_zero_128( block + ( used>>2 ), ( BLAKE2S_BLOCKBYTES - used ) >> 2 );
+   blake2s_4way_compress( S, block );
+   for ( int word = 0; word < 8; ++word )
+      casti_m128i( out, word ) = S->h[ word ];
+   return 0;
+}
+
+#if defined(__AVX2__)
+
+int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
+{  // One BLAKE2s compression of a 64 byte block on 8 interleaved lanes.
+   __m256i m[16];
+   __m256i v[16];
+   // m = the 16 message vectors of block, v[0..7] = current state S->h.
+   memcpy_256( m, block, 16 );
+   memcpy_256( v, S->h, 8 );
+   // v[8..15] = IV, with counter t and flags f XORed into v[12..15].
+   v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
+   v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
+   v[10] = _mm256_set1_epi32( blake2s_IV[2] );
+   v[11] = _mm256_set1_epi32( blake2s_IV[3] );
+   v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
+                             _mm256_set1_epi32( blake2s_IV[4] ) );
+   v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
+                             _mm256_set1_epi32( blake2s_IV[5] ) );
+   v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
+                             _mm256_set1_epi32( blake2s_IV[6] ) );
+   v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
+                             _mm256_set1_epi32( blake2s_IV[7] ) );
+// G mixing step applied to all lanes; rotation amounts are 16, 12, 8, 7.
+#define G8W(r,i,a,b,c,d) \
+do { \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                          m[ blake2s_sigma[r][2*i+0] ] ); \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ), 16 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ), 12 ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                         m[ blake2s_sigma[r][2*i+1] ] ); \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ),  8 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ),  7 ); \
+} while(0)
+// One round: G over the four columns, then over the four diagonals.
+#define ROUND8W(r)  \
+do { \
+   G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
+   G8W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
+   G8W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
+   G8W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
+   G8W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
+   G8W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
+   G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
+   G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+   // All 10 rounds, unrolled; round r uses row r of blake2s_sigma.
+   ROUND8W( 0 );
+   ROUND8W( 1 );
+   ROUND8W( 2 );
+   ROUND8W( 3 );
+   ROUND8W( 4 );
+   ROUND8W( 5 );
+   ROUND8W( 6 );
+   ROUND8W( 7 );
+   ROUND8W( 8 );
+   ROUND8W( 9 );
+   // Feed forward: h[i] ^= v[i] ^ v[i+8].
+   for( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
+
+#undef G8W
+#undef ROUND8W
+   return 0;
+}
+
+// Set up S for an unkeyed 8 way BLAKE2s hash producing outlen bytes per
+// lane: every lane gets h[i] = IV[i] ^ word i of the parameter block, and
+// the counters, flags and buffer are cleared. Always returns 0.
+int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
+{
+   blake2s_nway_param P[1];
+   uint32_t p[8];
+
+   // Clear the whole block with memset: node_offset is only 6 bytes wide,
+   // so a 64 bit store into it would run past the field.
+   memset( P, 0, sizeof( P ) );
+   P->digest_length = outlen;
+   P->fanout        = 1;
+   P->depth         = 1;
+
+   // Read the packed 32 byte block as 8 words without an aliasing cast.
+   memcpy( p, P, sizeof( p ) );
+
+   memset( S, 0, sizeof( blake2s_8way_state ) );
+   /* IV XOR ParamBlock */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm256_xor_si256( _mm256_set1_epi32( blake2s_IV[i] ),
+                                  _mm256_set1_epi32( p[i] ) );
+
+   return 0;
+}
+
+// Absorb inlen bytes per lane of 8 way interleaved input, compressing each
+// time the 64 byte lane buffer fills. Copies are word granular, so inlen is
+// assumed to be a multiple of 4. Always returns 0.
+int blake2s_8way_update( blake2s_8way_state *S, const void *in,
+                         uint64_t inlen )
+{
+   __m256i *input = (__m256i*)in;
+   __m256i *buf = (__m256i*)S->buf;
+   const size_t bsize = BLAKE2S_BLOCKBYTES;
+   while( inlen > 0 )
+   {
+      const size_t left = S->buflen;
+      const size_t fill = bsize - left;   // bytes per lane still free in buf
+      if( inlen >= fill )
+      {
+         memcpy_256( buf + (left>>2), input, fill >> 2 );
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
+         blake2s_8way_compress( S, buf );
+         S->buflen = 0;
+         input += ( fill >> 2 );   // only fill bytes were consumed,
+         inlen -= fill;            // not a whole block
+      }
+      else
+      {
+         memcpy_256( buf + ( left>>2 ), input, inlen>>2 );
+         S->buflen += (size_t) inlen;
+         inlen = 0;
+      }
+   }
+   return 0; }
+
+// Finish the hash: add the buffered bytes to the counter, set the final
+// block flag(s), zero pad the buffer, compress it, and store the 8 state
+// vectors (8 interleaved digests) to out. outlen is not used. Returns 0.
+int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
+{
+   __m256i *block = (__m256i*)S->buf;
+   const size_t used = S->buflen;
+   S->t[0] += used;
+   S->t[1] += ( S->t[0] < used );
+   S->f[0] = ~0U;
+   if ( S->last_node )
+      S->f[1] = ~0U;
+   memset_zero_256( block + ( used>>2 ), ( BLAKE2S_BLOCKBYTES - used ) >> 2 );
+   blake2s_8way_compress( S, block );
+   for ( int word = 0; word < 8; ++word )
+      casti_m256i( out, word ) = S->h[ word ];
+   return 0;
+}
+
+
+#endif // __AVX2__
+
+#if 0
+int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+{
+	blake2s_state S[1];
+
+	/* Verify parameters */
+	if ( NULL == in ) return -1;
+
+	if ( NULL == out ) return -1;
+
+	if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */
+
+	if( keylen > 0 )
+	{
+		if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+	}
+	else
+	{
+		if( blake2s_init( S, outlen ) < 0 ) return -1;
+	}
+
+	blake2s_update( S, ( uint8_t * )in, inlen );
+	blake2s_final( S, out, outlen );
+	return 0;
+}
+#endif
+
+#endif // __AVX__
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -0,0 +1,112 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+//#pragma once
+#ifndef __BLAKE2S_HASH_4WAY_H__
+#define __BLAKE2S_HASH_4WAY_H__ 1
+
+#if defined(__AVX__)
+
+#include "avxdefs.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+enum blake2s_constant   // size constants used by the N-way BLAKE2s structs below
+{
+   BLAKE2S_BLOCKBYTES = 64,   // per-lane block size; the state buffers are this times the lane count
+   BLAKE2S_OUTBYTES   = 32,   // presumably the digest size in bytes (not referenced in this header)
+   BLAKE2S_KEYBYTES   = 32,   // presumably the maximum key size in bytes (not referenced in this header)
+   BLAKE2S_SALTBYTES  = 8,    // length of the salt[] field of blake2s_nway_param
+   BLAKE2S_PERSONALBYTES = 8  // length of the personal[] field of blake2s_nway_param
+};
+
+#pragma pack(push, 1)
+typedef struct __blake2s_nway_param   // parameter block; byte-packed by the surrounding #pragma pack(1)
+{
+   uint8_t  digest_length; // running size: 1 byte
+   uint8_t  key_length;    // running size: 2
+   uint8_t  fanout;        // running size: 3
+   uint8_t  depth;         // running size: 4
+   uint32_t leaf_length;   // running size: 8
+   uint8_t  node_offset[6];// running size: 14
+   uint8_t  node_depth;    // running size: 15
+   uint8_t  inner_length;  // running size: 16
+   // uint8_t  reserved[0];
+   uint8_t  salt[BLAKE2S_SALTBYTES]; // running size: 24
+   uint8_t  personal[BLAKE2S_PERSONALBYTES];  // running size: 32 (whole struct)
+} blake2s_nway_param;
+#pragma pack(pop)
+
+ALIGN( 64 ) typedef struct __blake2s_4way_state   // 4-lane state; same field layout as blake2s_8way_state
+{
+   __m128i h[8];                             // eight 128-bit state vectors
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 4 ];   // room for one BLAKE2S_BLOCKBYTES block per lane
+   uint32_t t[2];                            // presumably the byte counter, as in the 8-way code; TODO confirm
+   uint32_t f[2];                            // presumably the finalization flags, as in the 8-way code
+   size_t   buflen;                          // presumably the number of bytes currently buffered
+   uint8_t  last_node;                       // presumably selects whether f[1] is set at finalization
+} blake2s_4way_state ;
+
+int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );   // NOTE(review): body not in this header
+int blake2s_4way_update( blake2s_4way_state *S, const void *in,   // presumably the 4-lane twin of blake2s_8way_update
+                         uint64_t inlen );
+int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );   // presumably the twin of blake2s_8way_final
+
+#if defined(__AVX2__)
+
+ALIGN( 64 ) typedef struct __blake2s_8way_state   // state shared by blake2s_8way_update/final
+{
+   __m256i h[8];                             // eight 256-bit state vectors; final copies them to the output
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 8 ];   // pending input; update/final address it as __m256i vectors
+   uint32_t t[2];                            // byte counter: t[0] is incremented, t[1] takes the carry
+   uint32_t f[2];                            // final sets f[0] to ~0U, and f[1] too when last_node is set
+   size_t   buflen;                          // bytes currently buffered; buflen>>2 is used as a vector index
+   uint8_t  last_node;                       // non-zero makes final also set f[1]
+} blake2s_8way_state ;
+
+int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
+int blake2s_8way_update( blake2s_8way_state *S, const void *in,   // in is read as __m256i vectors; returns 0
+                         uint64_t inlen );                        // inlen appears to be a per-lane byte count; TODO confirm
+int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );   // stores S->h[0..7] to out; returns 0, outlen unused
+
+#endif
+
+#if 0
+	// Simple API
+//	int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+
+	// Direct Hash Mining Helpers
+	#define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */
+	#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // __AVX__
+
+#endif
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -1,26 +1,29 @@
-#include "algo-gate-api.h"
+#include "blake2s-gate.h"

 #include <string.h>
 #include <stdint.h>

 #include "sph-blake2s.h"

-static __thread blake2s_state s_midstate;
-static __thread blake2s_state s_ctx;
+static __thread blake2s_state blake2s_ctx;
+//static __thread blake2s_state s_ctx;
 #define MIDLEN 76

-void blake2s_hash(void *output, const void *input)
+void blake2s_hash( void *output, const void *input )
 {
-	unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
-	blake2s_state blake2_ctx __attribute__ ((aligned (64)));
-
-	blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
-	blake2s_update(&blake2_ctx, input, 80);
-	blake2s_final(&blake2_ctx, hash, BLAKE2S_OUTBYTES);
+   unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
+   blake2s_state ctx __attribute__ ((aligned (64)));
+  
+   memcpy( &ctx, &blake2s_ctx, sizeof ctx );
+   blake2s_update( &ctx, input+64, 16 );
+ 
+//	blake2s_init(&ctx, BLAKE2S_OUTBYTES);
+//	blake2s_update(&ctx, input, 80);
+	blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );

 	memcpy(output, hash, 32);
 }
-
+/*
 static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 {
 	s_ctx.buflen = MIDLEN;
@@ -28,7 +31,7 @@ static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 	blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
 	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
 }
-
+*/
 int scanhash_blake2s(int thr_id, struct work *work,
 	uint32_t max_nonce, uint64_t *hashes_done)
 {
@@ -46,13 +49,12 @@ int scanhash_blake2s(int thr_id, struct work *work,
        swab32_array( endiandata, pdata, 20 );

 	// midstate
-	blake2s_init(&s_midstate, BLAKE2S_OUTBYTES);
-	blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
-	memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));
+	blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
+	blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );

 	do {
 		be32enc(&endiandata[19], n);
-		blake2s_hash_end(hash64, endiandata);
+		blake2s_hash( hash64, endiandata );
 		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
 			*hashes_done = n - first_nonce + 1;
 			pdata[19] = n;
@@ -67,7 +69,7 @@ int scanhash_blake2s(int thr_id, struct work *work,

 	return 0;
 }
-
+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blake2s_get_max64 ()
 {
@@ -81,4 +83,4 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->get_max64 = (void*)&blake2s_get_max64;
  return true;
 };
-
+*/
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -1,21 +1,22 @@
 #include "blakecoin-gate.h"
-
-#if defined (BLAKECOIN_4WAY)
-
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
 #include <memory.h>

-blake256r8_4way_context blakecoin_ctx;
+#if defined (BLAKECOIN_4WAY)
+
+blake256r8_4way_context blakecoin_4w_ctx;

 void blakecoin_4way_hash(void *state, const void *input)
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
     blake256r8_4way_context ctx;
-     memcpy( &ctx, &blakecoin_ctx, sizeof ctx );
+
+     memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
     blake256r8_4way( &ctx, input + (64<<2), 16 );
     blake256r8_4way_close( &ctx, vhash );
+
     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

@@ -31,58 +32,29 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-
-   if (opt_benchmark)
+   if ( opt_benchmark )
      HTarget = 0x7f;

-   // we need big endian data...
   swab32_array( edata, pdata, 20 );
-
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
-   blake256r8_4way_init( &blakecoin_ctx );
-   blake256r8_4way( &blakecoin_ctx, vdata, 64 );
+   blake256r8_4way_init( &blakecoin_4w_ctx );
+   blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );

   uint32_t *noncep = vdata + 76;   // 19*4
   do {
-      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep,    n   );
      be32enc( noncep +1, n+1 );
      be32enc( noncep +2, n+2 );
      be32enc( noncep +3, n+3 );
-
-      blakecoin_4way_hash( hash, vdata );
      pdata[19] = n;
+      blakecoin_4way_hash( hash, vdata );

-      if (  hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) ) 
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
-      {
-           found[2] = true;
-           num_found++;
-           nonces[2] = n+2;
-           work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
-      {
-           found[3] = true;
-           num_found++;
-           nonces[3] = n+3;
-           work_set_target_ratio( work, hash+24 );
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;

@@ -90,15 +62,76 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
             && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-
-   // workaround to prevent flood of hash reports when nonce range exhasuted
-   // and thread is spinning waiting for new work
-   if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
-   {
-      *hashes_done = 0;
-//      sleep(1);
-   }
-
+   return num_found;
+}
+
+#endif
+
+#if defined(BLAKECOIN_8WAY)
+
+blake256r8_8way_context blakecoin_8w_ctx;
+
+void blakecoin_8way_hash( void *state, const void *input )   // hash the tail of 8 interleaved headers from the saved context
+{
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));   // 8 words for each of 8 lanes, still interleaved
+     blake256r8_8way_context ctx;
+
+     memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );   // start from the context saved in blakecoin_8w_ctx
+     blake256r8_8way( &ctx, input + (64<<3), 16 );    // skip 64 bytes x 8 lanes, feed the next 16
+     blake256r8_8way_close( &ctx, vhash );
+
+     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,   // eight destinations, 32 bytes apart
+                              state+128, state+160, state+192, state+224,
+                              vhash, 256 );   // 256 is presumably the bit length per lane; TODO confirm
+}
+
+int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,   // scan nonces 8 at a time
+                         uint64_t *hashes_done )   // out: set to n - first_nonce + 1 on exit
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));   // 20 header words x 8 lanes, interleaved
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));     // 8 result words per lane; lane i starts at hash+(i<<3)
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];   // scanning starts at the nonce stored in word 19
+   uint32_t HTarget = ptarget[7];            // quick pre-filter threshold for word 7 of each hash
+   uint32_t _ALIGN(32) edata[20];            // byte-swapped copy of the header
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;          // found nonces are appended here
+   uint32_t *noncep = vdata + 152;   // 19*8
+   int num_found = 0;
+   if ( opt_benchmark )
+      HTarget = 0x7f;   // fixed threshold used instead of ptarget[7] in benchmark mode
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,   // the same header goes into all 8 lanes
+                                 edata, edata, edata, edata, 640 );
+   blake256r8_8way_init( &blakecoin_8w_ctx );
+   blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );   // pre-hash the first 64 bytes once, outside the loop
+
+   do {
+      be32enc( noncep,    n   );   // lane k gets nonce n+k at noncep+k
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      be32enc( noncep +4, n+4 );
+      be32enc( noncep +5, n+5 );
+      be32enc( noncep +6, n+6 );
+      be32enc( noncep +7, n+7 );
+      pdata[19] = n;   // record the base nonce of this batch in the work data
+      blakecoin_8way_hash( hash, vdata );
+
+      for ( int i = 0; i < 8; i++ )   // check each lane's 8-word result
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      {
+          nonces[ num_found++ ] = n+i;   // append the winning nonce for lane i
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;
+   } while ( (num_found == 0) && (n < max_nonce)   // stop on a find, at max_nonce, or on restart
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
   return num_found;
 }

--- a/algo/blake/blakecoin-gate.c
+++ b/algo/blake/blakecoin-gate.c
@@ -8,55 +8,21 @@ int64_t blakecoin_get_max64 ()
 //  return 0x3fffffLL;
 }

-// Blakecoin 4 way hashes so fast it runs out of nonces.
-// This is an attempt to solve this but the result may be
-// to rehash old nonces until new work is received.
-void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
-                     uint32_t *end_nonce_ptr, bool clean_job )
-{
-   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
- 
-//   if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
-//      algo_gate.stratum_gen_work( &stratum, g_work );
-
-   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) 
-   || ( *nonceptr >= *end_nonce_ptr )
-   || ( (  work->job_id != g_work->job_id ) && clean_job ) )
-/*
-   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
-      && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
-         || ( work->job_id != g_work->job_id ) ) )
-*/   
-   {
-     work_free( work );
-     work_copy( work, g_work );
-     *nonceptr = 0xffffffffU / opt_n_threads * thr_id;
-     if ( opt_randomize )
-       *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
-     *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; 
-// try incrementing the xnonce to chsnge the data
-//     for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
-   }
-   else
-       ++(*nonceptr);
-}
-
-
 // vanilla uses default gen merkle root, otherwise identical to blakecoin
 bool register_vanilla_algo( algo_gate_t* gate )
 {
-#if defined(BLAKECOIN_4WAY)
-//  four_way_not_tested();
+#if defined(BLAKECOIN_8WAY)
+  gate->scanhash  = (void*)&scanhash_blakecoin_8way;
+  gate->hash      = (void*)&blakecoin_8way_hash;
+
+#elif defined(BLAKECOIN_4WAY)
  gate->scanhash  = (void*)&scanhash_blakecoin_4way;
  gate->hash      = (void*)&blakecoin_4way_hash;
-//  gate->get_new_work = (void*)&bc4w_get_new_work;
-//  blakecoin_4way_init( &blake_4way_init_ctx );
 #else
  gate->scanhash = (void*)&scanhash_blakecoin;
  gate->hash     = (void*)&blakecoinhash;
-//  blakecoin_init( &blake_init_ctx );
 #endif
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&blakecoin_get_max64;
  return true;
 }
--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -1,12 +1,21 @@
 #ifndef __BLAKECOIN_GATE_H__
-#define __BLAKECOIN_GATE_H__
+#define __BLAKECOIN_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
+#if defined(__AVX__)
  #define BLAKECOIN_4WAY
 #endif
+#if defined(__AVX2__)
+  #define BLAKECOIN_8WAY
+#endif
+
+#if defined (BLAKECOIN_8WAY)
+void blakecoin_8way_hash(void *state, const void *input);
+int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif

 #if defined (BLAKECOIN_4WAY)
 void blakecoin_4way_hash(void *state, const void *input);
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -38,7 +38,6 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;

   // copy to buffer guaranteed to be aligned.
@@ -52,7 +51,6 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,

   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
   do {
-      found[0] = found[1] = found[2] = found[3] = false;
      * noncep    = n;
      *(noncep+1) = n+1;
      *(noncep+2) = n+2;
@@ -60,35 +58,11 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,

      decred_hash_4way( hash, vdata );

-      if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
      {
-          work_set_target_ratio( work, hash );
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          pdata[DECRED_NONCE_INDEX] = n;
-      }
-      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
-      {
-          work_set_target_ratio( work, hash+8 );
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-      }
-      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
-      {
-          work_set_target_ratio( work, hash+16 );
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-      }
-
-      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
-      {
-          work_set_target_ratio( work, hash+24 );
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
  } while ( (num_found == 0) && (n < max_nonce) 
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -111,12 +111,8 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
    const uint32_t first_nonce = pdata[19];
    const uint32_t Htarg = ptarget[7];
    uint32_t *nonces = work->nonces;
-    bool *found = work->nfound;
    int num_found = 0;
-    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-    uint32_t *noncep1 = vdata + 75;
-    uint32_t *noncep2 = vdata + 77;
-    uint32_t *noncep3 = vdata + 79;
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1

 //    uint32_t _ALIGN(32) hash64[8];
 //    uint32_t _ALIGN(32) endiandata[32];
@@ -150,47 +146,19 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
        {
           uint32_t mask = masks[m];
           do {
-              found[0] = found[1] = found[2] = found[3] = false;
-              be32enc( noncep0, n   );
-              be32enc( noncep1, n+1 );
-              be32enc( noncep2, n+2 );
-              be32enc( noncep3, n+3 );
+              be32enc( noncep,   n   );
+              be32enc( noncep+2, n+1 );
+              be32enc( noncep+4, n+2 );
+              be32enc( noncep+6, n+3 );

              pentablakehash_4way( hash, vdata );

-              // return immediately on nonce found, only one submit
-              if ( ( !(hash[7] & mask) ) && fulltest( hash, ptarget ) )
+              for ( int i = 0; i < 4; i++ )
+              if ( !( (hash+(i<<3))[7] & mask )
+                  && fulltest( hash+(i<<3), ptarget ) )
              {
-                  found[0] = true;
-                  num_found++;
-                  nonces[0] = n;
-                  pdata[19] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
-              }
-              if ( (! ((hash+8)[7] & mask) ) && fulltest( hash+8, ptarget ) )
-              {
-                  found[1] = true;
-                  num_found++;
-                  nonces[1] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
-              }
-              if ( ( !((hash+16)[7] & mask) ) && fulltest( hash+16, ptarget ) )
-              {
-                  found[2] = true;
-                  num_found++;
-                  nonces[2] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
-              }
-              if ( ( !((hash+24)[7] & mask) ) && fulltest( hash+24, ptarget ) )
-              {
-                  found[3] = true;
-                  num_found++;
-                  nonces[3] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
+                 nonces[ num_found++ ] = n+i;
+                 work_set_target_ratio( work, hash+(i<<3) );
              }
              n += 4;

--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -64,12 +64,8 @@ int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 76; // 19*4
-   uint32_t *noncep1 = vdata + 77;
-   uint32_t *noncep2 = vdata + 78;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 76; // 19*4

 /*
        uint32_t *pdata = work->data;
@@ -86,42 +82,19 @@ int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );

      myriad_4way_hash( hash, vdata );
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash+24 );
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
--- a/algo/hodl/hodl-wolf.c
+++ b/algo/hodl/hodl-wolf.c
@@ -150,6 +150,9 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
        int searchNumber = COMPARE_SIZE / opt_n_threads;
        int startLoc = threadNumber * searchNumber;

+        if ( opt_debug )   // cast below makes the argument type match the %lx conversion
+           applog( LOG_DEBUG,"Hash target= %08lx", (unsigned long)ptarget[7] );
+
        for(int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k++)
        {
           // copy data to first l2 cache
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -95,12 +95,8 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   uint32_t n = pdata[19];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1

   uint64_t htmax[] = {
 		0,
@@ -131,46 +127,20 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
      {
         uint32_t mask = masks[m];
         do {
-              found[0] = found[1] = found[2] = found[3] = false;
-              be32enc( noncep0, n   );
-              be32enc( noncep1, n+1 );
-              be32enc( noncep2, n+2 );
-              be32enc( noncep3, n+3 );
+              be32enc( noncep,   n   );
+              be32enc( noncep+2, n+1 );
+              be32enc( noncep+4, n+2 );
+              be32enc( noncep+6, n+3 );

              jha_hash_4way( hash, vdata );
              pdata[19] = n;

-              if ( ( !(hash[7] & mask) )
-                   && fulltest( hash, ptarget ) )
+              for ( int i = 0; i < 4; i++ )   // test each lane's 8-word hash
+              if ( !( (hash+(i<<3))[7] & mask )   // candidate only when no masked bit of word 7 is set
+                  && fulltest( hash+(i<<3), ptarget ) )
              {
-                 found[0] = true;
-                 num_found++;
-                 nonces[0] = n;
-                 work_set_target_ratio( work, hash );
-              }
-              if ( ( !((hash+8)[7] & mask) )
-                   && fulltest( hash+8, ptarget ) )
-              {
-                 found[1] = true;
-                 num_found++;
-                 nonces[1] = n+1;
-                 work_set_target_ratio( work, hash+8 );
-              }
-              if ( ( !((hash+16)[7] & mask) )
-                 && fulltest( hash+16, ptarget ) )
-              {
-                 found[2] = true;
-                 num_found++;
-                 nonces[2] = n+2;
-                 work_set_target_ratio( work, hash+16 );
-              }
-              if ( ( !((hash+24)[7] & mask) )
-                   && fulltest( hash+24, ptarget ) )
-              {
-                 found[3] = true;
-                 num_found++;
-                 nonces[3] = n+3;
-                 work_set_target_ratio( work, hash+24 );
+                 nonces[ num_found++ ] = n+i;
+                 work_set_target_ratio( work, hash+(i<<3) );
              }
              n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -32,12 +32,8 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
 //   const uint32_t Htarg = ptarget[7];
   uint32_t endiandata[20];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1

   for ( int i=0; i < 19; i++ ) 
      be32enc( &endiandata[i], pdata[i] );
@@ -46,42 +42,19 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+2, n+1 );
+      be32enc( noncep+4, n+2 );
+      be32enc( noncep+6, n+3 );
 	
      keccakhash_4way( hash, vdata );

-      if ( ( ( hash[7] & 0xFFFFFF00 ) == 0 )
-         && fulltest( hash, ptarget) )
+      for ( int i = 0; i < 4; i++ )
+      if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
+           && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          pdata[19] = n;
-      }
-      if ( ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 )
-         && fulltest( hash+8, ptarget) ) 
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-      }
-      if ( ( ( (hash+16) [7] & 0xFFFFFF00 ) == 0 )
-         && fulltest( hash+16, ptarget) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-      }
-      if ( ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 )
-         && fulltest( hash+24, ptarget) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;

--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -101,12 +101,8 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 76; // 19*4
-   uint32_t *noncep1 = vdata + 77;
-   uint32_t *noncep2 = vdata + 78;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 76; // 19*4

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -117,44 +113,21 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+     be32enc( noncep,   n   );
+     be32enc( noncep+1, n+1 );
+     be32enc( noncep+2, n+2 );
+     be32enc( noncep+3, n+3 );

-      allium_4way_hash( hash, vdata );
-      pdata[19] = n;
+     allium_4way_hash( hash, vdata );
+     pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
-      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash+24 );
-      }
-      n += 4;
+     for ( int i = 0; i < 4; i++ )
+     if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+     {
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
+     }
+     n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
                   && !work_restart[thr_id].restart);

--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -61,12 +61,8 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 76; // 19*4
-   uint32_t *noncep1 = vdata + 77;
-   uint32_t *noncep2 = vdata + 78;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep= vdata + 76; // 19*4

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;
@@ -79,42 +75,19 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
   lyra2h_4way_midstate( vdata );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );

      be32enc( &edata[19], n );
      lyra2h_4way_hash( hash, vdata );

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash+24 );
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -93,12 +93,8 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 76; // 19*4
-   uint32_t *noncep1 = vdata + 77;
-   uint32_t *noncep2 = vdata + 78;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 76; // 19*4

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -111,42 +107,19 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );

      lyra2rev2_4way_hash( hash, vdata );
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash+24 );
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -61,12 +61,8 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 76; // 19*4
-   uint32_t *noncep1 = vdata + 77;
-   uint32_t *noncep2 = vdata + 78;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 76; // 19*4

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;
@@ -79,42 +75,19 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
   lyra2z_4way_midstate( vdata );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );

      lyra2z_4way_hash( hash, vdata );
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash+24 );
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
@@ -126,3 +99,114 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

 #endif

+#if defined(LYRA2Z_8WAY)
+
+__thread uint64_t* lyra2z_8way_matrix;
+
+bool lyra2z_8way_thread_init()
+{  // Allocates this thread's 64-byte-aligned Lyra2 matrix; the pointer converts to false if _mm_malloc returns NULL.
+ return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
+}
+
+static __thread blake256_8way_context l2z_8way_blake_mid;
+
+void lyra2z_8way_midstate( const void* input )
+{  // Caches the Blake-256 state after the leading part of the input so lyra2z_8way_hash can resume from it.
+       blake256_8way_init( &l2z_8way_blake_mid );
+       blake256_8way( &l2z_8way_blake_mid, input, 64 );  // length 64 is presumably per lane -- confirm against blake256_8way
+}
+
+void lyra2z_8way_hash( void *state, const void *input )
+{    // 8-way lyra2z: 'input' is the 8-lane interleaved data, 'state' receives 8 consecutive 32-byte hashes.
+     uint32_t hash0[8] __attribute__ ((aligned (64)));
+     uint32_t hash1[8] __attribute__ ((aligned (64)));
+     uint32_t hash2[8] __attribute__ ((aligned (64)));
+     uint32_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t hash4[8] __attribute__ ((aligned (64)));
+     uint32_t hash5[8] __attribute__ ((aligned (64)));
+     uint32_t hash6[8] __attribute__ ((aligned (64)));
+     uint32_t hash7[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+     blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
+     // Resume from the midstate saved by lyra2z_8way_midstate and hash the remaining 16 bytes.
+     memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
+     blake256_8way( &ctx_blake, input + (64*8), 16 );
+     blake256_8way_close( &ctx_blake, vhash );
+     // Split the interleaved Blake output into one 256-bit digest per lane.
+     mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
+                              hash4, hash5, hash6, hash7, vhash, 256 );
+     // LYRA2Z runs one lane at a time, in place, reusing the per-thread matrix.
+     LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
+     // Lane i goes to byte offset 32*i; lanes 4..7 must come from hash4..hash7, not repeats of hash1..hash3.
+     memcpy( state,     hash0, 32 );
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{  // Scans nonces 8 per pass starting at work->data[19]; returns how many hits were stored in work->nonces.
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 152; // 19*8
+   // NOTE(review): Htarg above was read before this benchmark override of ptarget[7] -- confirm that is intended.
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+   // Big-endian encode the 19 header words that precede the nonce.
+   for ( int i=0; i < 19; i++ )
+      be32enc( &edata[i], pdata[i] );
+   // Replicate the header into all 8 lanes; the nonce word (index 19) is written per lane inside the loop.
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+   // The part of the header before the nonce is constant, so its Blake midstate is computed once.
+   lyra2z_8way_midstate( vdata );
+
+   do {
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );
+      be32enc( noncep+4, n+4 );
+      be32enc( noncep+5, n+5 );
+      be32enc( noncep+6, n+6 );
+      be32enc( noncep+7, n+7 );
+      // hash receives 8 results of 8 words each; lane i starts at hash + 8*i.
+      lyra2z_8way_hash( hash, vdata );
+      pdata[19] = n;
+      // Cheap compare on the top word first, then the full target test.
+      for ( int i = 0; i < 8; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      {
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;   // 8 nonces are consumed per pass, so the bound below leaves room for 8 (was 4)
+   } while ( (num_found == 0) && (n < max_nonce-8)
+                   && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+
+#endif
--- a/algo/lyra2/lyra2z-gate.c
+++ b/algo/lyra2/lyra2z-gate.c
@@ -8,7 +8,11 @@ void lyra2z_set_target( struct work* work, double job_diff )

 bool register_lyra2z_algo( algo_gate_t* gate )
 {
-#ifdef LYRA2Z_4WAY
+#if defined(LYRA2Z_8WAY)
+  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2z_8way;
+  gate->hash       = (void*)&lyra2z_8way_hash;
+#elif defined(LYRA2Z_4WAY)
  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_4way;
  gate->hash       = (void*)&lyra2z_4way_hash;
--- a/algo/lyra2/lyra2z-gate.h
+++ b/algo/lyra2/lyra2z-gate.h
@@ -1,17 +1,29 @@
 #ifndef LYRA2Z_GATE_H__
-#define LYRA2Z_GATE_H__
+#define LYRA2Z_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
+#if defined(__AVX__)
  #define LYRA2Z_4WAY
 #endif
+#if defined(__AVX2__)
+//  #define LYRA2Z_8WAY
+#endif


 #define LYRA2Z_MATRIX_SIZE  BLOCK_LEN_INT64 * 8 * 8 * 8

-#if defined(LYRA2Z_4WAY)
+#if defined(LYRA2Z_8WAY)
+
+void lyra2z_8way_hash( void *state, const void *input );
+
+int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+bool lyra2z_8way_thread_init();
+
+#elif defined(LYRA2Z_4WAY)

 void lyra2z_4way_hash( void *state, const void *input );

@@ -20,7 +32,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

 bool lyra2z_4way_thread_init();

-#endif
+#else

 void lyra2z_hash( void *state, const void *input );

@@ -31,3 +43,4 @@ bool lyra2z_thread_init();

 #endif

+#endif
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -42,7 +42,7 @@ inline void initState( uint64_t State[/*16*/] )
 {
 #if defined (__AVX2__)

-  __m256i *state = (__m256i*)State;
+  __m256i* state = (__m256i*)State;
 
  state[0] = _mm256_setzero_si256();
  state[1] = _mm256_setzero_si256();
@@ -53,7 +53,7 @@ inline void initState( uint64_t State[/*16*/] )

 #elif defined (__AVX__)

-  __m128i *state = (__m128i*)State;
+  __m128i* state = (__m128i*)State;

  state[0] = _mm_setzero_si128();
  state[1] = _mm_setzero_si128();
@@ -123,8 +123,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )

    const int len_m256i = len / 32;
    const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
-    __m256i *state = (__m256i*)State;
-    __m256i *out   = (__m256i*)Out;
+    __m256i* state = (__m256i*)State;
+    __m256i* out   = (__m256i*)Out;
    int i;

    //Squeezes full blocks
@@ -141,8 +141,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )

    const int len_m128i = len / 16;
    const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
-    __m128i *state = (__m128i*)State;
-    __m128i *out   = (__m128i*)Out;
+    __m128i* state = (__m128i*)State;
+    __m128i* out   = (__m128i*)Out;
    int i;

    //Squeezes full blocks
@@ -186,27 +186,19 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
 {
 #if defined (__AVX2__)

-    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
-    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
-    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
-    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
-    const __m256i *in = (const __m256i*)In;
+    __m256i* state = (__m256i*)State;
+    __m256i* in = (__m256i*)In;

-    state0 = _mm256_xor_si256( state0, in[0] );
-    state1 = _mm256_xor_si256( state1, in[1] );
-    state2 = _mm256_xor_si256( state2, in[2] );
+    state[0] = _mm256_xor_si256( state[0], in[0] );
+    state[1] = _mm256_xor_si256( state[1], in[1] );
+    state[2] = _mm256_xor_si256( state[2], in[2] );

-    LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );
-
-    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
-    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
-    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
-    _mm256_store_si256( casto_m256i( State, 3 ), state3 );
+    LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );

 #elif defined (__AVX__)

-    __m128i *state = (__m128i*)State;
-    const __m128i *in = (const __m128i*)In;
+    __m128i* state = (__m128i*)State;
+    __m128i* in    = (__m128i*)In;

    state[0] = _mm_xor_si128( state[0], in[0] );
    state[1] = _mm_xor_si128( state[1], in[1] );
@@ -253,26 +245,18 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
    //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
 #if defined (__AVX2__)

-    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
-    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
-    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
-    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
-    const __m256i *in = (const __m256i*)In;
+    __m256i* state = (__m256i*)State;
+    __m256i* in    = (__m256i*)In;

-    state0 = _mm256_xor_si256( state0, in[0] );
-    state1 = _mm256_xor_si256( state1, in[1] );
+    state[0] = _mm256_xor_si256( state[0], in[0] );
+    state[1] = _mm256_xor_si256( state[1], in[1] );

-    LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );
-
-    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
-    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
-    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
-    _mm256_store_si256( casto_m256i( State, 3 ), state3 );
+    LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );

 #elif defined (__AVX__)

-    __m128i *state = (__m128i*)State;
-    const __m128i *in = (const __m128i*)In;
+    __m128i* state = (__m128i*)State;
+    __m128i* in    = (__m128i*)In;

    state[0] = _mm_xor_si128( state[0], in[0] );
    state[1] = _mm_xor_si128( state[1], in[1] );
@@ -308,7 +292,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
 * @param state     The current state of the sponge
 * @param rowOut    Row to receive the data squeezed
 */
-inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
+inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
                                uint64_t nCols )
 {
    int i;
@@ -317,19 +301,24 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,

 #if defined (__AVX2__)

-    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
-    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
-    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
-    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
-    __m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    __m256i* state = (__m256i*)State;
+    __m256i  state0 = _mm256_load_si256(  state    );
+    __m256i  state1 = _mm256_load_si256( &state[1] );
+    __m256i  state2 = _mm256_load_si256( &state[2] );
+    __m256i  state3 = _mm256_load_si256( &state[3] );

-    __builtin_prefetch( out,    1, 0 );
-    __builtin_prefetch( out -2, 1, 0 );
-    __builtin_prefetch( out -4, 1, 0 );
+    __m256i* out   = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+
+    for ( i = 0; i < 9; i += 3)
+    {
+        _mm_prefetch( out - i,     _MM_HINT_T0 );
+        _mm_prefetch( out - i - 2, _MM_HINT_T0 );
+    }

    for ( i = 0; i < nCols; i++ )
    {
-       __builtin_prefetch( out -i-6, 1, 0 );
+       _mm_prefetch( out -  9, _MM_HINT_T0 );
+       _mm_prefetch( out - 11, _MM_HINT_T0 );
                   
       out[0] = state0;
       out[1] = state1;
@@ -341,14 +330,15 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
       LYRA_ROUND_AVX2( state0, state1, state2, state3 );
    }

-    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
-    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
-    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
-    _mm256_store_si256( casto_m256i( State, 3 ), state3 );
+   _mm256_store_si256( state,     state0 );
+   _mm256_store_si256( &state[1], state1 );
+   _mm256_store_si256( &state[2], state2 );
+   _mm256_store_si256( &state[3], state3 );
+

 #elif defined (__AVX__)

-    __m128i *state = (__m128i*)State;
+    __m128i* state = (__m128i*)State;
    __m128i  state0 = _mm_load_si128(  state    );
    __m128i  state1 = _mm_load_si128( &state[1] );
    __m128i  state2 = _mm_load_si128( &state[2] );
@@ -358,7 +348,7 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
    __m128i  state6 = _mm_load_si128( &state[6] );
    __m128i  state7 = _mm_load_si128( &state[7] );

-    __m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
+    __m128i* out   = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

    for ( i = 0; i < 6; i += 3)
    {
@@ -397,7 +387,7 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,

 #else

-    uint64_t *ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
+    uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]

    for ( i = 0; i < nCols; i++ )
    {
@@ -432,31 +422,37 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
 * @param rowIn		Row to feed the sponge
 * @param rowOut	Row to receive the sponge's output
 */
-inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
+inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
                               uint64_t *rowOut, uint64_t nCols )
 {
    int i;

 #if defined (__AVX2__)

-    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
-    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
-    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
-    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
-    const __m256i *in = (const __m256i*)rowIn;
-    __m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    __m256i* state = (__m256i*)State;
+    __m256i  state0 = _mm256_load_si256(  state    );
+    __m256i  state1 = _mm256_load_si256( &state[1] );
+    __m256i  state2 = _mm256_load_si256( &state[2] );
+    __m256i  state3 = _mm256_load_si256( &state[3] );

-    __builtin_prefetch( in,     0, 0 );
-    __builtin_prefetch( in  +2, 0, 0 );
-    __builtin_prefetch( in  +4, 0, 0 );
-    __builtin_prefetch( out,    1, 0 );
-    __builtin_prefetch( out -2, 1, 0 );
-    __builtin_prefetch( out -4, 1, 0 );
+    __m256i* in    = (__m256i*)rowIn;
+    __m256i* out   = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+
+    for ( i = 0; i < 9; i += 3)
+    {
+        _mm_prefetch( in  + i,     _MM_HINT_T0 );
+        _mm_prefetch( in  + i + 2, _MM_HINT_T0 );
+        _mm_prefetch( out - i,     _MM_HINT_T0 );
+        _mm_prefetch( out - i - 2, _MM_HINT_T0 );
+    }

    for ( i = 0; i < nCols; i++ )
    {
-         __builtin_prefetch( in  +i+6, 0, 0 );
-         __builtin_prefetch( out -i-6, 1, 0 );
+
+        _mm_prefetch( in  +  9, _MM_HINT_T0 );
+        _mm_prefetch( in  + 11, _MM_HINT_T0 );
+        _mm_prefetch( out -  9, _MM_HINT_T0 );
+        _mm_prefetch( out - 11, _MM_HINT_T0 );
 
         state0 = _mm256_xor_si256( state0, in[0] );
         state1 = _mm256_xor_si256( state1, in[1] );
@@ -474,14 +470,14 @@ inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
         out -= BLOCK_LEN_M256I;
    }

-    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
-    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
-    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
-    _mm256_store_si256( casto_m256i( State, 3 ), state3 );
+   _mm256_store_si256( state,     state0 );
+   _mm256_store_si256( &state[1], state1 );
+   _mm256_store_si256( &state[2], state2 );
+   _mm256_store_si256( &state[3], state3 );

 #elif defined (__AVX__)

-    __m128i *state = (__m128i*)State;
+    __m128i* state = (__m128i*)State;
    __m128i  state0 = _mm_load_si128(  state    );
    __m128i  state1 = _mm_load_si128( &state[1] );
    __m128i  state2 = _mm_load_si128( &state[2] );
@@ -491,8 +487,8 @@ inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
    __m128i  state6 = _mm_load_si128( &state[6] );
    __m128i  state7 = _mm_load_si128( &state[7] );

-    const __m128i *in = (const __m128i*)rowIn;
-    __m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
+    __m128i*  in   = (__m128i*)rowIn;
+    __m128i* out   = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

    for ( i = 0; i < 6; i += 3)
    {
@@ -544,8 +540,8 @@ inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,

 #else

-    const uint64_t *ptrWordIn = (const uint64_t*)rowIn;        //In Lyra2: pointer to prev
-    uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
+    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++ )
    {
@@ -604,7 +600,7 @@ inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
 * @param rowOut         Row receiving the output
 *
 */
-inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
+inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
                                   uint64_t *rowInOut, uint64_t *rowOut,
                                   uint64_t nCols )
 {
@@ -612,30 +608,35 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,

 #if defined (__AVX2__)

-    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
-    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
-    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
-    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
-    const __m256i *in = (const __m256i*)rowIn;
-    __m256i *inout = (__m256i*)rowInOut;
-    __m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
-    __m256i t0, t1, t2;
+    __m256i* state = (__m256i*)State;
+    __m256i  state0 = _mm256_load_si256(  state    );
+    __m256i  state1 = _mm256_load_si256( &state[1] );
+    __m256i  state2 = _mm256_load_si256( &state[2] );
+    __m256i  state3 = _mm256_load_si256( &state[3] );

-    __builtin_prefetch( in,       0, 0 );
-    __builtin_prefetch( in    +2, 0, 0 );
-    __builtin_prefetch( in    +4, 0, 0 );
-    __builtin_prefetch( inout,    1, 0 );
-    __builtin_prefetch( inout +2, 1, 0 );
-    __builtin_prefetch( inout +4, 1, 0 );
-    __builtin_prefetch( out,      1, 0 );
-    __builtin_prefetch( out   -2, 1, 0 );
-    __builtin_prefetch( out   -4, 1, 0 );
+    __m256i* in    = (__m256i*)rowIn;
+    __m256i* inout = (__m256i*)rowInOut;
+    __m256i* out   = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    __m256i  t0, t1, t2;
+
+    for ( i = 0; i < 9; i += 3)
+    {
+        _mm_prefetch( in    + i,     _MM_HINT_T0 );
+        _mm_prefetch( in    + i + 2, _MM_HINT_T0 );
+        _mm_prefetch( inout + i,     _MM_HINT_T0 );
+        _mm_prefetch( inout + i + 2, _MM_HINT_T0 );
+        _mm_prefetch( out   - i,     _MM_HINT_T0 );
+        _mm_prefetch( out   - i - 2, _MM_HINT_T0 );
+    }

    for ( i = 0; i < nCols; i++ )
    {
-       __builtin_prefetch( in    +i+6, 0, 0 );
-       __builtin_prefetch( inout +i+6, 1, 0 );
-       __builtin_prefetch( out   -i-6, 1, 0 );
+       _mm_prefetch( in    +  9, _MM_HINT_T0 );
+       _mm_prefetch( in    + 11, _MM_HINT_T0 );
+       _mm_prefetch( inout +  9, _MM_HINT_T0 );
+       _mm_prefetch( inout + 11, _MM_HINT_T0 );
+       _mm_prefetch( out   -  9, _MM_HINT_T0 );
+       _mm_prefetch( out   - 11, _MM_HINT_T0 );

       state0 = _mm256_xor_si256( state0,
                                  _mm256_add_epi64( in[0], inout[0] ) );
@@ -669,16 +670,16 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
       out   -= BLOCK_LEN_M256I;
    }

-    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
-    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
-    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
-    _mm256_store_si256( casto_m256i( State, 3 ), state3 );
+   _mm256_store_si256( state,     state0 );
+   _mm256_store_si256( &state[1], state1 );
+   _mm256_store_si256( &state[2], state2 );
+   _mm256_store_si256( &state[3], state3 );

 #elif defined (__AVX__)

-    const __m128i *in = (const __m128i*)rowIn;
-    __m128i *inout = (__m128i*)rowInOut;
-    __m128i *out   = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
+    __m128i* in    = (__m128i*)rowIn;
+    __m128i* inout = (__m128i*)rowInOut;
+    __m128i* out   = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

    for ( i = 0; i < 6; i += 3)
    {
@@ -690,12 +691,12 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
        _mm_prefetch( out   - i - 2, _MM_HINT_T0 );
    }

-    __m128i *state = (__m128i*)State;
+    __m128i* state = (__m128i*)State;

    // For the last round in this function not optimized for AVX
-    const uint64_t *ptrWordIn = rowIn;        //In Lyra2: pointer to prev
-    uint64_t *ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
-    uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
+    uint64_t* ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
+    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++ )
    {
@@ -756,9 +757,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,

 #else

-    const uint64_t *ptrWordIn = (const uint64_t*)rowIn;        //In Lyra2: pointer to prev
-    uint64_t *ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
-    uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
+    uint64_t* ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
+    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++ )
    {
@@ -833,7 +834,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
 *
 */

-inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,
+inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
                              uint64_t *rowInOut, uint64_t *rowOut,
                              uint64_t nCols )
 {
@@ -841,30 +842,35 @@ inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,

 #if defined __AVX2__

-   register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
-   register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
-   register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
-   register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
-   const __m256i* in    = (const __m256i*)rowIn;
-   __m256i *inout = (__m256i*)rowInOut;
-   __m256i *out = (__m256i*)rowOut;
-   __m256i t0, t1, t2;
+   __m256i* state = (__m256i*)State;
+   __m256i  state0 = _mm256_load_si256(  state    );
+   __m256i  state1 = _mm256_load_si256( &state[1] );
+   __m256i  state2 = _mm256_load_si256( &state[2] );
+   __m256i  state3 = _mm256_load_si256( &state[3] );

-   __builtin_prefetch( in,        0, 0 );
-   __builtin_prefetch( in     +2, 0, 0 );
-   __builtin_prefetch( in     +4, 0, 0 );
-   __builtin_prefetch( inout,     1, 0 );
-   __builtin_prefetch( inout  +2, 1, 0 );
-   __builtin_prefetch( inout  +4, 1, 0 );
-   __builtin_prefetch( out,       1, 0 );
-   __builtin_prefetch( out    +2, 1, 0 );
-   __builtin_prefetch( out    +4, 1, 0 );
+   __m256i* in    = (__m256i*)rowIn;
+   __m256i* inout = (__m256i*)rowInOut;
+   __m256i* out   = (__m256i*)rowOut;
+   __m256i  t0, t1, t2;
+
+   for ( i = 0; i < 9; i += 3)
+   {
+       _mm_prefetch( in    + i,     _MM_HINT_T0 );
+       _mm_prefetch( in    + i + 2, _MM_HINT_T0 );
+       _mm_prefetch( out   + i,     _MM_HINT_T0 );
+       _mm_prefetch( out   + i + 2, _MM_HINT_T0 );
+       _mm_prefetch( inout + i,     _MM_HINT_T0 );
+       _mm_prefetch( inout + i + 2, _MM_HINT_T0 );
+   }

   for ( i = 0; i < nCols; i++ )
   {
-      __builtin_prefetch( in    +i+6, 0, 0 );
-      __builtin_prefetch( inout +i+6, 1, 0 );
-      __builtin_prefetch( out   +i+6, 1, 0 );
+      _mm_prefetch( in    +  9, _MM_HINT_T0 );
+      _mm_prefetch( in    + 11, _MM_HINT_T0 );
+      _mm_prefetch( out   +  9, _MM_HINT_T0 );
+      _mm_prefetch( out   + 11, _MM_HINT_T0 );
+      _mm_prefetch( inout +  9, _MM_HINT_T0 );
+      _mm_prefetch( inout + 11, _MM_HINT_T0 );

      //Absorbing "M[prev] [+] M[row*]"
      state0 = _mm256_xor_si256( state0,
@@ -900,17 +906,17 @@ inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,
       inout += BLOCK_LEN_M256I;
   }

-    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
-    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
-    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
-    _mm256_store_si256( casto_m256i( State, 3 ), state3 );
+   _mm256_store_si256( state,     state0 );
+   _mm256_store_si256( &state[1], state1 );
+   _mm256_store_si256( &state[2], state2 );
+   _mm256_store_si256( &state[3], state3 );

 #elif defined __AVX__

-    __m128i *state = (__m128i*)State;
-    const __m128i *in = (const __m128i*)rowIn;
-    __m128i *inout = (__m128i*)rowInOut;
-    __m128i *out   = (__m128i*)rowOut;
+    __m128i* state = (__m128i*)State;
+    __m128i* in    = (__m128i*)rowIn;
+    __m128i* inout = (__m128i*)rowInOut;
+    __m128i* out   = (__m128i*)rowOut;

    for ( i = 0; i < 6; i += 3)
    {
@@ -923,9 +929,9 @@ inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,
    }

    // for the last round in this function that isn't optimized for AVX
-    uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
-    const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
-    uint64_t *ptrWordOut = rowOut; //In Lyra2: pointer to row
+    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
+    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
+    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++)
    {
@@ -991,9 +997,9 @@ inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,

 #else

-    uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
-    const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
-    uint64_t *ptrWordOut = rowOut; //In Lyra2: pointer to row
+    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
+    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
+    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++)
    {
--- a/algo/lyra2/sponge.c.bak
+++ b/algo/lyra2/sponge.c.bak
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -159,26 +159,23 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){


 //---- Housekeeping
-void initState( uint64_t state[/*16*/] );
+void initState(uint64_t state[/*16*/]);

 //---- Squeezes
-void squeeze( uint64_t *state, unsigned char *out, unsigned int len );
-void reducedSqueezeRow0( uint64_t* state, uint64_t* row, uint64_t nCols );
+void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
+void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);

 //---- Absorbs
-void absorbBlock( uint64_t *state, const uint64_t *in );
-void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in );
+void absorbBlock(uint64_t *state, const uint64_t *in);
+void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);

 //---- Duplexes
-void reducedDuplexRow1( uint64_t *state, const uint64_t *rowIn,
-                        uint64_t *rowOut, uint64_t nCols);
-void reducedDuplexRowSetup( uint64_t *state, const uint64_t *rowIn,
-                     uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
-void reducedDuplexRow( uint64_t *state, const uint64_t *rowIn,
-                     uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
+void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

 //---- Misc
-//void printArray(unsigned char *array, unsigned int size, char *name);
+void printArray(unsigned char *array, unsigned int size, char *name);

 ////////////////////////////////////////////////////////////////////////////////////////////////

--- a/algo/lyra2/sponge.h.bak
+++ b/algo/lyra2/sponge.h.bak
@@ -0,0 +1,196 @@
+/**
+ * Header file for Blake2b's internal permutation in the form of a sponge.
+ * This code is based on the original Blake2b's implementation provided by
+ * Samuel Neves (https://blake2.net/)
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPONGE_H_
+#define SPONGE_H_
+
+#include <stdint.h>
+#include "avxdefs.h"
+
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+
+/*Blake2b IV Array*/
+static const uint64_t blake2b_IV[8] =
+{
+  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+
+/*Blake2b's rotation*/
+static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
+    return ( w >> c ) | ( w << ( 64 - c ) );
+}
+
+#if defined __AVX2__
+// only available with avx2
+
+// process 4 columns in parallel
+// returns void, updates all args
+#define G_4X64(a,b,c,d) \
+   a = _mm256_add_epi64( a, b ); \
+   d = mm256_rotr_64( _mm256_xor_si256( d, a), 32 ); \
+   c = _mm256_add_epi64( c, d ); \
+   b = mm256_rotr_64( _mm256_xor_si256( b, c ), 24 ); \
+   a = _mm256_add_epi64( a, b ); \
+   d = mm256_rotr_64( _mm256_xor_si256( d, a ), 16 ); \
+   c = _mm256_add_epi64( c, d ); \
+   b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 );
+
+#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   G_4X64( s0, s1, s2, s3 ); \
+   s1 = mm256_rotr256_1x64( s1); \
+   s2 = mm256_swap_128( s2 ); \
+   s3 = mm256_rotl256_1x64( s3 ); \
+   G_4X64( s0, s1, s2, s3 ); \
+   s1 = mm256_rotl256_1x64( s1 ); \
+   s2 = mm256_swap_128( s2 ); \
+   s3 = mm256_rotr256_1x64( s3 );
+
+#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+
+#else
+// only available with avx
+
+// process 2 columns in parallel
+// returns void, all args updated
+#define G_2X64(a,b,c,d) \
+   a = _mm_add_epi64( a, b ); \
+   d = mm_rotr_64( _mm_xor_si128( d, a), 32 ); \
+   c = _mm_add_epi64( c, d ); \
+   b = mm_rotr_64( _mm_xor_si128( b, c ), 24 ); \
+   a = _mm_add_epi64( a, b ); \
+   d = mm_rotr_64( _mm_xor_si128( d, a ), 16 ); \
+   c = _mm_add_epi64( c, d ); \
+   b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );
+
+#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   G_2X64( s0, s2, s4, s6 ); \
+   G_2X64( s1, s3, s5, s7 ); \
+   mm_rotl256_1x64( s2, s3 ); \
+   mm_swap_128( s4, s5 ); \
+   mm_rotr256_1x64( s6, s7 ); \
+   G_2X64( s0, s2, s4, s6 ); \
+   G_2X64( s1, s3, s5, s7 ); \
+   mm_rotr256_1x64( s2, s3 ); \
+   mm_swap_128( s4, s5 ); \
+   mm_rotl256_1x64( s6, s7 );
+
+#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+
+
+#endif // AVX2
+
+// Scalar
+//Blake2b's G function
+#define G(r,i,a,b,c,d) \
+  do { \
+    a = a + b; \
+    d = rotr64(d ^ a, 32); \
+    c = c + d; \
+    b = rotr64(b ^ c, 24); \
+    a = a + b; \
+    d = rotr64(d ^ a, 16); \
+    c = c + d; \
+    b = rotr64(b ^ c, 63); \
+  } while(0)
+
+
+/*One Round of the Blake2b's compression function*/
+#define ROUND_LYRA(r)  \
+    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
+
+
+//---- Housekeeping
+void initState( uint64_t state[/*16*/] );
+
+//---- Squeezes
+void squeeze( uint64_t *state, unsigned char *out, unsigned int len );
+void reducedSqueezeRow0( uint64_t* state, uint64_t* row, uint64_t nCols );
+
+//---- Absorbs
+void absorbBlock( uint64_t *state, const uint64_t *in );
+void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in );
+
+//---- Duplexes
+void reducedDuplexRow1( uint64_t *state, const uint64_t *rowIn,
+                        uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup( uint64_t *state, const uint64_t *rowIn,
+                     uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
+void reducedDuplexRow( uint64_t *state, const uint64_t *rowIn,
+                     uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
+
+//---- Misc
+//void printArray(unsigned char *array, unsigned int size, char *name);
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////TESTS////
+//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
+//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+/////////////
+
+
+#endif /* SPONGE_H_ */
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -79,12 +79,8 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
     const uint32_t first_nonce = pdata[19];
     const uint32_t Htarg = ptarget[7];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1

     uint64_t htmax[] = {          0,
                                 0xF,
@@ -117,47 +113,21 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
           uint32_t mask = masks[m];

           do {
-              found[0] = found[1] = found[2] = found[3] = false;
-              be32enc( noncep0, n   );
-              be32enc( noncep1, n+1 );
-              be32enc( noncep2, n+2 );
-              be32enc( noncep3, n+3 );
+              be32enc( noncep,   n   );
+              be32enc( noncep+2, n+1 );
+              be32enc( noncep+4, n+2 );
+              be32enc( noncep+6, n+3 );

              nist5hash_4way( hash, vdata );

              pdata[19] = n;

-              if ( ( !(hash[7] & mask) )
-                   && fulltest( hash, ptarget ) ) 
+              for ( int i = 0; i < 4; i++ )   // 8 hash words per lane
+              if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )   // masked bits clear
+                 && fulltest( hash+(i<<3), ptarget ) )
              {
-                 found[0] = true;
-                 num_found++;
-                 nonces[0] = n; 
-                 work_set_target_ratio( work, hash );
-              }
-              if ( ( !((hash+8)[7] & mask) )
-                   && fulltest( hash+8, ptarget ) )
-              {
-                 found[1] = true;
-                 num_found++;
-                 nonces[1] = n+1;
-                 work_set_target_ratio( work, hash+8 );
-              }
-              if ( ( !((hash+16)[7] & mask) )
-                 && fulltest( hash+16, ptarget ) )
-              {
-                 found[2] = true;
-                 num_found++;
-                 nonces[2] = n+2;
-                 work_set_target_ratio( work, hash+16 );
-              }
-              if ( ( !((hash+24)[7] & mask) )
-                   && fulltest( hash+24, ptarget ) )
-              {
-                 found[3] = true;
-                 num_found++;
-                 nonces[3] = n+3;
-                 work_set_target_ratio( work, hash+24 );
+                 nonces[ num_found++ ] = n+i;
+                 work_set_target_ratio( work, hash+(i<<3) );
              }
              n += 4;
           } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -145,12 +145,8 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    uint32_t *nonces = work->nonces;
-    bool *found = work->nfound;
    int num_found = 0;
-    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-    uint32_t *noncep1 = vdata + 75;
-    uint32_t *noncep2 = vdata + 77;
-    uint32_t *noncep3 = vdata + 79;
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
@@ -181,42 +177,20 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,

          do
          {
-              found[0] = found[1] = found[2] = found[3] = false;
-              be32enc( noncep0, n   );
-              be32enc( noncep1, n+1 );
-              be32enc( noncep2, n+2 );
-              be32enc( noncep3, n+3 );
+             be32enc( noncep,   n   );
+             be32enc( noncep+2, n+1 );
+             be32enc( noncep+4, n+2 );
+             be32enc( noncep+6, n+3 );

-              anime_4way_hash( hash, vdata );
-              pdata[19] = n;
+             anime_4way_hash( hash, vdata );
+             pdata[19] = n;

-             if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) 
+             for ( int i = 0; i < 4; i++ )
+             if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                && fulltest( hash+(i<<3), ptarget ) )
             {
-                found[0] = true;
-                num_found++;
-                nonces[0] = n;
-                work_set_target_ratio( work, hash );
-             }
-             if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-             {
-                found[1] = true;
-                num_found++;
-                nonces[1] = n+1;
-                work_set_target_ratio( work, hash );
-             }
-             if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-             {
-                found[2] = true;
-                num_found++;
-                nonces[2] = n+2;
-                work_set_target_ratio( work, hash );
-             }
-             if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-             {
-                found[3] = true;
-                num_found++;
-                nonces[3] = n+3;
-                work_set_target_ratio( work, hash );
+                nonces[ num_found++ ] = n+i;
+                work_set_target_ratio( work, hash+(i<<3) );
             }
             n += 4;
          } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -145,12 +145,8 @@ int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    uint32_t *nonces = work->nonces;
-    bool *found = work->nfound;
    int num_found = 0;
-    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-    uint32_t *noncep1 = vdata + 75;
-    uint32_t *noncep2 = vdata + 77;
-    uint32_t *noncep3 = vdata + 79;
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1

    swab32_array( endiandata, pdata, 20 );

@@ -159,42 +155,20 @@ int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,

    do
    {
-       found[0] = found[1] = found[2] = found[3] = false;
-       be32enc( noncep0, n   );
-       be32enc( noncep1, n+1 );
-       be32enc( noncep2, n+2 );
-       be32enc( noncep3, n+3 );
+       be32enc( noncep,   n   );
+       be32enc( noncep+2, n+1 );
+       be32enc( noncep+4, n+2 );
+       be32enc( noncep+6, n+3 );

       quark_4way_hash( hash, vdata );
       pdata[19] = n;

-       if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) ) 
+       for ( int i = 0; i < 4; i++ )
+       if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
+            && fulltest( hash+(i<<3), ptarget ) )
       {
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          work_set_target_ratio( work, hash );
-       }
-       if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
-       {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash );
-       }
-       if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
-       {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash );
-       }
-       if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
-       {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash );
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
       }
       n += 4;
    } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/qubit/deep-2way.c
+++ b/algo/qubit/deep-2way.c
@@ -74,10 +74,8 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 32+3;   // 4*8 + 3
-     uint32_t *noncep1 = vdata + 32+7;
+     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -98,24 +96,20 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
        uint32_t mask = masks[m];
        do
        {
-            found[0] = found[1] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+4, n+1 );
+
            deep_2way_hash( hash, vdata );
            pdata[19] = n;

            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
+               nonces[ num_found++ ] = n;
               work_set_target_ratio( work, hash );
            }
            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
+               nonces[ num_found++ ] = n+1;
               work_set_target_ratio( work, hash+8 );
            }
            n += 2;
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -80,10 +80,8 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 32+3;   // 4*8 + 3
-     uint32_t *noncep1 = vdata + 32+7;
+     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -93,7 +91,6 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
     // big endian encode 0..18 uint32_t, 64 bits at a time
     swab32_array( endiandata, pdata, 20 );

-
     uint64_t *edata = (uint64_t*)endiandata;
     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );

@@ -105,24 +102,20 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
        uint32_t mask = masks[m];
        do
        {
-           found[0] = found[1] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+4, n+1 );
            qubit_2way_hash( hash, vdata );
            pdata[19] = n;

+
            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
+               nonces[ num_found++ ] = n;
               work_set_target_ratio( work, hash );
            }
            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
+               nonces[ num_found++ ] = n+1;
               work_set_target_ratio( work, hash+8 );
            }
            n += 2;
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -1,7 +1,4 @@
 #include "lbry-gate.h"
-
-#if defined(LBRY_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,6 +6,140 @@
 #include "algo/sha/sha2-hash-4way.h"
 #include "ripemd-hash-4way.h"

+#define LBRY_INPUT_SIZE 112   // total input bytes: midstate part + tail
+#define LBRY_MIDSTATE    64   // leading bytes hashed once before the scan loop
+#define LBRY_TAIL ((LBRY_INPUT_SIZE) - (LBRY_MIDSTATE))   // rest, hashed per call
+
+#if defined(LBRY_8WAY)
+
+static __thread sha256_8way_context sha256_8w_mid;
+// 8-lane lbry hash: sha256 x2, sha512, ripemd160 on two 32 byte parts, sha256 x2.
+void lbry_8way_hash( void* output, const void* input )
+{
+   uint32_t _ALIGN(64) vhashA[16<<3];
+   uint32_t _ALIGN(64) vhashB[16<<3];
+   uint32_t _ALIGN(64) vhashC[16<<3];
+   uint32_t _ALIGN(32) h0[32];
+   uint32_t _ALIGN(32) h1[32];
+   uint32_t _ALIGN(32) h2[32];
+   uint32_t _ALIGN(32) h3[32];
+   uint32_t _ALIGN(32) h4[32];
+   uint32_t _ALIGN(32) h5[32];
+   uint32_t _ALIGN(32) h6[32];
+   uint32_t _ALIGN(32) h7[32];
+   sha256_8way_context     ctx_sha256 __attribute__ ((aligned (64)));
+   sha512_4way_context     ctx_sha512;
+   ripemd160_8way_context  ctx_ripemd;
+   // start from the context saved in sha256_8w_mid, then hash LBRY_TAIL bytes
+   memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
+   sha256_8way( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
+   sha256_8way_close( &ctx_sha256, vhashA );
+   // second sha256, over 32 bytes of the first result
+   sha256_8way_init( &ctx_sha256 );
+   sha256_8way( &ctx_sha256, vhashA, 32 );
+   sha256_8way_close( &ctx_sha256, vhashA );
+
+   // reinterleave to do sha512 4-way 64 bit twice.
+   mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
+   mm256_interleave_4x64( vhashA, h0, h1, h2, h3, 256 );
+   mm256_interleave_4x64( vhashB, h4, h5, h6, h7, 256 );
+   // sha512 over 32 bytes for h0..h3 (interleaved in vhashA)
+   sha512_4way_init( &ctx_sha512 );
+   sha512_4way( &ctx_sha512, vhashA, 32 );
+   sha512_4way_close( &ctx_sha512, vhashA );
+   // sha512 over 32 bytes for h4..h7 (interleaved in vhashB)
+   sha512_4way_init( &ctx_sha512 );
+   sha512_4way( &ctx_sha512, vhashB, 32 );
+   sha512_4way_close( &ctx_sha512, vhashB );
+
+   // back to 8-way 32 bit
+   mm256_deinterleave_4x64( h0, h1, h2, h3, vhashA, 512 );
+   mm256_deinterleave_4x64( h4, h5, h6, h7, vhashB, 512 );
+   mm256_interleave_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
+   // ripemd160 over 32 bytes at vhashA -> vhashB
+   ripemd160_8way_init( &ctx_ripemd );
+   ripemd160_8way( &ctx_ripemd, vhashA, 32 );
+   ripemd160_8way_close( &ctx_ripemd, vhashB );
+   // same, 8<<3 words further on (presumably the other sha512 half) -> vhashC
+   ripemd160_8way_init( &ctx_ripemd );
+   ripemd160_8way( &ctx_ripemd, vhashA+(8<<3), 32 );
+   ripemd160_8way_close( &ctx_ripemd, vhashC );
+   // sha256 over 20 bytes of vhashB followed by 20 bytes of vhashC
+   sha256_8way_init( &ctx_sha256 );
+   sha256_8way( &ctx_sha256, vhashB, 20 );
+   sha256_8way( &ctx_sha256, vhashC, 20 );
+   sha256_8way_close( &ctx_sha256, vhashA );
+   // final sha256, over 32 bytes of the previous result
+   sha256_8way_init( &ctx_sha256 );
+   sha256_8way( &ctx_sha256, vhashA, 32 );
+   sha256_8way_close( &ctx_sha256, vhashA );
+   // deinterleave into 8 results, each 32 bytes further into output
+   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
+                            output+128, output+160, output+192, output+224,
+                            vhashA, 256 );
+}
+
+// Scan nonces 8 at a time from work->data[27] until a lane passes fulltest()
+// against work->target, n reaches max_nonce, or a restart is flagged for
+// thr_id.  Returns the number of nonces stored in work->nonces and sets
+// *hashes_done to the number of nonces tried.
+int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done)
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[32*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[27];
+   const uint32_t first_nonce = pdata[27];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t edata[32] __attribute__ ((aligned (64)));
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 216; // 27*8
+   uint32_t mask = 0;   // 0 skips the prefilter: every lane goes to fulltest
+
+   const uint64_t htmax[] = {          0,        0xF,       0xFF,
+                                   0xFFF,     0xFFFF, 0x10000000 };
+   const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                              0xFFFFF000, 0xFFFF0000,          0 };
+
+   // we need bigendian data...
+   swab32_array( edata, pdata, 32 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 1024 );
+   sha256_8way_init( &sha256_8w_mid );
+   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
+
+   // Use the first mask whose htmax covers Htarg.  Bound by element count:
+   // sizeof(masks) is a byte count and let m run past the end of both tables.
+   for ( int m = 0; m < (int)( sizeof(masks) / sizeof(masks[0]) ); m++ )
+      if ( Htarg <= htmax[m] ) { mask = masks[m]; break; }
+
+   do
+   {
+      for ( int i = 0; i < 8; i++ )
+         be32enc( noncep + i, n + i );
+
+      lbry_8way_hash( hash, vdata );
+
+      for ( int i = 0; i < 8; i++ )
+      if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
+      {
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n+=8;
+   } while ( ( num_found == 0 ) && ( n < max_nonce )
+                && !work_restart[thr_id].restart );
+
+   // NOTE(review): pdata[27] is never written back here - confirm callers cope.
+   *hashes_done = n - first_nonce;
+   return num_found;
+}
+
+#elif defined(LBRY_4WAY)
+
 static __thread sha256_4way_context sha256_mid;

 void lbry_4way_hash( void* output, const void* input )
@@ -21,7 +152,7 @@ void lbry_4way_hash( void* output, const void* input )
   uint32_t _ALIGN(64) vhashC[16<<2];

   memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );
-   sha256_4way( &ctx_sha256, input+(64<<2), 48 );
+   sha256_4way( &ctx_sha256, input + (LBRY_MIDSTATE<<2), LBRY_TAIL );
   sha256_4way_close( &ctx_sha256, vhashA );

   sha256_4way_init( &ctx_sha256 );
@@ -67,12 +198,8 @@ int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   uint32_t edata[32] __attribute__ ((aligned (64)));
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 108; // 27*4
-   uint32_t *noncep1 = vdata + 109;
-   uint32_t *noncep2 = vdata + 110;
-   uint32_t *noncep3 = vdata + 111;
+   uint32_t *noncep = vdata + 108; // 27*4

   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
@@ -83,47 +210,25 @@ int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
   swab32_array( edata, pdata, 32 );
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
   sha256_4way_init( &sha256_mid );
-   sha256_4way( &sha256_mid, vdata, 64 );
+   sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE );

   for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[m];
      do
      {
-         found[0] = found[1] = found[2] = found[3] = false;
-         be32enc( noncep0, n   );
-         be32enc( noncep1, n+1 );
-         be32enc( noncep2, n+2 );
-         be32enc( noncep3, n+3 );
+         be32enc( noncep,   n   );
+         be32enc( noncep+1, n+1 );
+         be32enc( noncep+2, n+2 );
+         be32enc( noncep+3, n+3 );
+
         lbry_4way_hash( hash, vdata );

-         if ( !( hash[7] & mask ) && fulltest( hash, ptarget ) )
+         for ( int i = 0; i < 4; i++ )
+         if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
         {
-            found[0] = true;
-            num_found++;
-            nonces[0] = pdata[27] = n;
-            work_set_target_ratio( work, hash );
-         }
-         if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget ) ) 
-         {
-            found[1] = true;
-            num_found++;
-            nonces[1] = n+1;
-            work_set_target_ratio( work, hash+8 );
-         }
-         if ( !( (hash+16)[7] & mask ) && fulltest( hash+16, ptarget ) ) 
-         {
-            found[2] = true;
-            num_found++;
-            nonces[2] = n+2;
-            work_set_target_ratio( work, hash+16 );
-         }
-         if ( !( (hash+24)[7] & mask ) && fulltest( hash+24, ptarget ) ) 
-         {
-            found[3] = true;
-            num_found++;
-            nonces[3] = n+3;
-            work_set_target_ratio( work, hash+24 );
+            nonces[ num_found++ ] = n+i;
+            work_set_target_ratio( work, hash+(i<<3) );
         }
         n+=4;
      } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -73,7 +73,10 @@ int64_t lbry_get_max64() { return 0x1ffffLL; }
 bool register_lbry_algo( algo_gate_t* gate )
 {
  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
-#if defined (LBRY_4WAY)
+#if defined (LBRY_8WAY)
+  gate->scanhash              = (void*)&scanhash_lbry_8way;
+  gate->hash                  = (void*)&lbry_8way_hash;
+#elif defined (LBRY_4WAY)
  gate->scanhash              = (void*)&scanhash_lbry_4way;
  gate->hash                  = (void*)&lbry_4way_hash;
 #else 
--- a/algo/ripemd/lbry-gate.h
+++ b/algo/ripemd/lbry-gate.h
@@ -4,8 +4,9 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

+// need sha512 2 way AVX x2 or 1 way scalar x4 to support 4way AVX.
 #if defined(__AVX2__)
-  #define LBRY_4WAY
+  #define LBRY_8WAY
 #endif

 #define LBRY_NTIME_INDEX 25
@@ -16,15 +17,21 @@

 bool register_lbry_algo( algo_gate_t* gate );

-#if defined(LBRY_4WAY)
+#if defined(LBRY_8WAY)
+
+void lbry_8way_hash( void *state, const void *input );
+int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#elif defined(LBRY_4WAY)

 void lbry_4way_hash( void *state, const void *input );
 int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );
-#endif
+#else

 void lbry_hash( void *state, const void *input );
 int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );
 #endif
-
+#endif
--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -5,25 +5,6 @@
 #include <stddef.h>
 #include <string.h>

-/*
- * Round functions for RIPEMD-128 and RIPEMD-160.
- */
-#define F1(x, y, z) \
-   _mm_xor_si128( _mm_xor_si128( x, y ), z )
-
-#define F2(x, y, z) \
-   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
-
-#define F3(x, y, z) \
-   _mm_xor_si128( _mm_or_si128( x, mm_not( y ) ), z )
-
-#define F4(x, y, z) \
-   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
-
-#define F5(x, y, z) \
-   _mm_xor_si128( x, _mm_or_si128( y, mm_not( z ) ) )
-
-
 static const uint32_t IV[5] =
 { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };

@@ -42,6 +23,23 @@ static const uint32_t IV[5] =
 #define K24  0x7A6D76E9
 #define K25  0x00000000

+// RIPEMD-160 4 way round functions (mm_not is presumably bitwise NOT)
+// F1: x ^ y ^ z
+#define F1(x, y, z) \
+   _mm_xor_si128( _mm_xor_si128( x, y ), z )
+// F2: ((y ^ z) & x) ^ z, i.e. y where x is set, else z
+#define F2(x, y, z) \
+   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
+// F3: (x | mm_not(y)) ^ z
+#define F3(x, y, z) \
+   _mm_xor_si128( _mm_or_si128( x, mm_not( y ) ), z )
+// F4: ((x ^ y) & z) ^ y, i.e. x where z is set, else y
+#define F4(x, y, z) \
+   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
+// F5: x ^ (y | mm_not(z))
+#define F5(x, y, z) \
+   _mm_xor_si128( x, _mm_or_si128( y, mm_not( z ) ) )
+
 #define RR(a, b, c, d, e, f, s, r, k) \
 do{ \
   a = _mm_add_epi32( mm_rotl_32( _mm_add_epi32( _mm_add_epi32( \
@@ -321,3 +319,304 @@ void ripemd160_4way_close( ripemd160_4way_context  *sc, void *dst )
 }

 #endif
+
+#if defined(__AVX2__)
+
+// RIPEMD-160 8 way
+
+// 256-bit versions of the five boolean round functions.  Each operates
+// bitwise on whole __m256i vectors.
+// NOTE(review): mm256_not and mm256_rotl_32 are defined outside this view;
+// the comments assume a bitwise NOT and a per-32-bit-lane rotate left.
+
+// F8W_1: x ^ y ^ z
+#define F8W_1(x, y, z) \
+   _mm256_xor_si256( _mm256_xor_si256( x, y ), z )
+
+// F8W_2: ((y ^ z) & x) ^ z  -- takes y where x is set, z elsewhere
+#define F8W_2(x, y, z) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z )
+
+// F8W_3: (x | ~y) ^ z
+#define F8W_3(x, y, z) \
+   _mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z )
+
+// F8W_4: ((x ^ y) & z) ^ y  -- takes x where z is set, y elsewhere
+#define F8W_4(x, y, z) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y )
+
+// F8W_5: x ^ (y | ~z)
+#define F8W_5(x, y, z) \
+   _mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) )
+
+// One step on state words a..e:
+//    a = rotl( a + f(b,c,d) + r + k, s ) + e;   c = rotl( c, 10 );
+// r is the message vector, k a scalar constant broadcast to every lane,
+// s the rotation count.  a and c are assigned, so they must be lvalues.
+#define RR_8W(a, b, c, d, e, f, s, r, k) \
+do{ \
+   a = _mm256_add_epi32( mm256_rotl_32( _mm256_add_epi32( _mm256_add_epi32( \
+                _mm256_add_epi32( a, f( b ,c, d ) ), r ), \
+                                 _mm256_set1_epi32( k ) ), s ), e ); \
+   c = mm256_rotl_32( c, 10 );\
+} while (0)
+    
+// Step on the first state copy: pastes "1" onto the state names (A -> A1)
+// and "K1" onto the constant index k (1 -> K11).
+#define ROUND1_8W(a, b, c, d, e, f, s, r, k)  \
+        RR_8W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
+
+// Step on the second state copy: pastes "2" onto the state names (A -> A2)
+// and "K2" onto the constant index k (1 -> K21).
+#define ROUND2_8W(a, b, c, d, e, f, s, r, k)  \
+        RR_8W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
+
+// Run one compression over the 16 message vectors held in sc->buf and
+// update the five chaining vectors in sc->val in place.
+//
+// Two copies of the state are processed.  The ROUND1_8W steps update
+// A1..E1 with constants K11..K15 and round functions F8W_1 up to F8W_5;
+// the ROUND2_8W steps update A2..E2 with constants K21..K25 and round
+// functions F8W_5 down to F8W_1.  Both copies start from sc->val and are
+// folded back into it at the end.
+static void ripemd160_8way_round( ripemd160_8way_context *sc )
+{
+   const __m256i *in = (__m256i*)sc->buf;
+   __m256i *h  = (__m256i*)sc->val;
+   register __m256i A1, B1, C1, D1, E1;
+   register __m256i A2, B2, C2, D2, E2;
+   __m256i tmp;
+
+   // Both state copies start from the current chaining value.
+   A1 = A2 = h[0];
+   B1 = B2 = h[1];
+   C1 = C2 = h[2];
+   D1 = D2 = h[3];
+   E1 = E2 = h[4];
+
+   // First state copy (A1..E1): 5 groups of 16 steps.
+   ROUND1_8W( A, B, C, D, E, F8W_1, 11, in[ 0], 1 );
+   ROUND1_8W( E, A, B, C, D, F8W_1, 14, in[ 1], 1 );
+   ROUND1_8W( D, E, A, B, C, F8W_1, 15, in[ 2], 1 );
+   ROUND1_8W( C, D, E, A, B, F8W_1, 12, in[ 3], 1 );
+   ROUND1_8W( B, C, D, E, A, F8W_1,  5, in[ 4], 1 );
+   ROUND1_8W( A, B, C, D, E, F8W_1,  8, in[ 5], 1 );
+   ROUND1_8W( E, A, B, C, D, F8W_1,  7, in[ 6], 1 );
+   ROUND1_8W( D, E, A, B, C, F8W_1,  9, in[ 7], 1 );
+   ROUND1_8W( C, D, E, A, B, F8W_1, 11, in[ 8], 1 );
+   ROUND1_8W( B, C, D, E, A, F8W_1, 13, in[ 9], 1 );
+   ROUND1_8W( A, B, C, D, E, F8W_1, 14, in[10], 1 );
+   ROUND1_8W( E, A, B, C, D, F8W_1, 15, in[11], 1 );
+   ROUND1_8W( D, E, A, B, C, F8W_1,  6, in[12], 1 );
+   ROUND1_8W( C, D, E, A, B, F8W_1,  7, in[13], 1 );
+   ROUND1_8W( B, C, D, E, A, F8W_1,  9, in[14], 1 );
+   ROUND1_8W( A, B, C, D, E, F8W_1,  8, in[15], 1 );
+
+   ROUND1_8W( E, A, B, C, D, F8W_2,  7, in[ 7], 2 );
+   ROUND1_8W( D, E, A, B, C, F8W_2,  6, in[ 4], 2 );
+   ROUND1_8W( C, D, E, A, B, F8W_2,  8, in[13], 2 );
+   ROUND1_8W( B, C, D, E, A, F8W_2, 13, in[ 1], 2 );
+   ROUND1_8W( A, B, C, D, E, F8W_2, 11, in[10], 2 );
+   ROUND1_8W( E, A, B, C, D, F8W_2,  9, in[ 6], 2 );
+   ROUND1_8W( D, E, A, B, C, F8W_2,  7, in[15], 2 );
+   ROUND1_8W( C, D, E, A, B, F8W_2, 15, in[ 3], 2 );
+   ROUND1_8W( B, C, D, E, A, F8W_2,  7, in[12], 2 );
+   ROUND1_8W( A, B, C, D, E, F8W_2, 12, in[ 0], 2 );
+   ROUND1_8W( E, A, B, C, D, F8W_2, 15, in[ 9], 2 );
+   ROUND1_8W( D, E, A, B, C, F8W_2,  9, in[ 5], 2 );
+   ROUND1_8W( C, D, E, A, B, F8W_2, 11, in[ 2], 2 );
+   ROUND1_8W( B, C, D, E, A, F8W_2,  7, in[14], 2 );
+   ROUND1_8W( A, B, C, D, E, F8W_2, 13, in[11], 2 );
+   ROUND1_8W( E, A, B, C, D, F8W_2, 12, in[ 8], 2 );
+
+   ROUND1_8W( D, E, A, B, C, F8W_3, 11, in[ 3], 3 );
+   ROUND1_8W( C, D, E, A, B, F8W_3, 13, in[10], 3 );
+   ROUND1_8W( B, C, D, E, A, F8W_3,  6, in[14], 3 );
+   ROUND1_8W( A, B, C, D, E, F8W_3,  7, in[ 4], 3 );
+   ROUND1_8W( E, A, B, C, D, F8W_3, 14, in[ 9], 3 );
+   ROUND1_8W( D, E, A, B, C, F8W_3,  9, in[15], 3 );
+   ROUND1_8W( C, D, E, A, B, F8W_3, 13, in[ 8], 3 );
+   ROUND1_8W( B, C, D, E, A, F8W_3, 15, in[ 1], 3 );
+   ROUND1_8W( A, B, C, D, E, F8W_3, 14, in[ 2], 3 );
+   ROUND1_8W( E, A, B, C, D, F8W_3,  8, in[ 7], 3 );
+   ROUND1_8W( D, E, A, B, C, F8W_3, 13, in[ 0], 3 );
+   ROUND1_8W( C, D, E, A, B, F8W_3,  6, in[ 6], 3 );
+   ROUND1_8W( B, C, D, E, A, F8W_3,  5, in[13], 3 );
+   ROUND1_8W( A, B, C, D, E, F8W_3, 12, in[11], 3 );
+   ROUND1_8W( E, A, B, C, D, F8W_3,  7, in[ 5], 3 );
+   ROUND1_8W( D, E, A, B, C, F8W_3,  5, in[12], 3 );
+
+   ROUND1_8W( C, D, E, A, B, F8W_4, 11, in[ 1], 4 );
+   ROUND1_8W( B, C, D, E, A, F8W_4, 12, in[ 9], 4 );
+   ROUND1_8W( A, B, C, D, E, F8W_4, 14, in[11], 4 );
+   ROUND1_8W( E, A, B, C, D, F8W_4, 15, in[10], 4 );
+   ROUND1_8W( D, E, A, B, C, F8W_4, 14, in[ 0], 4 );
+   ROUND1_8W( C, D, E, A, B, F8W_4, 15, in[ 8], 4 );
+   ROUND1_8W( B, C, D, E, A, F8W_4,  9, in[12], 4 );
+   ROUND1_8W( A, B, C, D, E, F8W_4,  8, in[ 4], 4 );
+   ROUND1_8W( E, A, B, C, D, F8W_4,  9, in[13], 4 );
+   ROUND1_8W( D, E, A, B, C, F8W_4, 14, in[ 3], 4 );
+   ROUND1_8W( C, D, E, A, B, F8W_4,  5, in[ 7], 4 );
+   ROUND1_8W( B, C, D, E, A, F8W_4,  6, in[15], 4 );
+   ROUND1_8W( A, B, C, D, E, F8W_4,  8, in[14], 4 );
+   ROUND1_8W( E, A, B, C, D, F8W_4,  6, in[ 5], 4 );
+   ROUND1_8W( D, E, A, B, C, F8W_4,  5, in[ 6], 4 );
+   ROUND1_8W( C, D, E, A, B, F8W_4, 12, in[ 2], 4 );
+
+   ROUND1_8W( B, C, D, E, A, F8W_5,  9, in[ 4], 5 );
+   ROUND1_8W( A, B, C, D, E, F8W_5, 15, in[ 0], 5 );
+   ROUND1_8W( E, A, B, C, D, F8W_5,  5, in[ 5], 5 );
+   ROUND1_8W( D, E, A, B, C, F8W_5, 11, in[ 9], 5 );
+   ROUND1_8W( C, D, E, A, B, F8W_5,  6, in[ 7], 5 );
+   ROUND1_8W( B, C, D, E, A, F8W_5,  8, in[12], 5 );
+   ROUND1_8W( A, B, C, D, E, F8W_5, 13, in[ 2], 5 );
+   ROUND1_8W( E, A, B, C, D, F8W_5, 12, in[10], 5 );
+   ROUND1_8W( D, E, A, B, C, F8W_5,  5, in[14], 5 );
+   ROUND1_8W( C, D, E, A, B, F8W_5, 12, in[ 1], 5 );
+   ROUND1_8W( B, C, D, E, A, F8W_5, 13, in[ 3], 5 );
+   ROUND1_8W( A, B, C, D, E, F8W_5, 14, in[ 8], 5 );
+   ROUND1_8W( E, A, B, C, D, F8W_5, 11, in[11], 5 );
+   ROUND1_8W( D, E, A, B, C, F8W_5,  8, in[ 6], 5 );
+   ROUND1_8W( C, D, E, A, B, F8W_5,  5, in[15], 5 );
+   ROUND1_8W( B, C, D, E, A, F8W_5,  6, in[13], 5 );
+
+   // Second state copy (A2..E2): 5 groups of 16 steps, round functions
+   // used in the opposite order.
+   ROUND2_8W( A, B, C, D, E, F8W_5,  8, in[ 5], 1 );
+   ROUND2_8W( E, A, B, C, D, F8W_5,  9, in[14], 1 );
+   ROUND2_8W( D, E, A, B, C, F8W_5,  9, in[ 7], 1 );
+   ROUND2_8W( C, D, E, A, B, F8W_5, 11, in[ 0], 1 );
+   ROUND2_8W( B, C, D, E, A, F8W_5, 13, in[ 9], 1 );
+   ROUND2_8W( A, B, C, D, E, F8W_5, 15, in[ 2], 1 );
+   ROUND2_8W( E, A, B, C, D, F8W_5, 15, in[11], 1 );
+   ROUND2_8W( D, E, A, B, C, F8W_5,  5, in[ 4], 1 );
+   ROUND2_8W( C, D, E, A, B, F8W_5,  7, in[13], 1 );
+   ROUND2_8W( B, C, D, E, A, F8W_5,  7, in[ 6], 1 );
+   ROUND2_8W( A, B, C, D, E, F8W_5,  8, in[15], 1 );
+   ROUND2_8W( E, A, B, C, D, F8W_5, 11, in[ 8], 1 );
+   ROUND2_8W( D, E, A, B, C, F8W_5, 14, in[ 1], 1 );
+   ROUND2_8W( C, D, E, A, B, F8W_5, 14, in[10], 1 );
+   ROUND2_8W( B, C, D, E, A, F8W_5, 12, in[ 3], 1 );
+   ROUND2_8W( A, B, C, D, E, F8W_5,  6, in[12], 1 );
+
+   ROUND2_8W( E, A, B, C, D, F8W_4,  9, in[ 6], 2 );
+   ROUND2_8W( D, E, A, B, C, F8W_4, 13, in[11], 2 );
+   ROUND2_8W( C, D, E, A, B, F8W_4, 15, in[ 3], 2 );
+   ROUND2_8W( B, C, D, E, A, F8W_4,  7, in[ 7], 2 );
+   ROUND2_8W( A, B, C, D, E, F8W_4, 12, in[ 0], 2 );
+   ROUND2_8W( E, A, B, C, D, F8W_4,  8, in[13], 2 );
+   ROUND2_8W( D, E, A, B, C, F8W_4,  9, in[ 5], 2 );
+   ROUND2_8W( C, D, E, A, B, F8W_4, 11, in[10], 2 );
+   ROUND2_8W( B, C, D, E, A, F8W_4,  7, in[14], 2 );
+   ROUND2_8W( A, B, C, D, E, F8W_4,  7, in[15], 2 );
+   ROUND2_8W( E, A, B, C, D, F8W_4, 12, in[ 8], 2 );
+   ROUND2_8W( D, E, A, B, C, F8W_4,  7, in[12], 2 );
+   ROUND2_8W( C, D, E, A, B, F8W_4,  6, in[ 4], 2 );
+   ROUND2_8W( B, C, D, E, A, F8W_4, 15, in[ 9], 2 );
+   ROUND2_8W( A, B, C, D, E, F8W_4, 13, in[ 1], 2 );
+   ROUND2_8W( E, A, B, C, D, F8W_4, 11, in[ 2], 2 );
+
+   ROUND2_8W( D, E, A, B, C, F8W_3,  9, in[15], 3 );
+   ROUND2_8W( C, D, E, A, B, F8W_3,  7, in[ 5], 3 );
+   ROUND2_8W( B, C, D, E, A, F8W_3, 15, in[ 1], 3 );
+   ROUND2_8W( A, B, C, D, E, F8W_3, 11, in[ 3], 3 );
+   ROUND2_8W( E, A, B, C, D, F8W_3,  8, in[ 7], 3 );
+   ROUND2_8W( D, E, A, B, C, F8W_3,  6, in[14], 3 );
+   ROUND2_8W( C, D, E, A, B, F8W_3,  6, in[ 6], 3 );
+   ROUND2_8W( B, C, D, E, A, F8W_3, 14, in[ 9], 3 );
+   ROUND2_8W( A, B, C, D, E, F8W_3, 12, in[11], 3 );
+   ROUND2_8W( E, A, B, C, D, F8W_3, 13, in[ 8], 3 );
+   ROUND2_8W( D, E, A, B, C, F8W_3,  5, in[12], 3 );
+   ROUND2_8W( C, D, E, A, B, F8W_3, 14, in[ 2], 3 );
+   ROUND2_8W( B, C, D, E, A, F8W_3, 13, in[10], 3 );
+   ROUND2_8W( A, B, C, D, E, F8W_3, 13, in[ 0], 3 );
+   ROUND2_8W( E, A, B, C, D, F8W_3,  7, in[ 4], 3 );
+   ROUND2_8W( D, E, A, B, C, F8W_3,  5, in[13], 3 );
+
+   ROUND2_8W( C, D, E, A, B, F8W_2, 15, in[ 8], 4 );
+   ROUND2_8W( B, C, D, E, A, F8W_2,  5, in[ 6], 4 );
+   ROUND2_8W( A, B, C, D, E, F8W_2,  8, in[ 4], 4 );
+   ROUND2_8W( E, A, B, C, D, F8W_2, 11, in[ 1], 4 );
+   ROUND2_8W( D, E, A, B, C, F8W_2, 14, in[ 3], 4 );
+   ROUND2_8W( C, D, E, A, B, F8W_2, 14, in[11], 4 );
+   ROUND2_8W( B, C, D, E, A, F8W_2,  6, in[15], 4 );
+   ROUND2_8W( A, B, C, D, E, F8W_2, 14, in[ 0], 4 );
+   ROUND2_8W( E, A, B, C, D, F8W_2,  6, in[ 5], 4 );
+   ROUND2_8W( D, E, A, B, C, F8W_2,  9, in[12], 4 );
+   ROUND2_8W( C, D, E, A, B, F8W_2, 12, in[ 2], 4 );
+   ROUND2_8W( B, C, D, E, A, F8W_2,  9, in[13], 4 );
+   ROUND2_8W( A, B, C, D, E, F8W_2, 12, in[ 9], 4 );
+   ROUND2_8W( E, A, B, C, D, F8W_2,  5, in[ 7], 4 );
+   ROUND2_8W( D, E, A, B, C, F8W_2, 15, in[10], 4 );
+   ROUND2_8W( C, D, E, A, B, F8W_2,  8, in[14], 4 );
+
+   ROUND2_8W( B, C, D, E, A, F8W_1,  8, in[12], 5 );
+   ROUND2_8W( A, B, C, D, E, F8W_1,  5, in[15], 5 );
+   ROUND2_8W( E, A, B, C, D, F8W_1, 12, in[10], 5 );
+   ROUND2_8W( D, E, A, B, C, F8W_1,  9, in[ 4], 5 );
+   ROUND2_8W( C, D, E, A, B, F8W_1, 12, in[ 1], 5 );
+   ROUND2_8W( B, C, D, E, A, F8W_1,  5, in[ 5], 5 );
+   ROUND2_8W( A, B, C, D, E, F8W_1, 14, in[ 8], 5 );
+   ROUND2_8W( E, A, B, C, D, F8W_1,  6, in[ 7], 5 );
+   ROUND2_8W( D, E, A, B, C, F8W_1,  8, in[ 6], 5 );
+   ROUND2_8W( C, D, E, A, B, F8W_1, 13, in[ 2], 5 );
+   ROUND2_8W( B, C, D, E, A, F8W_1,  6, in[13], 5 );
+   ROUND2_8W( A, B, C, D, E, F8W_1,  5, in[14], 5 );
+   ROUND2_8W( E, A, B, C, D, F8W_1, 15, in[ 0], 5 );
+   ROUND2_8W( D, E, A, B, C, F8W_1, 13, in[ 3], 5 );
+   ROUND2_8W( C, D, E, A, B, F8W_1, 11, in[ 9], 5 );
+   ROUND2_8W( B, C, D, E, A, F8W_1, 11, in[11], 5 );
+
+   // Fold both copies into the chaining value.  Each new h[i] is built from
+   // the old h[i+1] (indices wrap), so the new h[0] is parked in tmp until
+   // the old h[0] has been consumed by the h[4] update.
+   tmp =  _mm256_add_epi32( _mm256_add_epi32( h[1], C1 ), D2 );
+   h[1] = _mm256_add_epi32( _mm256_add_epi32( h[2], D1 ), E2 );
+   h[2] = _mm256_add_epi32( _mm256_add_epi32( h[3], E1 ), A2 );
+   h[3] = _mm256_add_epi32( _mm256_add_epi32( h[4], A1 ), B2 );
+   h[4] = _mm256_add_epi32( _mm256_add_epi32( h[0], B1 ), C2 );
+   h[0] = tmp;
+}
+
+
+// Reset an 8-way RIPEMD-160 context: clear the byte counter and broadcast
+// each of the five IV words to every 32-bit lane of the matching chaining
+// vector.
+void ripemd160_8way_init( ripemd160_8way_context *sc )
+{
+   sc->count_low  = 0;
+   sc->count_high = 0;
+   for ( int w = 0; w < 5; w++ )
+      sc->val[w] = _mm256_set1_epi32( IV[w] );
+}
+
+// Absorb len bytes per lane of interleaved input into the context.
+//
+// data is read as an array of __m256i; every 4 bytes of len consume one
+// vector.  Whenever the 64-byte block buffer fills, it is compressed with
+// ripemd160_8way_round().  The byte count is kept in count_low, with the
+// carry propagated into count_high.
+// NOTE(review): sizes are shifted right by 2 before copying, so len is
+// presumably always a multiple of 4 - confirm against callers.
+void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len )
+{
+   const int block_size = 64;
+   __m256i *src = (__m256i*)data;
+   size_t fill = (unsigned)sc->count_low & (block_size - 1U);
+
+   while ( len != 0 )
+   {
+      // Take as much as fits in the remaining space of the block buffer.
+      const size_t room = block_size - fill;
+      const size_t take = ( room > len ) ? len : room;
+      uint32_t before, after;
+
+      memcpy_256( sc->buf + (fill>>2), src, take>>2 );
+      src  += take>>2;
+      fill += take;
+      len  -= take;
+
+      if ( fill == block_size )
+      {
+         ripemd160_8way_round( sc );
+         fill = 0;
+      }
+
+      // 64-bit byte counter kept as two 32-bit halves.
+      before = sc->count_low;
+      after  = before + take;
+      sc->count_low = after;
+      if ( after < before )
+         sc->count_high++;
+   }
+}
+
+// Finish the hash: append the 0x80 marker word, zero-fill, store the
+// message length in bits (low word first, then high word) in the last two
+// words of the block, compress, and copy the five chaining vectors to dst.
+//
+// dst receives 5 __m256i values, still interleaved.  The context is not
+// re-initialised here.
+void ripemd160_8way_close( ripemd160_8way_context  *sc, void *dst )
+{
+   const int block_size = 64;
+   const int pad = block_size - 8;
+   unsigned fill = (unsigned)sc->count_low & ( block_size - 1U );
+   uint32_t bits_lo, bits_hi;
+   unsigned w;
+
+   sc->buf[ fill>>2 ] = _mm256_set1_epi32( 0x80 );
+   fill += 4;
+
+   if ( fill <= pad )
+      memset_zero_256( sc->buf + (fill>>2), (pad - fill) >> 2 );
+   else
+   {
+      // No room left for the length: flush this block, start an empty one.
+      memset_zero_256( sc->buf + (fill>>2), (block_size - fill) >> 2 );
+      ripemd160_8way_round( sc );
+      memset_zero_256( sc->buf, pad>>2 );
+   }
+
+   // Convert the byte count to a 64-bit bit count split into two words.
+   bits_lo = sc->count_low;
+   bits_hi = (sc->count_high << 3) | (bits_lo >> 29);
+   bits_lo = bits_lo << 3;
+
+   sc->buf[  pad>>2      ] = _mm256_set1_epi32( bits_lo );
+   sc->buf[ (pad>>2) + 1 ] = _mm256_set1_epi32( bits_hi );
+   ripemd160_8way_round( sc );
+
+   for ( w = 0; w < 5; w++ )
+      casti_m256i( dst, w ) = sc->val[w];
+}
+
+#endif // __AVX2__
+
--- a/algo/ripemd/ripemd-hash-4way.h
+++ b/algo/ripemd/ripemd-hash-4way.h
@@ -19,5 +19,20 @@ void ripemd160_4way_init( ripemd160_4way_context *sc );
 void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len );
 void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );

-#endif
-#endif
+#if defined (__AVX2__)
+
+// State for the 8-way RIPEMD-160 hash.
+typedef struct
+{
+   __m256i buf[64>>2];              // block buffer: 16 vectors, one per 32-bit message word
+   __m256i val[5];                  // chaining value: 5 vectors
+   uint32_t count_high, count_low;  // byte count, split into two 32-bit halves
+} __attribute__ ((aligned (64))) ripemd160_8way_context;
+
+// Reset sc to the initial state.
+void ripemd160_8way_init( ripemd160_8way_context *sc );
+// Absorb len bytes (per lane) of interleaved data.
+void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len );
+// Pad, finish and write 5 __m256i result vectors to dst.
+void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
+
+
+#endif // __AVX2__
+#endif // __AVX__
+#endif // RIPEMD_HASH_4WAY_H__
--- a/algo/sha/sha2-hash-4way.c
+++ b/algo/sha/sha2-hash-4way.c
@@ -39,7 +39,7 @@

 #include <stdio.h>

-// SHA256 4 way 32 bit
+// SHA-256 32 bit

 static const sph_u32 H256[8] = {
        SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
@@ -83,6 +83,8 @@ static const sph_u32 K256[64] = {
        SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
 };

+// SHA-256 4 way
+
 #define SHA2s_MEXP( a, b, c, d ) \
     _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
                    SSG2_1( W[a] ), W[b] ), SSG2_0( W[c] ) ), W[d] );
@@ -291,13 +293,297 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
    sc->buf[ ( pad+4 ) >> 2 ] =
                 mm_bswap_32( _mm_set1_epi32( low ) );
    sha256_4way_round( sc->buf, sc->val );
+
    for ( u = 0; u < 8; u ++ )
       ((__m128i*)dst)[u] = mm_bswap_32( sc->val[u] );
 }

 #if defined(__AVX2__)

-// SHA512 4 way 64 bit
+// SHA-256 8 way
+
+// 256-bit SHA-256 helper macros.  Each operates on whole __m256i vectors.
+// NOTE(review): mm256_rotr_32 is defined outside this view; the comments
+// assume a per-32-bit-lane rotate right.
+
+// CHx: ((Y ^ Z) & X) ^ Z  -- takes Y where X is set, Z elsewhere
+#define CHx(X, Y, Z) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
+
+// MAJx: (X & Y) | ((X | Y) & Z)
+#define MAJx(X, Y, Z) \
+   _mm256_or_si256( _mm256_and_si256( X, Y ), \
+                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
+
+// BSG2_0x: rotr(x,2) ^ rotr(x,13) ^ rotr(x,22)
+#define BSG2_0x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_rotr_32(x,  2), mm256_rotr_32(x, 13) ), mm256_rotr_32( x, 22) )
+
+// BSG2_1x: rotr(x,6) ^ rotr(x,11) ^ rotr(x,25)
+#define BSG2_1x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_rotr_32(x,  6), mm256_rotr_32(x, 11) ), mm256_rotr_32( x, 25) )
+
+// SSG2_0x: rotr(x,7) ^ rotr(x,18) ^ (x >> 3)
+#define SSG2_0x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_rotr_32(x,  7), mm256_rotr_32(x, 18) ), _mm256_srli_epi32(x, 3) )
+
+// SSG2_1x: rotr(x,17) ^ rotr(x,19) ^ (x >> 10)
+#define SSG2_1x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_rotr_32(x, 17), mm256_rotr_32(x, 19) ), _mm256_srli_epi32(x, 10) )
+
+// Message schedule expansion over the array W that must be in scope:
+//    SSG2_1x( W[a] ) + W[b] + SSG2_0x( W[c] ) + W[d]
+// Expands to an expression; the caller supplies the terminating semicolon.
+#define SHA2x_MEXP( a, b, c, d ) \
+     _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
+                    SSG2_1x( W[a] ), W[b] ), SSG2_0x( W[c] ) ), W[d] )
+
+// One SHA-256 step.  Requires the arrays W and K256 in scope.
+//    T1 = H + BSG2_1x(E) + CHx(E,F,G) + K256[j+i] + W[i]
+//    T2 = BSG2_0x(A) + MAJx(A,B,C)
+//    D += T1;  H = T1 + T2
+// Only D and H are assigned; callers rotate the argument order instead of
+// moving the working variables.
+#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
+do { \
+  register __m256i T1, T2; \
+  T1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
+       _mm256_add_epi32( H, BSG2_1x(E) ), CHx(E, F, G) ), \
+                          _mm256_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
+  T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
+  D  = _mm256_add_epi32( D,  T1 ); \
+  H  = _mm256_add_epi32( T1, T2 ); \
+} while (0)
+
+// Run one SHA-256 compression on eight interleaved lanes.
+//
+// in : 16 message vectors (one 64-byte block per lane); each word is passed
+//      through mm256_bswap_32 before use.
+// r  : 8 chaining vectors, updated in place.
+//
+// W is a 16-entry window of the message schedule that is recomputed in
+// place for every group of 16 steps.
+// NOTE(review): mm256_bswap_32 is defined outside this view; presumably a
+// byte swap within each 32-bit lane.
+static void
+sha256_8way_round( __m256i *in, __m256i r[8] )
+{
+   register  __m256i A, B, C, D, E, F, G, H;
+   __m256i W[16];
+
+   W[ 0] = mm256_bswap_32( in[ 0] );
+   W[ 1] = mm256_bswap_32( in[ 1] );
+   W[ 2] = mm256_bswap_32( in[ 2] );
+   W[ 3] = mm256_bswap_32( in[ 3] );
+   W[ 4] = mm256_bswap_32( in[ 4] );
+   W[ 5] = mm256_bswap_32( in[ 5] );
+   W[ 6] = mm256_bswap_32( in[ 6] );
+   W[ 7] = mm256_bswap_32( in[ 7] );
+   W[ 8] = mm256_bswap_32( in[ 8] );
+   W[ 9] = mm256_bswap_32( in[ 9] );
+   W[10] = mm256_bswap_32( in[10] );
+   W[11] = mm256_bswap_32( in[11] );
+   W[12] = mm256_bswap_32( in[12] );
+   W[13] = mm256_bswap_32( in[13] );
+   W[14] = mm256_bswap_32( in[14] );
+   W[15] = mm256_bswap_32( in[15] );
+
+   A = r[0];
+   B = r[1];
+   C = r[2];
+   D = r[3];
+   E = r[4];
+   F = r[5];
+   G = r[6];
+   H = r[7];
+
+   // Steps 0..15 use the loaded message words directly.  The argument
+   // order rotates by one each step in place of moving the variables.
+   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
+
+   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
+   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
+   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
+   SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
+   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
+   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
+   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
+   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
+   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
+   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
+   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
+   SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
+   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
+   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
+   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
+
+   // Steps 16..63 in three groups of 16 (j = 16, 32, 48): expand the
+   // schedule window in place, then run 16 steps with constants K256[j+i].
+   for ( int j = 16; j < 64; j += 16 )
+   {
+      W[ 0] = SHA2x_MEXP( 14,  9,  1,  0 );
+      W[ 1] = SHA2x_MEXP( 15, 10,  2,  1 );
+      W[ 2] = SHA2x_MEXP(  0, 11,  3,  2 );
+      W[ 3] = SHA2x_MEXP(  1, 12,  4,  3 );
+      W[ 4] = SHA2x_MEXP(  2, 13,  5,  4 );
+      W[ 5] = SHA2x_MEXP(  3, 14,  6,  5 );
+      W[ 6] = SHA2x_MEXP(  4, 15,  7,  6 );
+      W[ 7] = SHA2x_MEXP(  5,  0,  8,  7 );
+      W[ 8] = SHA2x_MEXP(  6,  1,  9,  8 );
+      W[ 9] = SHA2x_MEXP(  7,  2, 10,  9 );
+      W[10] = SHA2x_MEXP(  8,  3, 11, 10 );
+      W[11] = SHA2x_MEXP(  9,  4, 12, 11 );
+      W[12] = SHA2x_MEXP( 10,  5, 13, 12 );
+      W[13] = SHA2x_MEXP( 11,  6, 14, 13 );
+      W[14] = SHA2x_MEXP( 12,  7, 15, 14 );
+      W[15] = SHA2x_MEXP( 13,  8,  0, 15 );
+
+      SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
+      SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
+      SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
+      SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
+      SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
+      SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
+      SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
+      SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
+      SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
+      SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
+      SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
+      SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
+      SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
+      SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
+      SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
+      SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
+   }
+
+   // Add the working variables back into the chaining value.
+   r[0] = _mm256_add_epi32( r[0], A );
+   r[1] = _mm256_add_epi32( r[1], B );
+   r[2] = _mm256_add_epi32( r[2], C );
+   r[3] = _mm256_add_epi32( r[3], D );
+   r[4] = _mm256_add_epi32( r[4], E );
+   r[5] = _mm256_add_epi32( r[5], F );
+   r[6] = _mm256_add_epi32( r[6], G );
+   r[7] = _mm256_add_epi32( r[7], H );
+}
+
+
+// Reset an 8-way SHA-256 context: clear the byte counter and broadcast each
+// of the eight H256 words to every 32-bit lane of the matching chaining
+// vector.
+void sha256_8way_init( sha256_8way_context *sc )
+{
+   sc->count_low  = 0;
+   sc->count_high = 0;
+   for ( int w = 0; w < 8; w++ )
+      sc->val[w] = _mm256_set1_epi32( H256[w] );
+}
+
+// Absorb len bytes per lane of interleaved input into the context.
+//
+// data is read as an array of __m256i; every 4 bytes of len consume one
+// vector.  Whenever the 64-byte block buffer fills, it is compressed with
+// sha256_8way_round().  The byte count is kept in count_low, with the
+// carry propagated into count_high.
+// NOTE(review): sizes are shifted right by 2 before copying, so len is
+// presumably always a multiple of 4 - confirm against callers.
+void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
+{
+   const int buf_size = 64;
+   __m256i *src = (__m256i*)data;
+   size_t fill = (unsigned)sc->count_low & (buf_size - 1U);
+
+   while ( len != 0 )
+   {
+      // Take as much as fits in the remaining space of the block buffer.
+      const size_t room = buf_size - fill;
+      const size_t take = ( room > len ) ? len : room;
+      uint32_t before, after;
+
+      memcpy_256( sc->buf + (fill>>2), src, take>>2 );
+      src  += take>>2;
+      fill += take;
+      len  -= take;
+
+      if ( fill == buf_size )
+      {
+         sha256_8way_round( sc->buf, sc->val );
+         fill = 0;
+      }
+
+      // 64-bit byte counter kept as two 32-bit halves.
+      before = sc->count_low;
+      after  = SPH_T32( before + take );
+      sc->count_low = after;
+      if ( after < before )
+         sc->count_high++;
+   }
+}
+
+// Finish the hash: append the 0x80 marker word, zero-fill, store the
+// message length in bits (high word first, then low word, both passed
+// through mm256_bswap_32) in the last two words of the block, compress,
+// and write the eight chaining vectors to dst through mm256_bswap_32.
+//
+// dst receives 8 __m256i values, still interleaved.  The context is not
+// re-initialised here.
+void sha256_8way_close( sha256_8way_context *sc, void *dst )
+{
+   const int buf_size = 64;
+   const int pad = buf_size - 8;
+   unsigned fill = (unsigned)sc->count_low & (buf_size - 1U);
+   uint32_t bits_lo, bits_hi;
+   unsigned w;
+
+   sc->buf[ fill>>2 ] = _mm256_set1_epi32( 0x80 );
+   fill += 4;
+
+   if ( fill <= pad )
+      memset_zero_256( sc->buf + (fill>>2), (pad - fill) >> 2 );
+   else
+   {
+      // No room left for the length: flush this block, start an empty one.
+      memset_zero_256( sc->buf + (fill>>2), (buf_size - fill) >> 2 );
+      sha256_8way_round( sc->buf, sc->val );
+      memset_zero_256( sc->buf, pad >> 2 );
+   }
+
+   // Convert the byte count to a 64-bit bit count split into two words.
+   bits_lo = sc->count_low;
+   bits_hi = (sc->count_high << 3) | (bits_lo >> 29);
+   bits_lo = bits_lo << 3;
+
+   sc->buf[ pad >> 2 ] =
+                mm256_bswap_32( _mm256_set1_epi32( bits_hi ) );
+   sc->buf[ ( pad+4 ) >> 2 ] =
+                mm256_bswap_32( _mm256_set1_epi32( bits_lo ) );
+   sha256_8way_round( sc->buf, sc->val );
+
+   for ( w = 0; w < 8; w++ )
+      ((__m256i*)dst)[w] = mm256_bswap_32( sc->val[w] );
+}
+
+
+// SHA-512 4 way 64 bit

 static const sph_u64 H512[8] = {
        SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
--- a/algo/sha/sha2-hash-4way.h
+++ b/algo/sha/sha2-hash-4way.h
@@ -46,7 +46,9 @@

 #if defined(__AVX__)

-#define SPH_SIZE_sha256   256
+//#define SPH_SIZE_sha256   256
+
+// SHA-256 4 way

 typedef struct {
   __m128i buf[64>>2];
@@ -60,7 +62,21 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst );

 #if defined (__AVX2__)

-#define SPH_SIZE_sha512   512
+// SHA-256 8 way
+
+// State for the 8-way SHA-256 hash.
+typedef struct {
+   __m256i buf[64>>2];              // block buffer: 16 vectors, one per 32-bit message word
+   __m256i val[8];                  // chaining value: 8 vectors
+   uint32_t count_high, count_low;  // byte count, split into two 32-bit halves
+} sha256_8way_context;
+
+// Reset sc to the initial state.
+void sha256_8way_init( sha256_8way_context *sc );
+// Absorb len bytes (per lane) of interleaved data.
+void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
+// Pad, finish and write 8 __m256i result vectors to dst.
+void sha256_8way_close( sha256_8way_context *sc, void *dst );
+
+//#define SPH_SIZE_sha512   512
+
+// SHA-512 4 way

 typedef struct {
   __m256i buf[128>>3];
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -39,7 +39,6 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t n = first_nonce;
    // hash is returned deinterleaved
    uint32_t *nonces = work->nonces;
-    bool *found = work->nfound;
    int num_found = 0;

 // data is 80 bytes, 20 u32 or 4 u64.
@@ -48,47 +47,22 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
 
    mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );

-    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-    uint32_t *noncep1 = vdata + 75;
-    uint32_t *noncep2 = vdata + 77;
-    uint32_t *noncep3 = vdata + 79;
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1

   do
   {
-       found[0] = found[1] = found[2] = found[3] = false;
-       be32enc( noncep0, n   );
-       be32enc( noncep1, n+1 );
-       be32enc( noncep2, n+2 );
-       be32enc( noncep3, n+3 );
+       be32enc( noncep,   n   );
+       be32enc( noncep+2, n+1 );
+       be32enc( noncep+4, n+2 );
+       be32enc( noncep+6, n+3 );

       skeinhash_4way( hash, vdata );

-       if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
+       for ( int i = 0; i < 4; i++ )
+       if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
       {
-           found[0] = true;
-           num_found++;
-           nonces[0] = n;
-           // always put nonce0 in work data for compartibility with 
-           // non vectored algos.
-           pdata[19] = n;
-       }
-       if ( (hash+8)[7] < Htarg && fulltest( hash+8, ptarget ) )
-       {
-           found[1] = true;
-           num_found++;
-           nonces[1] = n+1;           
-       }
-       if ( (hash+16)[7] < Htarg && fulltest( hash+16, ptarget ) )
-       {
-           found[2] = true;
-           num_found++;
-           nonces[2] = n+2;           
-       }
-       if ( (hash+24)[7] < Htarg && fulltest( hash+24, ptarget ) )
-       {
-           found[3] = true;
-           num_found++;
-           nonces[3] = n+3;           
+           nonces[ num_found++ ] = n+i;
+           work_set_target_ratio( work, hash+(i<<3) );
       }
       n += 4;
    } while ( (num_found == 0) && (n < max_nonce)
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -36,51 +36,28 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t n = first_nonce;
    // hash is returned deinterleaved
    uint32_t *nonces = work->nonces;
-    bool *found = work->nfound;
    int num_found = 0;

    swab32_array( endiandata, pdata, 20 );

    mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );

-    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-    uint32_t *noncep1 = vdata + 75;
-    uint32_t *noncep2 = vdata + 77;
-    uint32_t *noncep3 = vdata + 79;
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1

    do 
    {
-       found[0] = found[1] = found[2] = found[3] = false;
-       be32enc( noncep0, n   );
-       be32enc( noncep1, n+1 );
-       be32enc( noncep2, n+2 );
-       be32enc( noncep3, n+3 );
+       be32enc( noncep,   n   );
+       be32enc( noncep+2, n+1 );
+       be32enc( noncep+4, n+2 );
+       be32enc( noncep+6, n+3 );

       skein2hash( hash, vdata );

-       if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
+       for ( int i = 0; i < 4; i++ )
+       if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
       {
-           found[0] = true;
-           num_found++;
-           nonces[0] = n;
-       }
-       if ( (hash+8)[7] < Htarg && fulltest( hash+8, ptarget ) )
-       {
-           found[1] = true;
-           num_found++;
-           nonces[1] = n+1;
-       }
-       if ( (hash+16)[7] < Htarg && fulltest( hash+16, ptarget ) )
-       {
-           found[2] = true;
-           num_found++;
-           nonces[2] = n+2;
-       }
-       if ( (hash+24)[7] < Htarg && fulltest( hash+24, ptarget ) )
-       {
-           found[3] = true;
-           num_found++;
-           nonces[3] = n+3;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
       }
       n += 4;
    } while ( (num_found == 0) && (n < max_nonce)
--- a/algo/whirlpool/whirlpool-4way.c
+++ b/algo/whirlpool/whirlpool-4way.c
@@ -61,12 +61,8 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1

   if (opt_benchmark)
      ((uint32_t*)ptarget)[7] = 0x0000ff;
@@ -83,42 +79,19 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,

   do {
     const uint32_t Htarg = ptarget[7];
-     found[0] = found[1] = found[2] = found[3] = false;
-     be32enc( noncep0, n   );
-     be32enc( noncep1, n+1 );
-     be32enc( noncep2, n+2 );
-     be32enc( noncep3, n+3 );
+     be32enc( noncep,   n   );
+     be32enc( noncep+2, n+1 );
+     be32enc( noncep+4, n+2 );
+     be32enc( noncep+6, n+3 );
+     pdata[19] = n;

     whirlpool_hash_4way( hash, vdata );

-     pdata[19] = n;
-     if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+     for ( int i = 0; i < 4; i++ )
+     if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
     {
-         found[0] = true;
-         num_found++;
-         nonces[0] = n;
-         work_set_target_ratio(work, hash);
-     }
-     if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-     {
-         found[1] = true;
-         num_found++;
-         nonces[1] = n+1;
-         work_set_target_ratio( work, hash+8 );
-     }
-     if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-     {
-         found[2] = true;
-         num_found++;
-         nonces[2] = n+2;
-         work_set_target_ratio( work, hash+16 );
-     }
-     if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-     {
-         found[3] = true;
-         num_found++;
-         nonces[3] = n+3;
-         work_set_target_ratio( work, hash+24 );
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
     }
     n += 4;

--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -171,12 +171,8 @@ int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -195,42 +191,20 @@ int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            c11_4way_hash( hash, vdata );
            pdata[19] = n;

-            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
-               work_set_target_ratio( work, hash );
-            }
-            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
-               work_set_target_ratio( work, hash+8 );
-            }
-            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-            {
-               found[2] = true;
-               num_found++;
-               nonces[2] = n+2;
-               work_set_target_ratio( work, hash+16 );
-            }
-            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-            {
-               found[3] = true;
-               num_found++;
-               nonces[3] = n+3;
-               work_set_target_ratio( work, hash+24 );
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x11/timetravel-4way.c
+++ b/algo/x11/timetravel-4way.c
@@ -191,12 +191,8 @@ int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1
   const uint32_t Htarg = ptarget[7];
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   int i;
@@ -224,45 +220,23 @@ int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,

   do
   {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+2, n+1 );
+      be32enc( noncep+4, n+2 );
+      be32enc( noncep+6, n+3 );

      timetravel_4way_hash( hash, vdata );
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-         found[0] = true;
-         num_found++;
-         nonces[0] = n;
-         work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
-      {
-         found[1] = true;
-         num_found++;
-         nonces[1] = n+1;
-         work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
-      {
-         found[2] = true;
-         num_found++;
-         nonces[2] = n+2;
-         work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
-      {
-         found[3] = true;
-         num_found++;
-         nonces[3] = n+3;
-         work_set_target_ratio( work, hash+24 );
+          nonces[ num_found++ ] = n+i;
+           work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
+
   *hashes_done = n - first_nonce + 1;
   return num_found;
 }
--- a/algo/x11/timetravel10-4way.c
+++ b/algo/x11/timetravel10-4way.c
@@ -229,12 +229,8 @@ int scanhash_timetravel10_4way( int thr_id, struct work *work,
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1
   const uint32_t Htarg = ptarget[7];
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   int i;
@@ -262,42 +258,19 @@ int scanhash_timetravel10_4way( int thr_id, struct work *work,

   do
   {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+2, n+1 );
+      be32enc( noncep+4, n+2 );
+      be32enc( noncep+6, n+3 );

      timetravel10_4way_hash( hash, vdata );
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-         found[0] = true;
-         num_found++;
-         nonces[0] = n;
-         work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
-      {
-         found[1] = true;
-         num_found++;
-         nonces[1] = n+1;
-         work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
-      {
-         found[2] = true;
-         num_found++;
-         nonces[2] = n+2;
-         work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
-      {
-         found[3] = true;
-         num_found++;
-         nonces[3] = n+3;
-         work_set_target_ratio( work, hash+24 );
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
--- a/algo/x11/tribus-4way.c
+++ b/algo/x11/tribus-4way.c
@@ -70,12 +70,8 @@ int scanhash_tribus_4way(int thr_id, struct work *work, uint32_t max_nonce, uint
   const uint32_t Htarg = ptarget[7];
   uint32_t n = pdata[19];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1

   uint64_t htmax[] = {          0,
                               0xF,
@@ -112,49 +108,23 @@ int scanhash_tribus_4way(int thr_id, struct work *work, uint32_t max_nonce, uint
      {
         uint32_t mask = masks[m];
         do {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            tribus_hash_4way( hash, vdata );

            pdata[19] = n;

-            if ( ( !(hash[7] & mask) )
-                 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( !( (hash+(i<<3))[7] & mask ) )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-                found[0] = true;
-                num_found++;
-                nonces[0] = n;
-                work_set_target_ratio(work, hash);
-             }
-             if ( ( !((hash+8)[7] & mask) )
-                 && fulltest (hash+8, ptarget ) )
-             {
-                found[1] = true;
-                num_found++;
-                nonces[1] = n+1;
-                work_set_target_ratio(work, hash+8);
-             }
-             if ( ( !((hash+16)[7] & mask) )
-                 && fulltest( hash+16, ptarget ) )
-             {
-                found[2] = true;
-                num_found++;
-                nonces[2] = n+2;
-                work_set_target_ratio(work, hash+16);
-             }
-             if ( ( !((hash+24)[7] & mask) )
-                 && fulltest( hash+24, ptarget ) )
-             {
-                found[3] = true;
-                num_found++;
-                nonces[3] = n+3;
-                work_set_target_ratio(work, hash+24);
-             }
-             n += 4;
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
+            }
+            n += 4;
         } while ( (num_found == 0) && ( n < max_nonce )
                    && !work_restart[thr_id].restart);
         break;
--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -170,12 +170,8 @@ int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -194,42 +190,20 @@ int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            x11_4way_hash( hash, vdata );
            pdata[19] = n;

-            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
-               work_set_target_ratio( work, hash );
-            }
-            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
-               work_set_target_ratio( work, hash+8 );
-            }
-            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-            {
-               found[2] = true;
-               num_found++;
-               nonces[2] = n+2;
-               work_set_target_ratio( work, hash+16 );
-            }
-            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-            {
-               found[3] = true;
-               num_found++;
-               nonces[3] = n+3;
-               work_set_target_ratio( work, hash+24 );
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x11/x11evo-4way.c
+++ b/algo/x11/x11evo-4way.c
@@ -243,12 +243,8 @@ int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];

     swab32_array( endiandata, pdata, 20 );
@@ -278,42 +274,20 @@ int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce,

     do
     {
-         found[0] = found[1] = found[2] = found[3] = false;
-         be32enc( noncep0, n   );
-         be32enc( noncep1, n+1 );
-         be32enc( noncep2, n+2 );
-         be32enc( noncep3, n+3 );
+         be32enc( noncep,   n   );
+         be32enc( noncep+2, n+1 );
+         be32enc( noncep+4, n+2 );
+         be32enc( noncep+6, n+3 );

         x11evo_4way_hash( hash, vdata );
         pdata[19] = n;

-         if ( ( hash[7] & hmask ) == 0 && fulltest( hash, ptarget ) )
+         for ( int i = 0; i < 4; i++ )
+         if ( ( ( (hash+(i<<3))[7] & hmask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
         {
-            found[0] = true;
-            num_found++;
-            nonces[0] = n;
-            work_set_target_ratio( work, hash );
-         }
-         if ( ( (hash+8)[7] & hmask ) == 0 && fulltest( hash+8, ptarget ) )
-         {
-            found[1] = true;
-            num_found++;
-            nonces[1] = n+1;
-            work_set_target_ratio( work, hash+8 );
-         }
-         if ( ( (hash+16)[7] & hmask ) == 0 && fulltest( hash+16, ptarget ) )
-         {
-            found[2] = true;
-            num_found++;
-            nonces[2] = n+2;
-            work_set_target_ratio( work, hash+16 );
-         }
-         if ( ( (hash+24)[7] & hmask ) == 0 && fulltest( hash+24, ptarget ) )
-         {
-            found[3] = true;
-            num_found++;
-            nonces[3] = n+3;
-            work_set_target_ratio( work, hash+24 );
+            nonces[ num_found++ ] = n+i;
+            work_set_target_ratio( work, hash+(i<<3) );
         }
         n += 4;
     } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -177,12 +177,8 @@ int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -201,42 +197,20 @@ int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            x11gost_4way_hash( hash, vdata );
            pdata[19] = n;

-            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
-               work_set_target_ratio( work, hash );
-            }
-            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
-               work_set_target_ratio( work, hash+8 );
-            }
-            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-            {
-               found[2] = true;
-               num_found++;
-               nonces[2] = n+2;
-               work_set_target_ratio( work, hash+16 );
-            }
-            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-            {
-               found[3] = true;
-               num_found++;
-               nonces[3] = n+3;
-               work_set_target_ratio( work, hash+24 );
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -199,12 +199,8 @@ int scanhash_x12_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -223,42 +219,20 @@ int scanhash_x12_4way( int thr_id, struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            x12_4way_hash( hash, vdata );
            pdata[19] = n;

-            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
-               work_set_target_ratio( work, hash );
-            }
-            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
-               work_set_target_ratio( work, hash+8 );
-            }
-            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-            {
-               found[2] = true;
-               num_found++;
-               nonces[2] = n+2;
-               work_set_target_ratio( work, hash+16 );
-            }
-            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-            {
-               found[3] = true;
-               num_found++;
-               nonces[3] = n+3;
-               work_set_target_ratio( work, hash+24 );
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x13/phi1612-4way.c
+++ b/algo/x13/phi1612-4way.c
@@ -120,12 +120,8 @@ int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t _ALIGN(64) endiandata[20];
     uint32_t n = first_nonce;
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];

     if ( opt_benchmark )
@@ -138,42 +134,19 @@ int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

     do {
-        found[0] = found[1] = found[2] = found[3] = false;
-        be32enc( noncep0, n   );
-        be32enc( noncep1, n+1 );
-        be32enc( noncep2, n+2 );
-        be32enc( noncep3, n+3 );
+        be32enc( noncep,   n   );
+        be32enc( noncep+2, n+1 );
+        be32enc( noncep+4, n+2 );
+        be32enc( noncep+6, n+3 );

        phi1612_4way_hash( hash, vdata );
        pdata[19] = n;

-        if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+        for ( int i = 0; i < 4; i++ )
+        if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
        {
-            found[0] = true;
-            num_found++;
-            nonces[0] = n;
-            work_set_target_ratio( work, hash );
-        }
-        if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) ) 
-        {
-            found[1] = true;
-            num_found++;
-            nonces[1] = n+1;
-            work_set_target_ratio( work, hash+8 );
-        }
-        if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) ) 
-        {
-            found[2] = true;
-            num_found++;
-            nonces[2] = n+2;
-            work_set_target_ratio( work, hash+16 );
-        }
-        if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) ) 
-        {
-            found[3] = true;
-            num_found++;
-            nonces[3] = n+3;
-            work_set_target_ratio( work, hash+24 );
+           nonces[ num_found++ ] = n+i;
+           work_set_target_ratio( work, hash+(i<<3) );
        }
        n += 4;
     } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -84,12 +84,8 @@ int scanhash_skunk_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1
   const uint32_t Htarg = ptarget[7];
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

@@ -102,42 +98,19 @@ int scanhash_skunk_4way( int thr_id, struct work *work, uint32_t max_nonce,
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
   do
   {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+2, n+1 );
+      be32enc( noncep+4, n+2 );
+      be32enc( noncep+6, n+3 );

      skunk_4way_hash( hash, vdata );
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash+24 );
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
      }
      n +=4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -195,12 +195,8 @@ int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -219,42 +215,20 @@ int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            x13_4way_hash( hash, vdata );
            pdata[19] = n;

-            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
-               work_set_target_ratio( work, hash );
-            }
-            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
-               work_set_target_ratio( work, hash+8 );
-            }
-            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-            {
-               found[2] = true;
-               num_found++;
-               nonces[2] = n+2;
-               work_set_target_ratio( work, hash+16 );
-            }
-            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-            {
-               found[3] = true;
-               num_found++;
-               nonces[3] = n+3;
-               work_set_target_ratio( work, hash+24 );
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x13/x13sm3-4way.c
+++ b/algo/x13/x13sm3-4way.c
@@ -220,12 +220,8 @@ int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -247,42 +243,20 @@ int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            x13sm3_4way_hash( hash, vdata );
            pdata[19] = n;

-            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
-               work_set_target_ratio( work, hash );
-            }
-            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
-               work_set_target_ratio( work, hash+8 );
-            }
-            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-            {
-               found[2] = true;
-               num_found++;
-               nonces[2] = n+2;
-               work_set_target_ratio( work, hash+16 );
-            }
-            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-            {
-               found[3] = true;
-               num_found++;
-               nonces[3] = n+3;
-               work_set_target_ratio( work, hash+24 );
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x14/polytimos-4way.c
+++ b/algo/x14/polytimos-4way.c
@@ -114,12 +114,8 @@ int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1
   const uint32_t Htarg = ptarget[7];
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

@@ -132,42 +128,19 @@ int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+2, n+1 );
+      be32enc( noncep+4, n+2 );
+      be32enc( noncep+6, n+3 );

      polytimos_4way_hash(hash, vdata);
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-         found[0] = true;
-         num_found++;
-         nonces[0] = n;
-         work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-         found[1] = true;
-         num_found++;
-         nonces[1] = n+1;
-         work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-         found[2] = true;
-         num_found++;
-         nonces[2] = n+2;
-         work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-         found[3] = true;
-         num_found++;
-         nonces[3] = n+3;
-         work_set_target_ratio( work, hash+24 );
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;

--- a/algo/x14/veltor-4way.c
+++ b/algo/x14/veltor-4way.c
@@ -89,12 +89,8 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
     const uint32_t first_nonce = pdata[19];
     uint32_t n = first_nonce;
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     volatile uint8_t *restart = &(work_restart[thr_id].restart);

     if ( opt_benchmark )
@@ -108,42 +104,19 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
     do
     {
-         found[0] = found[1] = found[2] = found[3] = false;
-         be32enc( noncep0, n   );
-         be32enc( noncep1, n+1 );
-         be32enc( noncep2, n+2 );
-         be32enc( noncep3, n+3 );
+         be32enc( noncep,   n   );
+         be32enc( noncep+2, n+1 );
+         be32enc( noncep+4, n+2 );
+         be32enc( noncep+6, n+3 );

         veltor_4way_hash( hash, vdata );
         pdata[19] = n;

-         if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+         for ( int i = 0; i < 4; i++ )
+         if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
         {
-            found[0] = true;
-            num_found++;
-            nonces[0] = n;
-            work_set_target_ratio( work, hash );
-         }
-         if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) ) 
-         {
-            found[1] = true;
-            num_found++;
-            nonces[1] = n+1;
-            work_set_target_ratio( work, hash+8 );
-         }
-         if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) ) 
-         {
-            found[2] = true;
-            num_found++;
-            nonces[2] = n+2;
-            work_set_target_ratio( work, hash+16 );
-         }
-         if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) ) 
-         {
-            found[3] = true;
-            num_found++;
-            nonces[3] = n+3;
-            work_set_target_ratio( work, hash+24 );
+            nonces[ num_found++ ] = n+i;
+            work_set_target_ratio( work, hash+(i<<3) );
         }
         n += 4;
     } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -205,12 +205,8 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -229,42 +225,20 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            x14_4way_hash( hash, vdata );
            pdata[19] = n;

-            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
-               work_set_target_ratio( work, hash );
-            }
-            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
-               work_set_target_ratio( work, hash+8 );
-            }
-            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-            {
-               found[2] = true;
-               num_found++;
-               nonces[2] = n+2;
-               work_set_target_ratio( work, hash+16 );
-            }
-            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-            {
-               found[3] = true;
-               num_found++;
-               nonces[3] = n+3;
-               work_set_target_ratio( work, hash+24 );
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -224,12 +224,8 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -248,42 +244,20 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            x15_4way_hash( hash, vdata );
            pdata[19] = n;

-            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
-               work_set_target_ratio( work, hash );
-            }
-            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
-               work_set_target_ratio( work, hash+8 );
-            }
-            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-            {
-               found[2] = true;
-               num_found++;
-               nonces[2] = n+2;
-               work_set_target_ratio( work, hash+16 );
-            }
-            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-            {
-               found[3] = true;
-               num_found++;
-               nonces[3] = n+3;
-               work_set_target_ratio( work, hash+24 );
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x17/x16r-4way.c
+++ b/algo/x17/x16r-4way.c
@@ -314,12 +314,8 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

   for ( int k=0; k < 19; k++ )
@@ -342,41 +338,19 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,

   do
   {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+2, n+1 );
+      be32enc( noncep+4, n+2 );
+      be32enc( noncep+6, n+3 );
+
      x16r_4way_hash( hash, vdata );
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-         found[0] = true;
-         num_found++;
-         nonces[0] = n;
-         work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-         found[1] = true;
-         num_found++;
-         nonces[1] = n+1;
-         work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-         found[2] = true;
-         num_found++;
-         nonces[2] = n+2;
-         work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-         found[3] = true;
-         num_found++;
-         nonces[3] = n+3;
-         work_set_target_ratio( work, hash+24 );
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -235,12 +235,8 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -259,42 +255,20 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            found[0] = found[1] = found[2] = found[3] = false;
-            be32enc( noncep0, n   );
-            be32enc( noncep1, n+1 );
-            be32enc( noncep2, n+2 );
-            be32enc( noncep3, n+3 );
+            be32enc( noncep,   n   );
+            be32enc( noncep+2, n+1 );
+            be32enc( noncep+4, n+2 );
+            be32enc( noncep+6, n+3 );

            x17_4way_hash( hash, vdata );
            pdata[19] = n;

-            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            for ( int i = 0; i < 4; i++ )
+            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                 && fulltest( hash+(i<<3), ptarget ) )
            {
-               found[0] = true;
-               num_found++;
-               nonces[0] = n;
-               work_set_target_ratio( work, hash );
-            }
-            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-            {
-               found[1] = true;
-               num_found++;
-               nonces[1] = n+1;
-               work_set_target_ratio( work, hash+8 );
-            }
-            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-            {
-               found[2] = true;
-               num_found++;
-               nonces[2] = n+2;
-               work_set_target_ratio( work, hash+16 );
-            }
-            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-            {
-               found[3] = true;
-               num_found++;
-               nonces[3] = n+3;
-               work_set_target_ratio( work, hash+24 );
+               nonces[ num_found++ ] = n+i;
+               work_set_target_ratio( work, hash+(i<<3) );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -384,12 +384,8 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
@@ -403,43 +399,20 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
   xevan_4way_blake512_midstate( vdata );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+2, n+1 );
+      be32enc( noncep+4, n+2 );
+      be32enc( noncep+6, n+3 );

      xevan_4way_hash( hash, vdata );

      pdata[19] = n;

-      if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-         found[0] = true;
-         num_found++;
-         nonces[0] = n;
-         work_set_target_ratio( work, hash );
-      }
-      if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) )
-      {
-         found[1] = true;
-         num_found++;
-         nonces[1] = n+1;
-         work_set_target_ratio( work, hash+8 );
-      }
-      if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) )
-      {
-         found[2] = true;
-         num_found++;
-         nonces[2] = n+2;
-         work_set_target_ratio( work, hash+16 );
-      }
-      if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) )
-      {
-         found[3] = true;
-         num_found++;
-         nonces[3] = n+3;
-         work_set_target_ratio( work, hash+24 );
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/api.c
+++ b/api.c
@@ -110,18 +110,19 @@ extern int cpu_fanpercent(void);

 static void cpustatus(int thr_id)
 {
-	if (thr_id >= 0 && thr_id < opt_n_threads) {
-		struct cpu_info *cpu = &thr_info[thr_id].cpu;
-		char buf[512]; *buf = '\0';
+   if ( thr_id >= 0 && thr_id < opt_n_threads )
+   {
+//      struct cpu_info *cpu = &thr_info[thr_id].cpu;
+      char buf[512]; *buf = '\0';
+      char units[4] = {0};
+      double hashrate = thr_hashrates[thr_id];

-		cpu->thr_id = thr_id;
-		cpu->khashes = thr_hashrates[thr_id] / 1000.0; //todo: stats_get_speed(thr_id, 0.0) / 1000.0;
-
-		snprintf(buf, sizeof(buf), "CPU=%d;KHS=%.2f|", thr_id, cpu->khashes);
-
-		// append to buffer
-		strcat(buffer, buf);
-	}
+      scale_hash_for_display ( &hashrate, units );
+      snprintf( buf, sizeof(buf), "CPU=%d;%sH/s=%.2f|", thr_id, units,
+                hashrate );
+      // append to buffer
+      strcat( buffer, buf );
+   }
 }

 /*****************************************************************************/
@@ -129,42 +130,45 @@ static void cpustatus(int thr_id)
 /**
 * Returns miner global infos
 */
-static char *getsummary(char *params)
+static char *getsummary( char *params )
 {
-	char algo[64]; *algo = '\0';
-	time_t ts = time(NULL);
-	double uptime = difftime(ts, startup);
-	double accps = (60.0 * accepted_count) / (uptime ? uptime : 1.0);
-        double diff = net_diff > 0. ? net_diff : stratum_diff;
-        char diff_str[16];
-
-	struct cpu_info cpu = { 0 };
+   char algo[64]; *algo = '\0';
+   time_t ts = time(NULL);
+   double uptime = difftime(ts, startup);
+   double accps = (60.0 * accepted_count) / (uptime ? uptime : 1.0);
+   double diff = net_diff > 0. ? net_diff : stratum_diff;
+   char diff_str[16];
+   double hashrate = (double)global_hashrate;
+   char units[4] = {0};
+   struct cpu_info cpu = { 0 };
 #ifdef USE_MONITORING
-	cpu.has_monitoring = true;
-	cpu.cpu_temp = cpu_temp(0);
-	cpu.cpu_fan = cpu_fanpercent();
-	cpu.cpu_clock = cpu_clock(0);
+   cpu.has_monitoring = true;
+   cpu.cpu_temp = cpu_temp(0);
+   cpu.cpu_fan = cpu_fanpercent();
+   cpu.cpu_clock = cpu_clock(0);
 #endif

-	get_currentalgo(algo, sizeof(algo));
+   get_currentalgo(algo, sizeof(algo));

-        // if diff is integer don't display decimals
-        if ( diff == trunc( diff ) )
-            sprintf( diff_str, "%.0f", diff);
-        else
-            sprintf( diff_str, "%.6f", diff);
+   // if diff is integer don't display decimals
+   if ( diff == trunc( diff ) )
+       sprintf( diff_str, "%.0f", diff);
+   else
+       sprintf( diff_str, "%.6f", diff);

-	*buffer = '\0';
-	sprintf(buffer, "NAME=%s;VER=%s;API=%s;"
-		"ALGO=%s;CPUS=%d;KHS=%.2f;ACC=%d;REJ=%d;"
-		"ACCMN=%.3f;DIFF=%s;TEMP=%.1f;FAN=%d;FREQ=%d;"
-		"UPTIME=%.0f;TS=%u|",
-		PACKAGE_NAME, PACKAGE_VERSION, APIVERSION,
-		algo, opt_n_threads, (double)global_hashrate / 1000.0,
-		accepted_count, rejected_count, accps, diff_str,
-		cpu.cpu_temp, cpu.cpu_fan, cpu.cpu_clock,
-		uptime, (uint32_t) ts);
-	return buffer;
+   *buffer = '\0';
+   scale_hash_for_display ( &hashrate, units );
+
+   sprintf( buffer, "NAME=%s;VER=%s;API=%s;"
+                    "ALGO=%s;CPUS=%d;%sH/s=%.2f;ACC=%d;REJ=%d;"
+                    "ACCMN=%.3f;DIFF=%s;TEMP=%.1f;FAN=%d;FREQ=%d;"
+                    "UPTIME=%.0f;TS=%u|",
+                    PACKAGE_NAME, PACKAGE_VERSION, APIVERSION,
+                    algo, opt_n_threads, units, hashrate,
+                    accepted_count, rejected_count, accps, diff_str,
+                    cpu.cpu_temp, cpu.cpu_fan, cpu.cpu_clock,
+                    uptime, (uint32_t) ts);
+   return buffer;
 }

 /**
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -1537,6 +1537,38 @@ static inline void mm256_interleave_8x32( void *dst, const void *src0,
                             s3[18], s2[18], s1[18], s0[18] );
   d[19] = _mm256_set_epi32( s7[19], s6[19], s5[19], s4[19],
                             s3[19], s2[19], s1[19], s0[19] );
+
+   if ( bit_len <= 640 ) return;
+
+   d[20] = _mm256_set_epi32( s7[20], s6[20], s5[20], s4[20],
+                             s3[20], s2[20], s1[20], s0[20] );
+   d[21] = _mm256_set_epi32( s7[21], s6[21], s5[21], s4[21],
+                             s3[21], s2[21], s1[21], s0[21] );
+   d[22] = _mm256_set_epi32( s7[22], s6[22], s5[22], s4[22],
+                             s3[22], s2[22], s1[22], s0[22] );
+   d[23] = _mm256_set_epi32( s7[23], s6[23], s5[23], s4[23],
+                             s3[23], s2[23], s1[23], s0[23] );
+
+   if ( bit_len <= 768 ) return;
+
+   d[24] = _mm256_set_epi32( s7[24], s6[24], s5[24], s4[24],
+                             s3[24], s2[24], s1[24], s0[24] );
+   d[25] = _mm256_set_epi32( s7[25], s6[25], s5[25], s4[25],
+                             s3[25], s2[25], s1[25], s0[25] );
+   d[26] = _mm256_set_epi32( s7[26], s6[26], s5[26], s4[26],
+                             s3[26], s2[26], s1[26], s0[26] );
+   d[27] = _mm256_set_epi32( s7[27], s6[27], s5[27], s4[27],
+                             s3[27], s2[27], s1[27], s0[27] );
+   d[28] = _mm256_set_epi32( s7[28], s6[28], s5[28], s4[28],
+                             s3[28], s2[28], s1[28], s0[28] );
+   d[29] = _mm256_set_epi32( s7[29], s6[29], s5[29], s4[29],
+                             s3[29], s2[29], s1[29], s0[29] );
+   d[30] = _mm256_set_epi32( s7[30], s6[30], s5[30], s4[30],
+                             s3[30], s2[30], s1[30], s0[30] );
+   d[31] = _mm256_set_epi32( s7[31], s6[31], s5[31], s4[31],
+                             s3[31], s2[31], s1[31], s0[31] );
+
+   // bit_len == 1024
 }

 // probably obsolete with double pack 2x32->64, 4x64->256.
@@ -1615,31 +1647,71 @@ static inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,

   // null change for overrun space, vector indexing doesn't work for
   // 32 bit data
+   if ( bit_len <= 640 )
+   {
+      uint32_t *d = ((uint32_t*)d0) + 8;
+      d0[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
+                                  s[152], s[144], s[136], s[128] );
+      d = ((uint32_t*)d1) + 8;
+      d1[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
+                                  s[153], s[145], s[137], s[129] );
+      d = ((uint32_t*)d2) + 8;
+      d2[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
+                                  s[154], s[146], s[138], s[130]);
+      d = ((uint32_t*)d3) + 8;
+      d3[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
+                                  s[155], s[147], s[139], s[131] );
+      d = ((uint32_t*)d4) + 8;
+      d4[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
+                                  s[156], s[148], s[140], s[132] );
+      d = ((uint32_t*)d5) + 8;
+      d5[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
+                                  s[157], s[149], s[141], s[133] );
+      d = ((uint32_t*)d6) + 8;
+      d6[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
+                                  s[158], s[150], s[142], s[134] );
+      d = ((uint32_t*)d7) + 8;
+      d7[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
+                                  s[159], s[151], s[143], s[135] );
+      return;
+   }

-   uint32_t *d = ((uint32_t*)d0) + 8;
-   d0[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                               s[152], s[144], s[136], s[128] );
-   d = ((uint32_t*)d1) + 8;
-   d1[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                               s[153], s[145], s[137], s[129] );
-   d = ((uint32_t*)d2) + 8;
-   d2[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                               s[154], s[146], s[138], s[130]);
-   d = ((uint32_t*)d3) + 8;
-   d3[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                               s[155], s[147], s[139], s[131] );
-   d = ((uint32_t*)d4) + 8;
-   d4[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                               s[156], s[148], s[140], s[132] );
-   d = ((uint32_t*)d5) + 8;
-   d5[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                               s[157], s[149], s[141], s[133] );
-   d = ((uint32_t*)d6) + 8;
-   d6[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                               s[158], s[150], s[142], s[134] );
-   d = ((uint32_t*)d7) + 8;
-   d7[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
-                               s[159], s[151], s[143], s[135] );
+   d0[2] = _mm256_set_epi32( s[184], s[176], s[168], s[160],
+                             s[152], s[144], s[136], s[128] );
+   d1[2] = _mm256_set_epi32( s[185], s[177], s[169], s[161],
+                             s[153], s[145], s[137], s[129] );
+   d2[2] = _mm256_set_epi32( s[186], s[178], s[170], s[162],
+                             s[154], s[146], s[138], s[130] );
+   d3[2] = _mm256_set_epi32( s[187], s[179], s[171], s[163],
+                             s[155], s[147], s[139], s[131] );
+   d4[2] = _mm256_set_epi32( s[188], s[180], s[172], s[164],
+                             s[156], s[148], s[140], s[132] );
+   d5[2] = _mm256_set_epi32( s[189], s[181], s[173], s[165],
+                             s[157], s[149], s[141], s[133] );
+   d6[2] = _mm256_set_epi32( s[190], s[182], s[174], s[166],
+                             s[158], s[150], s[142], s[134] );
+   d7[2] = _mm256_set_epi32( s[191], s[183], s[175], s[167],
+                             s[159], s[151], s[143], s[135] );
+
+   if ( bit_len <= 768 ) return;
+
+   d0[3] = _mm256_set_epi32( s[248], s[240], s[232], s[224],
+                             s[216], s[208], s[200], s[192] );
+   d1[3] = _mm256_set_epi32( s[249], s[241], s[233], s[225],
+                             s[217], s[209], s[201], s[193] );
+   d2[3] = _mm256_set_epi32( s[250], s[242], s[234], s[226],
+                             s[218], s[210], s[202], s[194] );
+   d3[3] = _mm256_set_epi32( s[251], s[243], s[235], s[227],
+                             s[219], s[211], s[203], s[195] );
+   d4[3] = _mm256_set_epi32( s[252], s[244], s[236], s[228],
+                             s[220], s[212], s[204], s[196] );
+   d5[3] = _mm256_set_epi32( s[253], s[245], s[237], s[229],
+                             s[221], s[213], s[205], s[197] );
+   d6[3] = _mm256_set_epi32( s[254], s[246], s[238], s[230],
+                             s[222], s[214], s[206], s[198] );
+   d7[3] = _mm256_set_epi32( s[255], s[247], s[239], s[231],
+                             s[223], s[215], s[207], s[199] );
+// bit_len == 1024
 }

 // Deinterleave 8 arrays into indivdual buffers for scalar processing
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.2.1.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.8.2.1'
-PACKAGE_STRING='cpuminer-opt 3.8.2.1'
+PACKAGE_VERSION='3.8.3'
+PACKAGE_STRING='cpuminer-opt 3.8.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.8.2.1 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.8.3 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1392,7 +1392,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.8.2.1:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.8.3:";;
   esac
  cat <<\_ACEOF

@@ -1497,7 +1497,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.8.2.1
+cpuminer-opt configure 3.8.3
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.8.2.1, which was
+It was created by cpuminer-opt $as_me 3.8.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2981,7 +2981,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.8.2.1'
+ VERSION='3.8.3'


 cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.8.2.1, which was
+This file was extended by cpuminer-opt $as_me 3.8.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.8.2.1
+cpuminer-opt config.status 3.8.3
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.8.2.1])
+AC_INIT([cpuminer-opt], [3.8.3])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -432,7 +432,7 @@ static bool get_mininginfo(CURL *curl, struct work *work)
 	return true;
 }

-#define BLOCK_VERSION_CURRENT 3
+#define BLOCK_VERSION_CURRENT 4

 static bool gbt_work_decode(const json_t *val, struct work *work)
 {
@@ -1608,10 +1608,14 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
 {
   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );

-   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
-      && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
-         || ( work->job_id != g_work->job_id ) ) )
+   if ( ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
+          && clean_job )
+      || ( *nonceptr >= *end_nonce_ptr )
+      || ( !opt_benchmark && strcmp( work->job_id, g_work->job_id ) ) )
   {
+     if ( *nonceptr >= *end_nonce_ptr )
+         algo_gate.stratum_gen_work( &stratum, g_work );
+
     work_free( work );
     work_copy( work, g_work );
     *nonceptr = 0xffffffffU / opt_n_threads * thr_id;
@@ -1866,23 +1870,26 @@ static void *miner_thread( void *userdata )
 		hashes_done / (diff.tv_sec + diff.tv_usec * 1e-6);
 	  pthread_mutex_unlock(&stats_lock);
       }
+
       // if nonce(s) submit work 
       if ( nonce_found && !opt_benchmark )
       {
          int num_submitted = 0;
-          // look for 4way nonces
-          for ( int n = 0; n < 4; n++ )
-             if ( work.nfound[n] )
+
+          for ( int n = 0; n < nonce_found; n++ )
+          {
+             *algo_gate.get_nonceptr( work.data ) = work.nonces[n];
+             if ( submit_work( mythr, &work ) )
             {
-                 *algo_gate.get_nonceptr( work.data ) = work.nonces[n]; 
-                 if ( !submit_work( mythr, &work ) )
-                 {
-                    applog( LOG_WARNING, "Failed to submit share." );
-                    break;
-                 }
-                 applog( LOG_NOTICE, "Share submitted." );
-                 num_submitted++;
+                applog( LOG_NOTICE, "Share submitted." );
+                num_submitted++;
             }
+             else
+             {
+                applog( LOG_WARNING, "Failed to submit share." );
+                break;
+             }
+          }
          // must be a one way algo, nonce is already in work data
          if ( !num_submitted )
          {
@@ -1977,15 +1984,19 @@ json_t *std_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )
 {
   json_t *val;
   char *req = NULL;
-   if (have_gbt)
-   {
+//   if (have_gbt)
+//   {
       req = (char*) malloc( strlen(gbt_lp_req) + strlen(lp_id) + 1 );
       sprintf( req, gbt_lp_req, lp_id );
-   }
-   val = json_rpc_call( curl, rpc_url, rpc_userpass, getwork_req, err,
-                        JSON_RPC_LONGPOLL );
-   val = json_rpc_call( curl, lp_url, rpc_userpass, req ? req : getwork_req,
-                        err, JSON_RPC_LONGPOLL);
+//   }
+//TODO delete the disabled getwork call below; its result was always overwritten.
+// Also delete the disabled lp_url call; req is now built unconditionally, no getwork.
+//   val = json_rpc_call( curl, rpc_url, rpc_userpass, getwork_req, err,
+//                        JSON_RPC_LONGPOLL );
+//   val = json_rpc_call( curl, lp_url, rpc_userpass, req ? req : getwork_req,
+//                        err, JSON_RPC_LONGPOLL);
+   val = json_rpc_call( curl, lp_url, rpc_userpass, req,
+                        err, JSON_RPC_LONGPOLL );
   free(req);
   return val;
 }
--- a/miner.h
+++ b/miner.h
@@ -361,7 +361,6 @@ struct work {
 	size_t xnonce2_len;
 	unsigned char *xnonce2;
        uint32_t nonces[8];
-        bool     nfound[8];
 };

 struct stratum_job {
@@ -451,7 +450,7 @@ void applog_hash(void *hash);
 void format_hashrate(double hashrate, char *output);
 void print_hash_tests(void);

-
+void scale_hash_for_display ( double* hashrate, char* units );

 struct thr_info {
        int id;