v3.11.1

2025-09-17 23:44:27 +00:00 · 2020-01-06 13:55:17 -05:00
55 changed files with 753 additions and 452 deletions
--- a/30
+++ b/30
@@ -8,10 +8,9 @@ Security warning

 Miner programs are often flagged as malware by antivirus programs. This is
 usually a false positive, they are flagged simply because they are
-cryptocurrency miners. However, some malware masquerading as a miner has
-been spread using the cover that miners are known to be subject to false
-positives ans users will dismiss the AV alert. Always be on alert.
-The source code of cpuminer-opt is open for anyone to inspect.
+cryptocurrency miners. However, some malware has been spread using the
+cover that miners are known to be subject to false positives. Always be on
+alert. The source code of cpuminer-opt is open for anyone to inspect.
 If you don't trust the software don't download it.

 The cryptographic hashing code has been taken from trusted sources but has been
@@ -30,31 +29,12 @@ Requirements
 Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
 supported.

-64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
-are not supported. FreeBSD YMMV.
+64 bit Linux or Windows operating system. Apple, Android and Rpi are
+not supported. FreeBSD YMMV.

 Change Log
 ----------

-v3.11.2
-
-Fixed x11gost (sib) AVX2 invalid shares.
-
-Fixed x16r, x16rv2, x16s, x16rt, x16rt-veil (veil), x21s.
-No shares were submitted when cube, shavite or echo were the first function
-in the hash order.
-
-Fixed all algos reporting stats problems when mining with SSE2.
-
-Faster Lyra2 AVX512: lyra2z +47%, lyra2rev3 +11%, allium +13%, x21s +6% 
-
-Other minor performance improvements.
-
-Known issue:
-
-Lyra2 AVX512 improvements paradoxically reduced performance on x22i and x25x.
-https://github.com/JayDDee/cpuminer-opt/issues/225
-
 v3.11.1

 Faster panama for x25x AVX2 & AVX512.
--- a/algo/argon2/argon2a/argon2a.c
+++ b/algo/argon2/argon2a/argon2a.c
@@ -62,7 +62,9 @@ int scanhash_argon2( struct work* work, uint32_t max_nonce,
 		argon2hash(hash, endiandata);
 		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			pdata[19] = nonce;
-         submit_solution( work, hash, mythr );
+			*hashes_done = pdata[19] - first_nonce;
+			work_set_target_ratio(work, hash);
+			return 1;
 		}
 		nonce++;
 	} while (nonce < max_nonce && !work_restart[thr_id].restart);
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -43,14 +43,17 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,

 	do {
 		be32enc(&endiandata[19], n);
+		//blake2b_hash_end(vhashcpu, endiandata);
 		blake2b_hash(vhashcpu, endiandata);

-		if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget))
-      {
+		if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
+			work_set_target_ratio(work, vhashcpu);
+			*hashes_done = n - first_nonce + 1;
 			pdata[19] = n;
-         submit_solution( work, vhashcpu, mythr );
-      }
-      n++;
+			return 1;
+		}
+		n++;
+
 	} while (n < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = n - first_nonce + 1;
 	pdata[19] = n;
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -77,15 +77,25 @@ int scanhash_decred( struct work *work, uint32_t max_nonce,
                be32enc(&endiandata[k], pdata[k]);
 #endif

+#ifdef DEBUG_ALGO
+        if (!thr_id) applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
+#endif
+
        do {
                //be32enc(&endiandata[DCR_NONCE_OFT32], n);
                endiandata[DECRED_NONCE_INDEX] = n;
                decred_hash(hash32, endiandata);

-                if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
-                {
-                   pdata[DECRED_NONCE_INDEX] = n;
-                   submit_solution( work, hash32, mythr );
+                if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
+                        work_set_target_ratio(work, hash32);
+                        *hashes_done = n - first_nonce + 1;
+#ifdef DEBUG_ALGO
+                        applog(LOG_BLUE, "Nonce : %08x %08x", n, swab32(n));
+                        applog_hash(ptarget);
+                        applog_compare_hash(hash32, ptarget);
+#endif
+                        pdata[DECRED_NONCE_INDEX] = n;
+                        return 1;
                }

                n++;
--- a/algo/gost/sph_gost.c
+++ b/algo/gost/sph_gost.c
@@ -4,7 +4,7 @@
 #include <stdlib.h>
 #include <memory.h>
 #include <math.h>
-#include "simd-utils.h"
+
 #include "sph_gost.h"

 #ifdef __cplusplus
@@ -696,26 +696,9 @@ static void AddModulo512(const void *a,const void *b,void *c)

 static void AddXor512(const void *a,const void *b,void *c)
 {
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-   casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
-                                           casti_m512i( b, 0 ) );
-#elif defined(__AVX2__)
-   casti_m256i( c, 0 ) = _mm256_xor_si256( casti_m256i( a, 0 ),
-                                           casti_m256i( b, 0 ) );
-   casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
-                                           casti_m256i( b, 1 ) );
-#elif defined(__SSE2__)
-   casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
-                                        casti_m128i( b, 0 ) );
-   casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
-                                        casti_m128i( b, 1 ) );
-   casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
-                                        casti_m128i( b, 2 ) );
-   casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
-                                        casti_m128i( b, 3 ) );
-#else
-   const unsigned long long *A=a, *B=b;
+	const unsigned long long *A=a, *B=b;
 	unsigned long long *C=c;
+#ifdef FULL_UNROLL
 	C[0] = A[0] ^ B[0];
 	C[1] = A[1] ^ B[1];
 	C[2] = A[2] ^ B[2];
@@ -724,6 +707,12 @@ static void AddXor512(const void *a,const void *b,void *c)
 	C[5] = A[5] ^ B[5];
 	C[6] = A[6] ^ B[6];
 	C[7] = A[7] ^ B[7];
+#else
+	int i = 0;
+
+	for(i=0; i<8; i++) {
+		C[i] = A[i] ^ B[i];
+	}
 #endif
 }

@@ -904,32 +893,31 @@ static void g_N(const unsigned char *N,unsigned char *h,const unsigned char *m)

 static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long long length,unsigned char *out)
 {
-	unsigned char v512[64] __attribute__((aligned(64))) = {
+	unsigned char v512[64] = {
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x00
-   };
-	unsigned char v0[64]  __attribute__((aligned(64))) = {
-		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-   };
-	unsigned char Sigma[64] __attribute__((aligned(64))) = {
+	};
+	unsigned char v0[64] = {
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 	};
-	unsigned char N[64] __attribute__((aligned(64))) = {
+	unsigned char Sigma[64] = {
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 	};
-	unsigned char m[64] __attribute__((aligned(64)));
-   unsigned char *hash = IV;
+	unsigned char N[64] = {
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	};
+	unsigned char m[64], *hash = IV;
 	unsigned long long len = length;

 	// Stage 2
@@ -964,7 +952,7 @@ static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long

 static void hash_512(const unsigned char *message, unsigned long long length, unsigned char *out)
 {
-	unsigned char IV[64] __attribute__((aligned(64))) = {
+	unsigned char IV[64] = {
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
--- a/algo/gost/sph_gost.h
+++ b/algo/gost/sph_gost.h
@@ -81,9 +81,9 @@ typedef struct {
 */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
-	unsigned char buf[64] __attribute__((aligned(64))); 
-   sph_u32 V[5][8] __attribute__((aligned(64)));
+	unsigned char buf[64];    /* first field, for alignment */
 	size_t ptr;
+	sph_u32 V[5][8];
 #endif
 } sph_gost512_context;

--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -67,12 +67,8 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
     ctx->chaining[i] = _mm_setzero_si128();
     ctx->buffer[i]   = _mm_setzero_si128();
  }
-
-  // The only non-zero in the IV is len. It can be hard coded.
-  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
-//  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-//  INIT(ctx->chaining);
-
+  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
+  INIT(ctx->chaining);
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

--- a/algo/groestl/groestl512-hash-4way.c
+++ b/algo/groestl/groestl512-hash-4way.c
@@ -42,12 +42,9 @@ int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
     ctx->buffer[i]   = m512_zero;
  }

-  // The only non-zero in the IV is len. It can be hard coded.
-  ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
-//  uint64_t len = U64BIG((uint64_t)LENGTH);
-//  ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
-//  INIT_4way(ctx->chaining);
-
+  uint64_t len = U64BIG((uint64_t)LENGTH);
+  ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
+  INIT_4way(ctx->chaining);
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

--- a/algo/groestl/groestl512-intr-4way.h
+++ b/algo/groestl/groestl512-intr-4way.h
@@ -115,7 +115,7 @@ __m512i ALL_FF;
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
+  b1 = ALL_1B;\
  MUL2(a0, b0, b1);\
  a0 = _mm512_xor_si512(a0, TEMP0);\
  MUL2(a1, b0, b1);\
@@ -276,7 +276,7 @@ __m512i ALL_FF;
  for ( round_counter = 0; round_counter < 14; round_counter += 2) \
  { \
    /* AddRoundConstant Q1024 */\
-    xmm1 = m512_neg1;\
+    xmm1 = ALL_FF;\
    xmm8  = _mm512_xor_si512( xmm8,  xmm1 );\
    xmm9  = _mm512_xor_si512( xmm9,  xmm1 );\
    xmm10 = _mm512_xor_si512( xmm10, xmm1 );\
@@ -298,7 +298,7 @@ __m512i ALL_FF;
    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
    \
    /* AddRoundConstant Q1024 */\
-    xmm9 = m512_neg1;\
+    xmm9 = ALL_FF;\
    xmm0 = _mm512_xor_si512( xmm0, xmm9 );\
    xmm1 = _mm512_xor_si512( xmm1, xmm9 );\
    xmm2 = _mm512_xor_si512( xmm2, xmm9 );\
--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -150,8 +150,10 @@ int scanhash_bastion( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[19], n);
 		bastionhash(hash32, endiandata);
 		if (hash32[7] < Htarg && fulltest(hash32, ptarget)) {
+			work_set_target_ratio(work, hash32);
+			*hashes_done = n - first_nonce + 1;
 			pdata[19] = n;
-         submit_solution( work, hash32, mythr );
+			return true;
 		}
 		n++;

--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -117,6 +117,9 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,

        jha_kec_midstate( endiandata );

+#ifdef DEBUG_ALGO
+	printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
 	for (int m=0; m < 6; m++) {
 		if (Htarg <= htmax[m]) {
 			uint32_t mask = masks[m];
@@ -124,9 +127,25 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
 				pdata[19] = ++n;
 				be32enc(&endiandata[19], n);
 				jha_hash(hash32, endiandata);
-				if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) 
-               submit_solution( work, hash32, mythr );
+#ifndef DEBUG_ALGO
+				if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) {
+					work_set_target_ratio(work, hash32);
+					*hashes_done = n - first_nonce + 1;
+					return 1;
+				}
+#else
+				if (!(n % 0x1000) && !thr_id) printf(".");
+				if (!(hash32[7] & mask)) {
+					printf("[%d]",thr_id);
+					if (fulltest(hash32, ptarget)) {
+						work_set_target_ratio(work, hash32);
+						*hashes_done = n - first_nonce + 1;
+						return 1;
+					}
+				}
+#endif
 			} while (n < max_nonce && !work_restart[thr_id].restart);
+			// see blake.c if else to understand the loop on htmax => mask
 			break;
 		}
 	}
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -196,6 +196,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
    __m512i* in    = (__m512i*)rowIn;
    __m512i* inout = (__m512i*)rowInOut;
    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    __m512i  t0, t1, t2;

    state0 = _mm512_load_si512( (__m512i*)State     );
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -217,27 +218,24 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
       out[1] = _mm512_xor_si512( state1, in[1] );
       out[2] = _mm512_xor_si512( state2, in[2] );

-      {
-        register __m512i t0, t1, t2;
-       
-        //M[row*][col] = M[row*][col] XOR rotW(rand)
-        t0 = _mm512_permutex_epi64( state0, 0x93 );
-        t1 = _mm512_permutex_epi64( state1, 0x93 );
-        t2 = _mm512_permutex_epi64( state2, 0x93 );
+       //M[row*][col] = M[row*][col] XOR rotW(rand)
+       t0 = _mm512_permutex_epi64( state0, 0x93 );
+       t1 = _mm512_permutex_epi64( state1, 0x93 );
+       t2 = _mm512_permutex_epi64( state2, 0x93 );

-        inout[0] = _mm512_xor_si512( inout[0],
-                                 _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
-        inout[1] = _mm512_xor_si512( inout[1],
-                                 _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
-        inout[2] = _mm512_xor_si512( inout[2],
-                                 _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
-      }
+       inout[0] = _mm512_xor_si512( inout[0],
+                                 _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+       inout[1] = _mm512_xor_si512( inout[1],
+                                 _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+       inout[2] = _mm512_xor_si512( inout[2],
+                                 _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );

-      //Inputs: next column (i.e., next block in sequence)
-      in    += BLOCK_LEN_M256I;
-      inout += BLOCK_LEN_M256I;
-      //Output: goes to previous column
-      out   -= BLOCK_LEN_M256I;
+
+       //Inputs: next column (i.e., next block in sequence)
+       in    += BLOCK_LEN_M256I;
+       inout += BLOCK_LEN_M256I;
+       //Output: goes to previous column
+       out   -= BLOCK_LEN_M256I;
    }

    _mm512_store_si512( (__m512i*)State,     state0 );
@@ -246,6 +244,9 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

+// big ugly workaround for pointer aliasing, use a union of pointers.
+// Access matrix using m512i for in and out, m256i for inout
+
 inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
                            uint64_t *rowInOut0, uint64_t *rowInOut1,
                            uint64_t *rowOut, uint64_t nCols)
@@ -256,81 +257,95 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
   __m256i *inout0 = (__m256i*)rowInOut0;
   __m256i *inout1 = (__m256i*)rowInOut1;
   __m512i *out = (__m512i*)rowOut;
-   register __m512i io0, io1, io2;
+   __m512i io[3];
+   povly inout;
+   inout.v512 = &io[0];
+    __m512i t0, t1, t2;

   state0 = _mm512_load_si512( (__m512i*)State     );
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
   state2 = _mm512_load_si512( (__m512i*)State + 2 );
   state3 = _mm512_load_si512( (__m512i*)State + 3 );
    
-   for ( i = 0; i < nCols; i++ )
-   {
-     //Absorbing "M[prev] [+] M[row*]"
-     io0 = _mm512_mask_blend_epi64( 0xf0,
-                 _mm512_load_si512( (__m512i*)inout0 ),
-                 _mm512_load_si512( (__m512i*)inout1 ) );
-     io1 = _mm512_mask_blend_epi64( 0xf0,
-                 _mm512_load_si512( (__m512i*)inout0 +1 ),
-                 _mm512_load_si512( (__m512i*)inout1 +1 ) );
-     io2 = _mm512_mask_blend_epi64( 0xf0,
-                 _mm512_load_si512( (__m512i*)inout0 +2 ),
-                 _mm512_load_si512( (__m512i*)inout1 +2 ) );
+    _mm_prefetch( in,     _MM_HINT_T0 );
+    _mm_prefetch( inout0,     _MM_HINT_T0 );
+    _mm_prefetch( inout1,     _MM_HINT_T0 );
+    _mm_prefetch( in     + 2, _MM_HINT_T0 );
+    _mm_prefetch( inout0 + 2, _MM_HINT_T0 );
+    _mm_prefetch( inout1 + 2, _MM_HINT_T0 );
+    _mm_prefetch( in     + 4, _MM_HINT_T0 );
+    _mm_prefetch( inout0 + 4, _MM_HINT_T0 );
+    _mm_prefetch( inout1 + 4, _MM_HINT_T0 );
+    _mm_prefetch( in     + 6, _MM_HINT_T0 );
+    _mm_prefetch( inout0 + 6, _MM_HINT_T0 );
+    _mm_prefetch( inout1 + 6, _MM_HINT_T0 );

-     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
-     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
-     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2 ) );
+    
+    for ( i = 0; i < nCols; i++ )
+    {

-     //Applies the reduced-round transformation f to the sponge's state
-     LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+      //Absorbing "M[prev] [+] M[row*]"
+      inout.v256[0] = inout0[0];
+      inout.v256[1] = inout1[1];
+      inout.v256[2] = inout0[2];
+      inout.v256[3] = inout1[3];
+      inout.v256[4] = inout0[4];
+      inout.v256[5] = inout1[5];

-     {
-       register __m512i t0, t1, t2;
+      state0 = _mm512_xor_si512( state0,
+                                 _mm512_add_epi64( in[0], inout.v512[0] ) );
+      state1 = _mm512_xor_si512( state1,
+                                 _mm512_add_epi64( in[1], inout.v512[1] ) );
+      state2 = _mm512_xor_si512( state2,
+                                 _mm512_add_epi64( in[2], inout.v512[2] ) );

-       //M[rowOut][col] = M[rowOut][col] XOR rand
-       t0 = _mm512_xor_si512( out[0], state0 );
-       t1 = _mm512_xor_si512( out[1], state1 );
-       t2 = _mm512_xor_si512( out[2], state2 );

-       // if out is the same row as inout, update with new data.
-       if ( rowOut == rowInOut0 )
-       {
-          io0 = _mm512_mask_blend_epi64( 0x0f, io0, t0 );
-          io1 = _mm512_mask_blend_epi64( 0x0f, io1, t1 );
-          io2 = _mm512_mask_blend_epi64( 0x0f, io2, t2 );
-       }
-       if ( rowOut == rowInOut1 )
-       {
-          io0 = _mm512_mask_blend_epi64( 0xf0, io0, t0 );
-          io1 = _mm512_mask_blend_epi64( 0xf0, io1, t1 );
-          io2 = _mm512_mask_blend_epi64( 0xf0, io2, t2 );
-       }
+      //Applies the reduced-round transformation f to the sponge's state
+      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );

-       out[0] = t0;
-       out[1] = t1;
-       out[2] = t2;
+      //M[rowOut][col] = M[rowOut][col] XOR rand
+      out[0] = _mm512_xor_si512( out[0], state0 );
+      out[1] = _mm512_xor_si512( out[1], state1 );
+      out[2] = _mm512_xor_si512( out[2], state2 );

-       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
-       t0 = _mm512_permutex_epi64( state0, 0x93 );
-       t1 = _mm512_permutex_epi64( state1, 0x93 );
-       t2 = _mm512_permutex_epi64( state2, 0x93 );
+      // if inout is the same row as out it was just overwritten, reload.
+      if ( rowOut == rowInOut0 )
+      {
+         inout.v256[0] = inout0[0];
+         inout.v256[2] = inout0[2];
+         inout.v256[4] = inout0[4];
+      }
+      if ( rowOut == rowInOut1 )
+      {
+         inout.v256[1] = inout1[1];
+         inout.v256[3] = inout1[3];
+         inout.v256[5] = inout1[5];
+      }

-       io0 = _mm512_xor_si512( io0, _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
-       io1 = _mm512_xor_si512( io1, _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
-       io2 = _mm512_xor_si512( io2, _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
-     }
+      //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+      t0 = _mm512_permutex_epi64( state0, 0x93 );
+      t1 = _mm512_permutex_epi64( state1, 0x93 );
+      t2 = _mm512_permutex_epi64( state2, 0x93 );

-     _mm512_mask_store_epi64( (__m512i*)inout0,    0x0f, io0 );
-     _mm512_mask_store_epi64( (__m512i*)inout1,    0xf0, io0 );
-     _mm512_mask_store_epi64( (__m512i*)inout0 +1, 0x0f, io1 );
-     _mm512_mask_store_epi64( (__m512i*)inout1 +1, 0xf0, io1 );
-     _mm512_mask_store_epi64( (__m512i*)inout0 +2, 0x0f, io2 );
-     _mm512_mask_store_epi64( (__m512i*)inout1 +2, 0xf0, io2 );
+      inout.v512[0] = _mm512_xor_si512( inout.v512[0],
+                                   _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+      inout.v512[1] = _mm512_xor_si512( inout.v512[1],
+                                   _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+      inout.v512[2] = _mm512_xor_si512( inout.v512[2],
+                                   _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
+      
+      inout0[0] = inout.v256[0];
+      inout1[1] = inout.v256[1];
+      inout0[2] = inout.v256[2];
+      inout1[3] = inout.v256[3];
+      inout0[4] = inout.v256[4];
+      inout1[5] = inout.v256[5];

-      //Goes to next block
-      in     += BLOCK_LEN_M256I;
-      inout0 += BLOCK_LEN_M256I * 2;
-      inout1 += BLOCK_LEN_M256I * 2;
-      out    += BLOCK_LEN_M256I;
+       //Goes to next block
+       in     += BLOCK_LEN_M256I;
+       inout0 += BLOCK_LEN_M256I * 2;
+       inout1 += BLOCK_LEN_M256I * 2;
+       out    += BLOCK_LEN_M256I;
   }

   _mm512_store_si512( (__m512i*)State,     state0 );
--- a/algo/nist5/nist5.c
+++ b/algo/nist5/nist5.c
@@ -25,6 +25,7 @@ void nist5hash(void *output, const void *input)
   sph_skein512_context    ctx_skein;
   sph_jh512_context       ctx_jh;
   sph_keccak512_context   ctx_keccak;
+   uint32_t mask = 8;

   sph_blake512_init( &ctx_blake );
   sph_blake512( &ctx_blake, input, 80 );
@@ -58,10 +59,10 @@ void nist5hash(void *output, const void *input)
 int scanhash_nist5( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr)
 {
-   uint32_t endiandata[20] __attribute__((aligned(64)));
-   uint32_t hash64[8] __attribute__((aligned(32)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+        uint32_t endiandata[20] __attribute__((aligned(64)));
+        uint32_t hash64[8] __attribute__((aligned(32)));
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   int thr_id = mythr->id;  // thr_id arg is deprecated
@@ -87,6 +88,9 @@ int scanhash_nist5( struct work *work, uint32_t max_nonce,
 	// we need bigendian data...
        swab32_array( endiandata, pdata, 20 );

+#ifdef DEBUG_ALGO
+	printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
 	for (int m=0; m < 6; m++) {
 		if (Htarg <= htmax[m]) {
 			uint32_t mask = masks[m];
@@ -94,9 +98,24 @@ int scanhash_nist5( struct work *work, uint32_t max_nonce,
 				pdata[19] = ++n;
 				be32enc(&endiandata[19], n);
 				nist5hash(hash64, endiandata);
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget))
-                submit_solution( work, hash64, mythr );
+#ifndef DEBUG_ALGO
+				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
+					*hashes_done = n - first_nonce + 1;
+					return true;
+				}
+#else
+				if (!(n % 0x1000) && !thr_id) printf(".");
+				if (!(hash64[7] & mask)) {
+					printf("[%d]",thr_id);
+					if (fulltest(hash64, ptarget)) {
+                                                work_set_target_ratio( work, hash64 );
+						*hashes_done = n - first_nonce + 1;
+						return true;
+					}
+				}
+#endif
 			} while (n < max_nonce && !work_restart[thr_id].restart);
+			// see blake.c if else to understand the loop on htmax => mask
 			break;
 		}
 	}
--- a/algo/nist5/zr5.c
+++ b/algo/nist5/zr5.c
@@ -142,7 +142,11 @@ int scanhash_zr5( struct work *work, uint32_t max_nonce,
       {
         pdata[0] = tmpdata[0];
         pdata[19] = nonce;
-         submit_solution( work, hash, mythr );
+         *hashes_done = pdata[19] - first_nonce + 1;
+         work_set_target_ratio( work, hash );
+         if (opt_debug)
+           applog(LOG_INFO, "found nonce %x", nonce);
+         return 1;
       }
    }
    nonce++;
--- a/algo/panama/panama-hash-4way.c
+++ b/algo/panama/panama-hash-4way.c
@@ -6,25 +6,26 @@

 #define M17( macro ) \
 do { \
-      macro(  0,  1,  2,  4 ); \
-      macro(  1,  2,  3,  5 ); \
-      macro(  2,  3,  4,  6 ); \
-      macro(  3,  4,  5,  7 ); \
-      macro(  4,  5,  6,  8 ); \
-      macro(  5,  6,  7,  9 ); \
-      macro(  6,  7,  8, 10 ); \
-      macro(  7,  8,  9, 11 ); \
-      macro(  8,  9, 10, 12 ); \
-      macro(  9, 10, 11, 13 ); \
-      macro( 10, 11, 12, 14 ); \
-      macro( 11, 12, 13, 15 ); \
-      macro( 12, 13, 14, 16 ); \
-      macro( 13, 14, 15,  0 ); \
-      macro( 14, 15, 16,  1 ); \
-      macro( 15, 16,  0,  2 ); \
-      macro( 16,  0,  1,  3 ); \
+      macro( 0,  1,  2,  4); \
+      macro( 1,  2,  3,  5); \
+      macro( 2,  3,  4,  6); \
+      macro( 3,  4,  5,  7); \
+      macro( 4,  5,  6,  8); \
+      macro( 5,  6,  7,  9); \
+      macro( 6,  7,  8, 10); \
+      macro( 7,  8,  9, 11); \
+      macro( 8,  9, 10, 12); \
+      macro( 9, 10, 11, 13); \
+      macro(10, 11, 12, 14); \
+      macro(11, 12, 13, 15); \
+      macro(12, 13, 14, 16); \
+      macro(13, 14, 15,  0); \
+      macro(14, 15, 16,  1); \
+      macro(15, 16,  0,  2); \
+      macro(16,  0,  1,  3); \
 } while (0)

+
 #define RSTATE(n0, n1, n2, n4)    (a ## n0 = sc->state[n0])

 #define WSTATE(n0, n1, n2, n4)    (sc->state[n0] = a ## n0)
@@ -49,7 +50,9 @@ do { \

 #define LVARS_4W   \
 	LVAR17_4W(a) \
-	LVAR17_4W(g)
+	LVAR17_4W(g) \
+	LVAR17_4W(p) \
+	LVAR17_4W(t)

 #define BUPDATE1_4W( n0, n2 ) \
 do { \
@@ -60,14 +63,14 @@ do { \

 #define BUPDATE_4W \
 do { \
-		BUPDATE1_4W( 0, 2 ); \
-		BUPDATE1_4W( 1, 3 ); \
-		BUPDATE1_4W( 2, 4 ); \
-		BUPDATE1_4W( 3, 5 ); \
-		BUPDATE1_4W( 4, 6 ); \
-		BUPDATE1_4W( 5, 7 ); \
-		BUPDATE1_4W( 6, 0 ); \
-		BUPDATE1_4W( 7, 1 ); \
+		BUPDATE1_4W(0, 2); \
+		BUPDATE1_4W(1, 3); \
+		BUPDATE1_4W(2, 4); \
+		BUPDATE1_4W(3, 5); \
+		BUPDATE1_4W(4, 6); \
+		BUPDATE1_4W(5, 7); \
+		BUPDATE1_4W(6, 0); \
+		BUPDATE1_4W(7, 1); \
 } while (0)

 #define GAMMA_4W(n0, n1, n2, n4)   \
@@ -75,46 +78,46 @@ do { \
                             _mm_or_si128( a ## n1, mm128_not( a ## n2 ) ) ) )

 #define PI_ALL_4W   do { \
-      a0  = g0; \
-      a1  = mm128_rol_32( g7,   1 ); \
-      a2  = mm128_rol_32( g14,  3 ); \
-      a3  = mm128_rol_32( g4,   6 ); \
-      a4  = mm128_rol_32( g11, 10 ); \
-      a5  = mm128_rol_32( g1,  15 ); \
-      a6  = mm128_rol_32( g8,  21 ); \
-      a7  = mm128_rol_32( g15, 28 ); \
-      a8  = mm128_rol_32( g5,   4 ); \
-      a9  = mm128_rol_32( g12, 13 ); \
-      a10 = mm128_rol_32( g2,  23 ); \
-      a11 = mm128_rol_32( g9,   2 ); \
-      a12 = mm128_rol_32( g16, 14 ); \
-      a13 = mm128_rol_32( g6,  27 ); \
-      a14 = mm128_rol_32( g13,  9 ); \
-      a15 = mm128_rol_32( g3,  24 ); \
-      a16 = mm128_rol_32( g10,  8 ); \
+      p0  = g0; \
+      p1  = mm128_rol_32(  g7,  1 ); \
+      p2  = mm128_rol_32( g14,  3 ); \
+      p3  = mm128_rol_32(  g4,  6 ); \
+      p4  = mm128_rol_32( g11, 10 ); \
+      p5  = mm128_rol_32(  g1, 15 ); \
+      p6  = mm128_rol_32(  g8, 21 ); \
+      p7  = mm128_rol_32( g15, 28 ); \
+      p8  = mm128_rol_32(  g5,  4 ); \
+      p9  = mm128_rol_32( g12, 13 ); \
+      p10 = mm128_rol_32(  g2, 23 ); \
+      p11 = mm128_rol_32(  g9,  2 ); \
+      p12 = mm128_rol_32( g16, 14 ); \
+      p13 = mm128_rol_32(  g6, 27 ); \
+      p14 = mm128_rol_32( g13,  9 ); \
+      p15 = mm128_rol_32(  g3, 24 ); \
+      p16 = mm128_rol_32( g10,  8 ); \
   } while (0)

 #define THETA_4W(n0, n1, n2, n4)   \
-   ( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) )
+   ( t ## n0 = _mm_xor_si128( p ## n0, _mm_xor_si128( p ## n1, p ## n4 ) ) )

 #define SIGMA_ALL_4W   do { \
-		a0 = _mm_xor_si128( g0, m128_one_32 ); \
-		a1 = _mm_xor_si128( g1, INW2( 0 ) ); \
-		a2 = _mm_xor_si128( g2, INW2( 1 ) ); \
-		a3 = _mm_xor_si128( g3, INW2( 2 ) ); \
-		a4 = _mm_xor_si128( g4, INW2( 3 ) ); \
-		a5 = _mm_xor_si128( g5, INW2( 4 ) ); \
-		a6 = _mm_xor_si128( g6, INW2( 5 ) ); \
-		a7 = _mm_xor_si128( g7, INW2( 6 ) ); \
-		a8 = _mm_xor_si128( g8, INW2( 7 ) ); \
-		a9  = _mm_xor_si128( g9,  sc->buffer[ ptr16 ][0] ); \
-		a10 = _mm_xor_si128( g10, sc->buffer[ ptr16 ][1] ); \
-		a11 = _mm_xor_si128( g11, sc->buffer[ ptr16 ][2] ); \
-		a12 = _mm_xor_si128( g12, sc->buffer[ ptr16 ][3] ); \
-		a13 = _mm_xor_si128( g13, sc->buffer[ ptr16 ][4] ); \
-		a14 = _mm_xor_si128( g14, sc->buffer[ ptr16 ][5] ); \
-		a15 = _mm_xor_si128( g15, sc->buffer[ ptr16 ][6] ); \
-		a16 = _mm_xor_si128( g16, sc->buffer[ ptr16 ][7] ); \
+		a0 = _mm_xor_si128( t0, _mm_set1_epi32( 1 ) ); \
+		a1 = _mm_xor_si128( t1, INW2( 0 ) ); \
+		a2 = _mm_xor_si128( t2, INW2( 1 ) ); \
+		a3 = _mm_xor_si128( t3, INW2( 2 ) ); \
+		a4 = _mm_xor_si128( t4, INW2( 3 ) ); \
+		a5 = _mm_xor_si128( t5, INW2( 4 ) ); \
+		a6 = _mm_xor_si128( t6, INW2( 5 ) ); \
+		a7 = _mm_xor_si128( t7, INW2( 6 ) ); \
+		a8 = _mm_xor_si128( t8, INW2( 7 ) ); \
+		a9  = _mm_xor_si128(  t9, sc->buffer[ ptr16 ] [0 ] ); \
+		a10 = _mm_xor_si128( t10, sc->buffer[ ptr16 ] [1 ] ); \
+		a11 = _mm_xor_si128( t11, sc->buffer[ ptr16 ] [2 ] ); \
+		a12 = _mm_xor_si128( t12, sc->buffer[ ptr16 ] [3 ] ); \
+		a13 = _mm_xor_si128( t13, sc->buffer[ ptr16 ] [4 ] ); \
+		a14 = _mm_xor_si128( t14, sc->buffer[ ptr16 ] [5 ] ); \
+		a15 = _mm_xor_si128( t15, sc->buffer[ ptr16 ] [6 ] ); \
+		a16 = _mm_xor_si128( t16, sc->buffer[ ptr16 ] [7 ] ); \
 	} while (0)

 #define PANAMA_STEP_4W   do { \
@@ -142,9 +145,9 @@ panama_4way_push( panama_4way_context *sc, const unsigned char *pbuf,
 #define INW2(i)   INW1(i)

 	M17( RSTATE );
+
   ptr0 = sc->buffer_ptr;
-	while ( num-- > 0 )
-   {
+	while (num -- > 0) {
 		PANAMA_STEP_4W;
 		pbuf = (const unsigned char *)pbuf + 32*4;
 	}
@@ -170,11 +173,14 @@ panama_4way_pull( panama_4way_context *sc, unsigned num )
 #define INW2(i)     casti_m128i( sc->buffer[ptr4], i )

 	M17( RSTATE );
+
   ptr0 = sc->buffer_ptr;
-   while ( num-- > 0 )
-   {
+
+   while (num -- > 0) {
 		unsigned ptr4;
+
 		ptr4 = ( (ptr0 + 4) & 31 );
+
      PANAMA_STEP_4W;
 	}
 	M17( WSTATE );
@@ -186,11 +192,18 @@ panama_4way_pull( panama_4way_context *sc, unsigned num )
 }

 void
-panama_4way_init( void *cc )
+panama_4way_init(void *cc)
 {
 	panama_4way_context *sc;

 	sc = cc;
+	/*
+	 * This is not completely conformant, but "it will work
+	 * everywhere". Initial state consists of zeroes everywhere.
+	 * Conceptually, the sph_u32 type may have padding bits which
+	 * must not be set to 0; but such an architecture remains to
+	 * be seen.
+	 */
 	sc->data_ptr = 0;
 	memset( sc->buffer, 0, sizeof sc->buffer );
 	sc->buffer_ptr = 0;
@@ -204,8 +217,7 @@ panama_4way_short( void *cc, const void *data, size_t len )
 	unsigned current;
 	sc = cc;
 	current = sc->data_ptr;
-	while ( len > 0 )
-   {
+	while (len > 0) {
 		unsigned clen;

 		clen = ( (sizeof sc->data ) >> 2 ) - current;
@@ -271,8 +283,11 @@ panama_4way_close( void *cc, void *dst )
 	*(__m128i*)( sc->data + current ) = m128_one_32;
   current++;
   memset_zero_128( (__m128i*)sc->data + current, 32 - current );
+
   panama_4way_push( sc, sc->data, 1 );
+
   panama_4way_pull( sc, 32 );
+
   for ( i = 0; i < 8; i ++ )
      casti_m128i( dst, i ) = sc->state[i + 9];
 }
@@ -291,7 +306,9 @@ panama_4way_close( void *cc, void *dst )

 #define LVARS_8W   \
   LVAR17_8W(a) \
-   LVAR17_8W(g)
+   LVAR17_8W(g) \
+   LVAR17_8W(p) \
+   LVAR17_8W(t)

 #define BUPDATE1_8W( n0, n2 ) \
 do { \
@@ -302,14 +319,14 @@ do { \

 #define BUPDATE_8W \
 do { \
-      BUPDATE1_8W( 0, 2 ); \
-      BUPDATE1_8W( 1, 3 ); \
-      BUPDATE1_8W( 2, 4 ); \
-      BUPDATE1_8W( 3, 5 ); \
-      BUPDATE1_8W( 4, 6 ); \
-      BUPDATE1_8W( 5, 7 ); \
-      BUPDATE1_8W( 6, 0 ); \
-      BUPDATE1_8W( 7, 1 ); \
+      BUPDATE1_8W(0, 2); \
+      BUPDATE1_8W(1, 3); \
+      BUPDATE1_8W(2, 4); \
+      BUPDATE1_8W(3, 5); \
+      BUPDATE1_8W(4, 6); \
+      BUPDATE1_8W(5, 7); \
+      BUPDATE1_8W(6, 0); \
+      BUPDATE1_8W(7, 1); \
 } while (0)

 #define GAMMA_8W(n0, n1, n2, n4)   \
@@ -317,47 +334,46 @@ do { \
                         _mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) )

 #define PI_ALL_8W   do { \
-      a0  = g0; \
-      a1  = mm256_rol_32( g7,   1 ); \
-      a2  = mm256_rol_32( g14,  3 ); \
-      a3  = mm256_rol_32( g4,   6 ); \
-      a4  = mm256_rol_32( g11, 10 ); \
-      a5  = mm256_rol_32( g1,  15 ); \
-      a6  = mm256_rol_32( g8,  21 ); \
-      a7  = mm256_rol_32( g15, 28 ); \
-      a8  = mm256_rol_32( g5,   4 ); \
-      a9  = mm256_rol_32( g12, 13 ); \
-      a10 = mm256_rol_32( g2,  23 ); \
-      a11 = mm256_rol_32( g9,   2 ); \
-      a12 = mm256_rol_32( g16, 14 ); \
-      a13 = mm256_rol_32( g6,  27 ); \
-      a14 = mm256_rol_32( g13,  9 ); \
-      a15 = mm256_rol_32( g3,  24 ); \
-      a16 = mm256_rol_32( g10,  8 ); \
+      p0  = g0; \
+      p1  = mm256_rol_32(  g7,  1 ); \
+      p2  = mm256_rol_32( g14,  3 ); \
+      p3  = mm256_rol_32(  g4,  6 ); \
+      p4  = mm256_rol_32( g11, 10 ); \
+      p5  = mm256_rol_32(  g1, 15 ); \
+      p6  = mm256_rol_32(  g8, 21 ); \
+      p7  = mm256_rol_32( g15, 28 ); \
+      p8  = mm256_rol_32(  g5,  4 ); \
+      p9  = mm256_rol_32( g12, 13 ); \
+      p10 = mm256_rol_32(  g2, 23 ); \
+      p11 = mm256_rol_32(  g9,  2 ); \
+      p12 = mm256_rol_32( g16, 14 ); \
+      p13 = mm256_rol_32(  g6, 27 ); \
+      p14 = mm256_rol_32( g13,  9 ); \
+      p15 = mm256_rol_32(  g3, 24 ); \
+      p16 = mm256_rol_32( g10,  8 ); \
   } while (0)

 #define THETA_8W(n0, n1, n2, n4)   \
-   ( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
-                                                            a ## n4 ) ) )
+   ( t ## n0 = _mm256_xor_si256( p ## n0, _mm256_xor_si256( p ## n1, p ## n4 ) ) )

 #define SIGMA_ALL_8W   do { \
-      a0  = _mm256_xor_si256( g0, m256_one_32 ); \
-      a1  = _mm256_xor_si256( g1, INW2( 0 ) ); \
-      a2  = _mm256_xor_si256( g2, INW2( 1 ) ); \
-      a3  = _mm256_xor_si256( g3, INW2( 2 ) ); \
-      a4  = _mm256_xor_si256( g4, INW2( 3 ) ); \
-      a5  = _mm256_xor_si256( g5, INW2( 4 ) ); \
-      a6  = _mm256_xor_si256( g6, INW2( 5 ) ); \
-      a7  = _mm256_xor_si256( g7, INW2( 6 ) ); \
-      a8  = _mm256_xor_si256( g8, INW2( 7 ) ); \
-      a9  = _mm256_xor_si256( g9,  sc->buffer[ ptr16 ][0] ); \
-      a10 = _mm256_xor_si256( g10, sc->buffer[ ptr16 ][1] ); \
-      a11 = _mm256_xor_si256( g11, sc->buffer[ ptr16 ][2] ); \
-      a12 = _mm256_xor_si256( g12, sc->buffer[ ptr16 ][3] ); \
-      a13 = _mm256_xor_si256( g13, sc->buffer[ ptr16 ][4] ); \
-      a14 = _mm256_xor_si256( g14, sc->buffer[ ptr16 ][5] ); \
-      a15 = _mm256_xor_si256( g15, sc->buffer[ ptr16 ][6] ); \
-      a16 = _mm256_xor_si256( g16, sc->buffer[ ptr16 ][7] ); \
+      a0  = _mm256_xor_si256( t0, m256_one_32 ); \
+      a1  = _mm256_xor_si256( t1, INW2( 0 ) ); \
+      a2  = _mm256_xor_si256( t2, INW2( 1 ) ); \
+      a3  = _mm256_xor_si256( t3, INW2( 2 ) ); \
+      a4  = _mm256_xor_si256( t4, INW2( 3 ) ); \
+      a5  = _mm256_xor_si256( t5, INW2( 4 ) ); \
+      a6  = _mm256_xor_si256( t6, INW2( 5 ) ); \
+      a7  = _mm256_xor_si256( t7, INW2( 6 ) ); \
+      a8  = _mm256_xor_si256( t8, INW2( 7 ) ); \
+      a9  = _mm256_xor_si256(  t9, sc->buffer[ ptr16 ] [0 ] ); \
+      a10 = _mm256_xor_si256( t10, sc->buffer[ ptr16 ] [1 ] ); \
+      a11 = _mm256_xor_si256( t11, sc->buffer[ ptr16 ] [2 ] ); \
+      a12 = _mm256_xor_si256( t12, sc->buffer[ ptr16 ] [3 ] ); \
+      a13 = _mm256_xor_si256( t13, sc->buffer[ ptr16 ] [4 ] ); \
+      a14 = _mm256_xor_si256( t14, sc->buffer[ ptr16 ] [5 ] ); \
+      a15 = _mm256_xor_si256( t15, sc->buffer[ ptr16 ] [6 ] ); \
+      a16 = _mm256_xor_si256( t16, sc->buffer[ ptr16 ] [7 ] ); \
   } while (0)

 #define PANAMA_STEP_8W   do { \
@@ -385,6 +401,7 @@ panama_8way_push( panama_8way_context *sc, const unsigned char *pbuf,
 #define INW2(i)   INW1(i)

   M17( RSTATE );
+
   ptr0 = sc->buffer_ptr;
   while ( num-- > 0 )
   {
@@ -412,9 +429,9 @@ panama_8way_pull( panama_8way_context *sc, unsigned num )

   ptr0 = sc->buffer_ptr;

-   while ( num-- > 0 )
-   {
+   while (num -- > 0) {
      unsigned ptr4;
+
      ptr4 = ( (ptr0 + 4) & 31 );
      PANAMA_STEP_8W;
   }
@@ -432,6 +449,13 @@ panama_8way_init( void *cc )
   panama_8way_context *sc;

   sc = cc;
+   /*
+    * This is not completely conformant, but "it will work
+    * everywhere". Initial state consists of zeroes everywhere.
+    * Conceptually, the sph_u32 type may have padding bits which
+    * must not be set to 0; but such an architecture remains to
+    * be seen.
+    */
   sc->data_ptr = 0;
   memset( sc->buffer, 0, sizeof sc->buffer );
   sc->buffer_ptr = 0;
@@ -445,8 +469,7 @@ panama_8way_short( void *cc, const void *data, size_t len )
   unsigned current;
   sc = cc;
   current = sc->data_ptr;
-   while ( len > 0 )
-   {
+   while (len > 0) {
      unsigned clen;

      clen = ( (sizeof sc->data ) >> 3 ) - current;
@@ -463,6 +486,7 @@ panama_8way_short( void *cc, const void *data, size_t len )
         panama_8way_push( sc, sc->data, 1 );
      }
   }
+
   sc->data_ptr = current;
 }

@@ -511,7 +535,9 @@ panama_8way_close( void *cc, void *dst )
   *(__m256i*)( sc->data + current ) = m256_one_32;
   current++;
   memset_zero_256( (__m256i*)sc->data + current, 32 - current );
+
   panama_8way_push( sc, sc->data, 1 );
+
   panama_8way_pull( sc, 32 );

   for ( i = 0; i < 8; i ++ )
--- a/algo/quark/anime.c
+++ b/algo/quark/anime.c
@@ -160,12 +160,16 @@ int scanhash_anime( struct work *work, uint32_t max_nonce,
              pdata[19] = n;

             if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) 
-                submit_solution( work, hash, mythr );
+             {
+                work_set_target_ratio( work, hash );
+                *hashes_done = n - first_nonce + 1;
+                return true;
+             }
             n++;
          } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
          break;
       }
-    *hashes_done = n - first_nonce + 1;
+
    pdata[19] = n;
    return 0;
 }
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -330,8 +330,11 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			be32enc(&endiandata[19], n); 
 			hmq1725hash(hash64, endiandata);
 			if (((hash64[7]&0xFFFFFFFF)==0) && 
-					fulltest(hash64, ptarget)) 
-            submit_solution( work, hash64, mythr );
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
+				return true;
+			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
 	} 
 	else if (ptarget[7]<=0xF) 
@@ -341,8 +344,11 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			be32enc(&endiandata[19], n); 
 			hmq1725hash(hash64, endiandata);
 			if (((hash64[7]&0xFFFFFFF0)==0) && 
-					fulltest(hash64, ptarget)) 
-            submit_solution( work, hash64, mythr );
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
+				return true;
+			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
 	} 
 	else if (ptarget[7]<=0xFF) 
@@ -352,8 +358,11 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			be32enc(&endiandata[19], n); 
 			hmq1725hash(hash64, endiandata);
 			if (((hash64[7]&0xFFFFFF00)==0) && 
-					fulltest(hash64, ptarget)) 
-            submit_solution( work, hash64, mythr );
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
+				return true;
+			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
 	} 
 	else if (ptarget[7]<=0xFFF) 
@@ -363,9 +372,13 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			be32enc(&endiandata[19], n); 
 			hmq1725hash(hash64, endiandata);
 			if (((hash64[7]&0xFFFFF000)==0) && 
-					fulltest(hash64, ptarget)) 
-            submit_solution( work, hash64, mythr );
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
+				return true;
+			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
+
 	} 
 	else if (ptarget[7]<=0xFFFF) 
 	{
@@ -374,9 +387,13 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			be32enc(&endiandata[19], n); 
 			hmq1725hash(hash64, endiandata);
 			if (((hash64[7]&0xFFFF0000)==0) && 
-					fulltest(hash64, ptarget)) 
-                submit_solution( work, hash64, mythr );
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
+				return true;
+			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
+
 	} 
 	else 
 	{
@@ -384,10 +401,15 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			pdata[19] = ++n;
 			be32enc(&endiandata[19], n); 
 			hmq1725hash(hash64, endiandata);
-			if (fulltest(hash64, ptarget)) 
-                submit_solution( work, hash64, mythr );
+			if (fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
+				return true;
+			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
 	}
+	
+	
 	*hashes_done = n - first_nonce + 1;
 	pdata[19] = n;
 	return 0;
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -126,11 +126,15 @@ int scanhash_quark( struct work *work, uint32_t max_nonce,
 		pdata[19] = ++n;
 		be32enc(&endiandata[19], n); 
 		quark_hash(hash64, &endiandata);
-      if ((hash64[7]&0xFFFFFF00)==0)
-      {
-         if (fulltest(hash64, ptarget)) 
-                submit_solution( work, hash64, mythr );
-      }
+                if ((hash64[7]&0xFFFFFF00)==0)
+                {
+                  if (fulltest(hash64, ptarget)) 
+                  {
+                    work_set_target_ratio( work, hash64 );
+                    *hashes_done = n - first_nonce + 1;
+		    return true;
+                  }
+               }
 	} while (n < max_nonce && !work_restart[thr_id].restart);
 	
 	*hashes_done = n - first_nonce + 1;
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -92,21 +92,46 @@ int scanhash_deep( struct work *work, uint32_t max_nonce,

        deep_luffa_midstate( endiandata );

+#ifdef DEBUG_ALGO
+	printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
 	for ( int m=0; m < 6; m++ )
-   {
+        {
 	    if ( Htarg <= htmax[m] )
-       {
+            {
 	        uint32_t mask = masks[m];
 	        do
-           {
+                {
 	            pdata[19] = ++n;
-               be32enc( &endiandata[19], n );
-		         deep_hash( hash64, endiandata );
-		         if (!(hash64[7] & mask))
-               if ( fulltest(hash64, ptarget) )
-                   submit_solution( work, hash64, mythr );
-            } while ( n < max_nonce && !work_restart[thr_id].restart );
-          break;
+		    be32enc( &endiandata[19], n );
+		    deep_hash( hash64, endiandata );
+#ifndef DEBUG_ALGO
+                    // The mask test on hash64[7] gates the full fulltest() compare.
+                    if (!(hash64[7] & mask))
+                    {
+                       if ( fulltest(hash64, ptarget) )
+                       {
+                          // Record the share ratio here too, matching the
+                          // DEBUG_ALGO branch of this same loop.
+                          work_set_target_ratio( work, hash64 );
+                          *hashes_done = n - first_nonce + 1;
+                          return true;
+                       }
+                    }
+#else
+                    if (!(n % 0x1000) && !thr_id) printf(".");
+	        	if (!(hash64[7] & mask)) {
+		            printf("[%d]",thr_id);
+			    if (fulltest(hash64, ptarget)) {
+                             work_set_target_ratio( work, hash64 );
+                             *hashes_done = n - first_nonce + 1;
+				return true;
+	                    }
+ 	                }
+#endif
+                } while ( n < max_nonce && !work_restart[thr_id].restart );
+                // see blake.c if else to understand the loop on htmax => mask
+            break;
          } 
        }

--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -104,23 +104,48 @@ int scanhash_qubit( struct work *work,	uint32_t max_nonce,

        qubit_luffa_midstate( endiandata );

+#ifdef DEBUG_ALGO
+	printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
 	for ( int m=0; m < 6; m++ )
-   {
+        {
 	    if ( Htarg <= htmax[m] )
-       {
+            {
 	        uint32_t mask = masks[m];
 	        do
-           {
+                {
 	            pdata[19] = ++n;
-		         be32enc(&endiandata[19], n);
-		         qubit_hash(hash64, endiandata);
-		         if (!(hash64[7] & mask))
-               if ( fulltest(hash64, ptarget) )
-                  submit_solution( work, hash64, mythr );
-           } while ( n < max_nonce && !work_restart[thr_id].restart );
-           break;
-       } 
-   }
+		    be32enc(&endiandata[19], n);
+		    qubit_hash(hash64, endiandata);
+#ifndef DEBUG_ALGO
+                    // The mask test on hash64[7] gates the full fulltest() compare.
+                    if (!(hash64[7] & mask))
+                    {
+                       if ( fulltest(hash64, ptarget) )
+                       {
+                          // Record the share ratio here too, matching the
+                          // DEBUG_ALGO branch of this same loop.
+                          work_set_target_ratio( work, hash64 );
+                          *hashes_done = n - first_nonce + 1;
+                          return true;
+                       }
+                    }
+#else
+                    if (!(n % 0x1000) && !thr_id) printf(".");
+	        	if (!(hash64[7] & mask)) {
+		            printf("[%d]",thr_id);
+			    if (fulltest(hash64, ptarget)) {
+                             work_set_target_ratio( work, hash64 );
+                             *hashes_done = n - first_nonce + 1;
+				return true;
+	                    }
+ 	                }
+#endif
+                } while ( n < max_nonce && !work_restart[thr_id].restart );
+                // see blake.c if else to understand the loop on htmax => mask
+            break;
+          } 
+        }

 	*hashes_done = n - first_nonce + 1;
 	pdata[19] = n;
--- a/algo/scrypt/scrypt.c
+++ b/algo/scrypt/scrypt.c
@@ -753,8 +753,10 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
 		
 		for (i = 0; i < throughput; i++) {
 			if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
+				*hashes_done = n - pdata[19] + 1;
 				pdata[19] = data[i * 20 + 19];
-            submit_solution( work, hash, mythr );
+				work_set_target_ratio( work, hash + i * 8 );  // lane i's hash, the one fulltest() accepted
+				return 1;
 			}
 		}
 	} while (likely(n < max_nonce && !work_restart[thr_id].restart));
--- a/algo/shavite/shavite-hash-4way.c
+++ b/algo/shavite/shavite-hash-4way.c
@@ -1,8 +1,6 @@
 #include "shavite-hash-4way.h"
 #include <stdint.h>

-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
 static const uint32_t IV512[] =
 {
        0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
@@ -11,6 +9,8 @@ static const uint32_t IV512[] =
        0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
 };

+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 #define mm512_ror2x512hi_1x32( a, b ) \
   _mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \
                                    mm512_ror128_32( b ) )
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -1035,7 +1035,7 @@ int simd_4way_update( simd_4way_context *state, const void *data,

  while ( databitlen > 0 )
  {
-    if ( ( current == 0 ) && ( databitlen >= bs ) )
+    if ( current == 0 && databitlen >= bs )
    {
       // We can hash the data directly from the input buffer.
      SIMD_4way_Compress( state, data, 0 );
@@ -1049,13 +1049,13 @@ int simd_4way_update( simd_4way_context *state, const void *data,
      int len = bs - current;
      if ( databitlen < len )
      {
-        memcpy( state->buffer + 4 * (current/8), data, 4 * (databitlen/8) );
+        memcpy( state->buffer + 4*(current/8), data, 4*((databitlen+7)/8) );
        state->count += databitlen;
        return 0;
      }
      else
      {
-        memcpy( state->buffer + 4 * (current / 8), data, 4 * (len / 8) );
+        memcpy( state->buffer + 4*(current/8), data, 4*(len/8) );
        state->count += len;
        databitlen -= len;
        data += 4*(len/8);
@@ -1128,7 +1128,7 @@ int simd_4way_update_close( simd_4way_context *state, void *hashval,
      int len = bs - current;
      if ( databitlen < len )
      {
-        memcpy( state->buffer + 4*( current/8 ), data, 4*( (databitlen)/8 ) );
+        memcpy( state->buffer + 4*( current/8 ), data, 4*( (databitlen+7)/8 ) );
        state->count += databitlen;
        break;
      }
@@ -1149,7 +1149,7 @@ int simd_4way_update_close( simd_4way_context *state, void *hashval,
  // If there is still some data in the buffer, hash it
  if ( current )
  {
-    current = current / 8;
+    current = ( current+7 ) / 8;
    memset( state->buffer + 4*current, 0, 4*( state->blocksize/8 - current) );
    SIMD_4way_Compress( state, state->buffer, 0 );
  }
--- a/algo/whirlpool/whirlpool.c
+++ b/algo/whirlpool/whirlpool.c
@@ -78,12 +78,17 @@ int scanhash_whirlpool( struct work* work, uint32_t max_nonce,
 	do {
 		const uint32_t Htarg = ptarget[7];
 		uint32_t vhash[8];
-      pdata[19] = ++n;
+                pdata[19] = ++n;
 		be32enc(&endiandata[19], n );
 		whirlpool_hash(vhash, endiandata);

 		if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
-             submit_solution( work, vhash, mythr );
+                {
+			work_set_target_ratio(work, vhash);
+                       *hashes_done = n - first_nonce + 1;
+			return true;
+		}
+
 	} while ( n < max_nonce && !work_restart[thr_id].restart);

 	*hashes_done = n - first_nonce + 1;
--- a/algo/whirlpool/whirlpoolx.c
+++ b/algo/whirlpool/whirlpoolx.c
@@ -48,7 +48,11 @@ int scanhash_whirlpoolx( struct work* work, uint32_t max_nonce,
 		whirlpoolx_hash(vhash, endiandata);

 		if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
-          submit_solution( work, vhash, mythr );
+                {
+			work_set_target_ratio(work, vhash);
+                       *hashes_done = n - first_nonce + 1;
+			return true;
+		}

 	} while ( n < max_nonce && !work_restart[thr_id].restart);

--- a/algo/x11/c11.c
+++ b/algo/x11/c11.c
@@ -138,12 +138,16 @@ int scanhash_c11( struct work *work, uint32_t max_nonce,
        swab32_array( endiandata, pdata, 20 );

 	do
-   {
-      pdata[19] = nonce;
+        {
 		be32enc( &endiandata[19], nonce );
 		c11_hash( hash, endiandata );
 		if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
-            submit_solution( work, hash, mythr );
+                {
+			pdata[19] = nonce;
+			*hashes_done = pdata[19] - first_nonce;
+                        work_set_target_ratio( work, hash );
+ 			return 1;
+		}
 		nonce++;
 	} while ( nonce < max_nonce && !(*restart) );
 	pdata[19] = nonce;
--- a/algo/x11/timetravel.c
+++ b/algo/x11/timetravel.c
@@ -293,10 +293,14 @@ int scanhash_timetravel( struct work *work, uint32_t max_nonce,

        if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
        {
+              // Share found: store the nonce, then set the target ratio once.
              pdata[19] = nonce;
-              submit_solution( work, hash, mythr );
-        }
-        nonce++;
+              *hashes_done = pdata[19] - first_nonce;
+              work_set_target_ratio( work, hash );
+              return 1;
+         }
+         nonce++;
+
        } while (nonce < max_nonce && !(*restart));

        pdata[19] = nonce;
--- a/algo/x11/timetravel10.c
+++ b/algo/x11/timetravel10.c
@@ -334,10 +334,14 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,

        if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
        {
+              // Share found: store the nonce, then set the target ratio once.
              pdata[19] = nonce;
-              submit_solution( work, hash, mythr );
-        }
-        nonce++;
+              work_set_target_ratio( work, hash );
+              *hashes_done = pdata[19] - first_nonce;
+              return 1;
+         }
+         nonce++;
+
        } while (nonce < max_nonce && !(*restart));

        pdata[19] = nonce;
--- a/algo/x11/tribus.c
+++ b/algo/x11/tribus.c
@@ -98,6 +98,9 @@ int scanhash_tribus( struct work *work, uint32_t max_nonce,
        sph_jh512_init( &tribus_ctx.jh );
        sph_jh512( &tribus_ctx.jh, endiandata, 64 );

+#ifdef DEBUG_ALGO
+	printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
 	for (int m=0; m < 6; m++) {
 		if (Htarg <= htmax[m]) {
 			uint32_t mask = masks[m];
@@ -105,9 +108,25 @@ int scanhash_tribus( struct work *work, uint32_t max_nonce,
 				pdata[19] = ++n;
 				be32enc(&endiandata[19], n);
 				tribus_hash(hash32, endiandata);
-				if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) 
-                submit_solution( work, hash32, mythr );
+#ifndef DEBUG_ALGO
+				if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) {
+					work_set_target_ratio(work, hash32);
+					*hashes_done = n - first_nonce + 1;
+					return 1;
+				}
+#else
+				if (!(n % 0x1000) && !thr_id) printf(".");
+				if (!(hash32[7] & mask)) {
+					printf("[%d]",thr_id);
+					if (fulltest(hash32, ptarget)) {
+						work_set_target_ratio(work, hash32);
+						*hashes_done = n - first_nonce + 1;
+						return 1;
+					}
+				}
+#endif
 			} while (n < max_nonce && !work_restart[thr_id].restart);
+			// see blake.c if else to understand the loop on htmax => mask
 			break;
 		}
 	}
--- a/algo/x11/x11.c
+++ b/algo/x11/x11.c
@@ -163,7 +163,11 @@ int scanhash_x11( struct work *work, uint32_t max_nonce,
              if ( ( hash64[7] & mask ) == 0 )
              {
                 if ( fulltest( hash64, ptarget ) )
-                    submit_solution( work, hash64, mythr );
+                 {
+                    *hashes_done = n - first_nonce + 1;
+                    work_set_target_ratio( work, hash64 );
+                    return true;
+                 }
              }
            } while ( n < max_nonce && !work_restart[thr_id].restart );
          }
--- a/algo/x11/x11evo.c
+++ b/algo/x11/x11evo.c
@@ -199,8 +199,12 @@ int scanhash_x11evo( struct work* work, uint32_t max_nonce,
          if ( ( hash64[7] & hmask ) == 0 )
          {
             if ( fulltest( hash64, ptarget ) )
-                submit_solution( work, hash64, mythr );
-          }
+             {
+                 *hashes_done = n - first_nonce + 1;
+                 work_set_target_ratio( work, hash64 );
+                 return true;
+             }
+           }
        } while ( n < max_nonce && !work_restart[thr_id].restart );

 	*hashes_done = n - first_nonce + 1;
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -448,7 +448,6 @@ void x11gost_4way_hash( void *state, const void *input )
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash0, hash1, vhash, 512 );
     intrlv_2x128( vhash, hash2, hash3, 512 );
-     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash2, hash3, vhash, 512 );

--- a/algo/x11/x11gost-gate.c
+++ b/algo/x11/x11gost-gate.c
@@ -15,7 +15,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11gost;
  gate->hash      = (void*)&x11gost_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/x11gost.c
+++ b/algo/x11/x11gost.c
@@ -128,8 +128,9 @@ void x11gost_hash(void *output, const void *input)
 int scanhash_x11gost( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+
 	const uint32_t first_nonce = pdata[19];
 	uint32_t _ALIGN(64) endiandata[20];
   int thr_id = mythr->id;  // thr_id arg is deprecated
@@ -148,13 +149,16 @@ int scanhash_x11gost( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[19], nonce);
 		x11gost_hash(hash, endiandata);

-		if (hash[7] <= Htarg && fulltest(hash, ptarget))
-      {
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			pdata[19] = nonce;
-         submit_solution( work, hash, mythr );
+			*hashes_done = pdata[19] - first_nonce;
+                        work_set_target_ratio( work, hash );
+			return 1;
 		}
 		nonce++;
+
 	} while (nonce < max_nonce && !(*restart));
+
 	pdata[19] = nonce;
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
--- a/algo/x12/x12.c
+++ b/algo/x12/x12.c
@@ -158,6 +158,9 @@ int scanhash_x12( struct work *work, uint32_t max_nonce,
 	// we need bigendian data...
        swab32_array( endiandata, pdata, 20 );

+#ifdef DEBUG_ALGO
+	printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
   for (int m=0; m < 6; m++) {
      if (Htarg <= htmax[m]) {
        uint32_t mask = masks[m];
@@ -165,10 +168,33 @@ int scanhash_x12( struct work *work, uint32_t max_nonce,
 	   pdata[19] = ++n;
 	   be32enc(&endiandata[19], n);
 	   x12hash(hash64, endiandata);
+#ifndef DEBUG_ALGO
 	   if (!(hash64[7] & mask))
-      if ( fulltest(hash64, ptarget) )
-         submit_solution( work, hash64, mythr );
+           {
+              if ( fulltest(hash64, ptarget) )
+              {
+                 // Record the share ratio here too, matching the
+                 // DEBUG_ALGO branch of this same loop.
+                 work_set_target_ratio( work, hash64 );
+                 *hashes_done = n - first_nonce + 1;
+                 return true;
+              }
+              // A mask hit that fails fulltest() is not a share: keep scanning.
+           }
+                                   
+#else
+	    if (!(n % 0x1000) && !thr_id) printf(".");
+		if (!(hash64[7] & mask)) {
+			printf("[%d]",thr_id);
+			if (fulltest(hash64, ptarget)) {
+				work_set_target_ratio( work, hash64 );  // hash64 is the buffer x12hash() wrote and fulltest() checked
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		}
+#endif
 	} while (n < max_nonce && !work_restart[thr_id].restart);
+			// see blake.c if else to understand the loop on htmax => mask
 	break;
     }
  }
--- a/algo/x13/phi1612.c
+++ b/algo/x13/phi1612.c
@@ -115,10 +115,11 @@ int scanhash_phi1612( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[19], nonce);
 		phi1612_hash(hash, endiandata);

-		if (hash[7] <= Htarg && fulltest(hash, ptarget))
-      {
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			pdata[19] = nonce;
-         submit_solution( work, hash, mythr );
+                        work_set_target_ratio( work, hash );
+			*hashes_done = pdata[19] - first_nonce;
+			return 1;
 		}
 		nonce++;

--- a/algo/x13/skunk.c
+++ b/algo/x13/skunk.c
@@ -68,9 +68,11 @@ int scanhash_skunk( struct work *work, uint32_t max_nonce,
 	   skunkhash( hash, endiandata );

 	   if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
-      {
-         pdata[19] = nonce;
-         submit_solution( work, hash, mythr );
+           {
+		pdata[19] = nonce;
+		*hashes_done = pdata[19] - first_nonce;
+                work_set_target_ratio( work, hash );
+		return 1;
 	   }
 	   nonce++;
 	} while ( nonce < max_nonce && !(*restart) );
--- a/algo/x13/x13.c
+++ b/algo/x13/x13.c
@@ -161,8 +161,11 @@ int scanhash_x13( struct work *work, uint32_t max_nonce,
 	};

 	// we need bigendian data...
-   swab32_array( endiandata, pdata, 20 );
+        swab32_array( endiandata, pdata, 20 );

+#ifdef DEBUG_ALGO
+	printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
   for (int m=0; m < 6; m++) {
      if (Htarg <= htmax[m]) {
        uint32_t mask = masks[m];
@@ -170,11 +173,31 @@ int scanhash_x13( struct work *work, uint32_t max_nonce,
 	   pdata[19] = ++n;
 	   be32enc(&endiandata[19], n);
 	   x13hash(hash64, endiandata);
+#ifndef DEBUG_ALGO
 	   if (!(hash64[7] & mask))
-      { 
-          if ( fulltest(hash64, ptarget) )
-                submit_solution( work, hash64, mythr );
-      }
+           {
+              if ( fulltest(hash64, ptarget) )
+              {
+                 // Record the share ratio here too, matching the
+                 // DEBUG_ALGO branch of this same loop.
+                 work_set_target_ratio( work, hash64 );
+                 *hashes_done = n - first_nonce + 1;
+                 return true;
+              }
+              // A mask hit that fails fulltest() is not a share: keep scanning.
+           }
+                                   
+#else
+	    if (!(n % 0x1000) && !thr_id) printf(".");
+		if (!(hash64[7] & mask)) {
+			printf("[%d]",thr_id);
+			if (fulltest(hash64, ptarget)) {
+				work_set_target_ratio( work, hash64 );  // hash64 is the buffer x13hash() wrote and fulltest() checked
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		}
+#endif
 	} while (n < max_nonce && !work_restart[thr_id].restart);
 			// see blake.c if else to understand the loop on htmax => mask
 	break;
--- a/algo/x13/x13bcd.c
+++ b/algo/x13/x13bcd.c
@@ -165,6 +165,10 @@ int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
 	// we need bigendian data...
        swab32_array( endiandata, pdata, 20 );

+#ifdef DEBUG_ALGO
+	if (Htarg != 0)
+		printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
 	for (int m=0; m < 6; m++) {
 		if (Htarg <= htmax[m]) {
 			uint32_t mask = masks[m];
@@ -172,9 +176,24 @@ int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
 				pdata[19] = ++n;
 				be32enc(&endiandata[19], n);
 				x13bcd_hash(hash64, endiandata);
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) 
-                submit_solution( work, hash64, mythr );
+#ifndef DEBUG_ALGO
+				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
+					*hashes_done = n - first_nonce + 1;
+					return true;
+				}
+#else
+				if (!(n % 0x1000) && !thr_id) printf(".");
+				if (!(hash64[7] & mask)) {
+					printf("[%d]",thr_id);
+					if (fulltest(hash64, ptarget)) {
+                  work_set_target_ratio( work, hash64 );
+						*hashes_done = n - first_nonce + 1;
+						return true;
+					}
+				}
+#endif
 			} while (n < max_nonce && !work_restart[thr_id].restart);
+			// see blake.c if else to understand the loop on htmax => mask
 			break;
 		}
 	}
--- a/algo/x13/x13sm3.c
+++ b/algo/x13/x13sm3.c
@@ -177,6 +177,10 @@ int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
 	// we need bigendian data...
        swab32_array( endiandata, pdata, 20 );

+#ifdef DEBUG_ALGO
+	if (Htarg != 0)
+		printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
 	for (int m=0; m < 6; m++) {
 		if (Htarg <= htmax[m]) {
 			uint32_t mask = masks[m];
@@ -184,8 +188,22 @@ int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
 				pdata[19] = ++n;
 				be32enc(&endiandata[19], n);
 				x13sm3_hash(hash64, endiandata);
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget))
-                submit_solution( work, hash64, mythr );
+#ifndef DEBUG_ALGO
+				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
+					*hashes_done = n - first_nonce + 1;
+					return true;
+				}
+#else
+				if (!(n % 0x1000) && !thr_id) printf(".");
+				if (!(hash64[7] & mask)) {
+					printf("[%d]",thr_id);
+					if (fulltest(hash64, ptarget)) {
+                                                work_set_target_ratio( work, hash64 );
+						*hashes_done = n - first_nonce + 1;
+						return true;
+					}
+				}
+#endif
 			} while (n < max_nonce && !work_restart[thr_id].restart);
 			// see blake.c if else to understand the loop on htmax => mask
 			break;
--- a/algo/x14/axiom.c
+++ b/algo/x14/axiom.c
@@ -63,10 +63,11 @@ int scanhash_axiom( struct work *work,
 	do {
 		be32enc(&endiandata[19], n);
 		axiomhash(hash64, endiandata);
-		if (hash64[7] < Htarg && fulltest(hash64, ptarget))
-      {
+		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
 			pdata[19] = n;
-         submit_solution( work, hash64, mythr );
+                        work_set_target_ratio( work, hash64 );
+			return true;
 		}
 		n++;

--- a/algo/x14/polytimos.c
+++ b/algo/x14/polytimos.c
@@ -100,10 +100,11 @@ int scanhash_polytimos( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[19], nonce);
 		polytimos_hash(hash, endiandata);

-		if (hash[7] <= Htarg && fulltest(hash, ptarget))
-      {
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+			work_set_target_ratio(work, hash);
 			pdata[19] = nonce;
-         submit_solution( work, hash, mythr );
+			*hashes_done = pdata[19] - first_nonce;
+			return 1;
 		}
 		nonce++;

--- a/algo/x14/veltor.c
+++ b/algo/x14/veltor.c
@@ -89,10 +89,11 @@ int scanhash_veltor( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[19], nonce);
 		veltor_hash(hash, endiandata);

-		if (hash[7] <= Htarg && fulltest(hash, ptarget))
-      {
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+			work_set_target_ratio(work, hash);
 			pdata[19] = nonce;
-         submit_solution( work, hash, mythr );
+			*hashes_done = pdata[19] - first_nonce;
+			return 1;
 		}
 		nonce++;

--- a/algo/x14/x14.c
+++ b/algo/x14/x14.c
@@ -170,6 +170,10 @@ int scanhash_x14( struct work *work, uint32_t max_nonce,
 	// we need bigendian data...
        swab32_array( endiandata, pdata, 20 );

+#ifdef DEBUG_ALGO
+	if (Htarg != 0)
+		printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
 	for (int m=0; m < 6; m++) {
 		if (Htarg <= htmax[m]) {
 			uint32_t mask = masks[m];
@@ -177,9 +181,24 @@ int scanhash_x14( struct work *work, uint32_t max_nonce,
 				pdata[19] = ++n;
 				be32enc(&endiandata[19], n);
 				x14hash(hash64, endiandata);
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget))
-                submit_solution( work, hash64, mythr );
+#ifndef DEBUG_ALGO
+				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
+					*hashes_done = n - first_nonce + 1;
+					return true;
+				}
+#else
+				if (!(n % 0x1000) && !thr_id) printf(".");
+				if (!(hash64[7] & mask)) {
+					printf("[%d]",thr_id);
+					if (fulltest(hash64, ptarget)) {
+                                                work_set_target_ratio( work, hash64 );
+						*hashes_done = n - first_nonce + 1;
+						return true;
+					}
+				}
+#endif
 			} while (n < max_nonce && !work_restart[thr_id].restart);
+			// see blake.c if else to understand the loop on htmax => mask
 			break;
 		}
 	}
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -217,22 +217,22 @@ void x16r_8way_hash( void* output, const void* input )
         case CUBEHASH:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case SHAVITE:
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
            sph_shavite512_init( &ctx.shavite );
@@ -275,11 +275,11 @@ void x16r_8way_hash( void* output, const void* input )
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
            init_echo( &ctx.echo, 512 );
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -211,22 +211,22 @@ void x16rt_8way_hash( void* output, const void* input )
         case CUBEHASH:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case SHAVITE:
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
            sph_shavite512_init( &ctx.shavite );
@@ -269,11 +269,11 @@ void x16rt_8way_hash( void* output, const void* input )
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
             init_echo( &ctx.echo, 512 );
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -42,12 +42,15 @@ union _x16rv2_8way_context_overlay
 {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
+//    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
+//    sph_shavite512_context  shavite;
    simd_4way_context       simd;
+//    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_8way_context  shabal;
@@ -272,22 +275,22 @@ void x16rv2_8way_hash( void* output, const void* input )
         case CUBEHASH:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case SHAVITE:
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
            sph_shavite512_init( &ctx.shavite );
@@ -330,11 +333,11 @@ void x16rv2_8way_hash( void* output, const void* input )
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
             init_echo( &ctx.echo, 512 );
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -230,22 +230,22 @@ void x21s_8way_hash( void* output, const void* input )
         case CUBEHASH:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case SHAVITE:
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
+            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
            sph_shavite512_init( &ctx.shavite );
@@ -285,14 +285,15 @@ void x21s_8way_hash( void* output, const void* input )
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case ECHO:
+
 #if defined(__VAES__)
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            echo_4way_init( &ctx.echo, 512 );
-            echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 );
+            echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
 #else
            init_echo( &ctx.echo, 512 );
@@ -516,7 +517,7 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime );
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
   }

   do
--- a/algo/x20/x20r-gate.h
+++ b/algo/x20/x20r-gate.h
@@ -44,15 +44,15 @@ bool register_xi20r_algo( algo_gate_t* gate );

 void x20r_4way_hash( void *state, const void *input );

-int scanhash_x20r_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
+int scanhash_x20r_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done );

 #endif

 void x20rhash( void *state, const void *input );

-int scanhash_x20r( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
+int scanhash_x20r( int thr_id, struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done );

 #endif

--- a/algo/x20/x20r.c
+++ b/algo/x20/x20r.c
@@ -228,8 +228,8 @@ void x20r_hash(void* output, const void* input)
   memcpy(output, hash, 32);
 }

-int scanhash_x20r( struct work *work, uint32_t max_nonce,
-	           uint64_t *hashes_done, struct thr_info *mythr )
+int scanhash_x20r( int thr_id, struct work *work, uint32_t max_nonce,
+	           uint64_t *hashes_done )
 {
   uint32_t _ALIGN(128) hash32[8];
   uint32_t _ALIGN(128) endiandata[20];
@@ -238,7 +238,6 @@ int scanhash_x20r( struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t nonce = first_nonce;
-   int thr_id = mythr->id;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

   for (int k=0; k < 19; k++)
@@ -260,9 +259,11 @@ int scanhash_x20r( struct work *work, uint32_t max_nonce,
 	x20r_hash( hash32, endiandata );

 	if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) )
-  	{
-        pdata[19] = nonce;
-        submit_solution( work, hash32, mythr );
+       	{
+           work_set_target_ratio( work, hash32 );
+	   pdata[19] = nonce;
+	   *hashes_done = pdata[19] - first_nonce;
+	   return 1;
 	}
 	nonce++;

--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.2.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.11.2'
-PACKAGE_STRING='cpuminer-opt 3.11.2'
+PACKAGE_VERSION='3.11.1'
+PACKAGE_STRING='cpuminer-opt 3.11.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.11.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.11.1 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.11.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.11.1:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.11.2
+cpuminer-opt configure 3.11.1
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.11.2, which was
+It was created by cpuminer-opt $as_me 3.11.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.11.2'
+ VERSION='3.11.1'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.11.2, which was
+This file was extended by cpuminer-opt $as_me 3.11.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.11.2
+cpuminer-opt config.status 3.11.1
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.11.2])
+AC_INIT([cpuminer-opt], [3.11.1])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -3343,7 +3343,7 @@ static void show_credits()
 {
   printf("\n         **********  "PACKAGE_NAME" "PACKAGE_VERSION"  *********** \n");
   printf("     A CPU miner with multi algo support and optimized for CPUs\n");
-   printf("     with AVX512, SHA and VAES extensions.\n");
+   printf("     with AES_NI, AVX2, AVX512, SHA and VAES extensions.\n");
   printf("     BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
 }

--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -132,19 +132,10 @@ do { \

 // Parallel AES, for when x is expected to be in a 256 bit register.
 // Use same 128 bit key.
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-#define mm256_aesenc_2x128( x, k ) \
-   _mm256_aesenc_epi128( x, m256_const1_128(k ) )
-
-#else
-
 #define mm256_aesenc_2x128( x, k ) \
   mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \
                     _mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) )

-#endif
-
 #define mm256_paesenc_2x128( y, x, k ) do \
 { \
  __m128i *X = (__m128i*)x; \
@@ -555,14 +546,14 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #define mm256_ror512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
-   v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
+   v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
   v2 = t; \
 } while(0)

 #define mm256_rol512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
-   v2 = _mm256_permute2x128( v2, v1, 0x21 ); \
+   v2 = _mm256_permute2x128( v2, v1, 0x21 ); \
   v1 = t; \
 } while(0)

--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -44,14 +44,14 @@
 //
 //    Constants need to be composed at run time by assembling individual
 //    elements, very expensive. The cost is proportional to the number of
-//    different elements therefore use the largest element size possible,
-//    merge smaller integer elements to 64 bits, and group repeated elements.
+//    elements, therefore use the largest element size possible, even by
+//    merging smaller values.
 //
 //    Constants with repeating patterns can be optimized with the smaller
 //    patterns repeated more frequently being more efficient.
 //
 //    Some specific constants can be very efficient. Zero is very efficient,
-//    1 and -1 slightly less so. 
+//    1 and -1 slightly less so.
 //
 //    If an expensive constant is to be reused in the same function it should
 //    be declared as a local variable defined once and reused.