v3.4.8 release

2025-09-17 23:44:27 +00:00 · 2016-10-17 14:16:28 -04:00
parent 4ecf9555da
commit 6f28a67f9c
22 changed files with 1117 additions and 656 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -105,6 +105,7 @@ cpuminer_SOURCES = \
  algo/lyra2/sponge.c \
  algo/lyra2/lyra2rev2.c \
  algo/lyra2/lyra2re.c \
+  algo/lyra2/zcoin.c \
  algo/keccak/sse2/keccak.c \
  algo/m7m.c \
  algo/neoscrypt.c \
--- a/2
+++ b/2
@@ -12,7 +12,9 @@ comparison below.

 New in 3.4.8

+- added zcoin support, optimized for AVX2 but no increase in performance
 - fixed API display of diff for cryptonight
+- --show-diff is now the default, use "--hide-diff" to disable
 - cleaned up some cpuminer-multi artifacts

 Users with non-SSE2 CPUs or who want to mine algos not supported by
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -169,6 +169,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_LUFFA:       register_luffa_algo      ( gate ); break;
     case ALGO_LYRA2RE:     register_lyra2re_algo    ( gate ); break;
     case ALGO_LYRA2REV2:   register_lyra2rev2_algo  ( gate ); break;
+     case ALGO_LYRA2Z:      register_zcoin_algo      ( gate ); break;
     case ALGO_M7M:         register_m7m_algo        ( gate ); break;
     case ALGO_MYR_GR:      register_myriad_algo     ( gate ); break;
     case ALGO_NEOSCRYPT:   register_neoscrypt_algo  ( gate ); break;
@@ -268,6 +269,7 @@ const char* const algo_alias_map[][2] =
  { "sib",               "x11gost"     },
  { "yes",               "yescrypt"    },
  { "ziftr",             "zr5"         },
+  { "zcoin",             "lyra2z"      },
  { NULL,                NULL          }   
 };

--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -115,7 +115,7 @@ void ( *hash_alt ) ( void*, const void*, uint32_t );
 void ( *hash_suw ) ( void*, const void* );

 //optional, safe to use default in most cases
-bool ( *miner_thread_init )      ();
+bool ( *miner_thread_init )      ( int );
 void ( *stratum_gen_work )       ( struct stratum_ctx*, struct work* );
 void ( *get_new_work )           ( struct work*, struct work*, int, uint32_t*,
                                   bool );
@@ -129,7 +129,7 @@ bool ( *submit_getwork_result )  ( CURL*, struct work* );
 void ( *gen_merkle_root )        ( char*, struct stratum_ctx* );
 void ( *build_stratum_request )  ( char*, struct work*, struct stratum_ctx* );
 void ( *set_work_data_endian )   ( struct work* );
-void ( *calc_network_diff )      ( struct work* );
+double ( *calc_network_diff )    ( struct work* );
 void ( *build_extraheader )      ( struct work*, struct stratum_ctx* );
 bool ( *prevent_dupes )          ( struct work*, struct stratum_ctx*, int );
 void ( *resync_threads )         ( struct work* );
@@ -229,7 +229,7 @@ void jr2_build_stratum_request   ( char *req, struct work *work );
 // default is do_nothing;
 void swab_work_data( struct work *work );

-void std_calc_network_diff( struct work *work );
+double std_calc_network_diff( struct work *work );

 void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );

--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -115,7 +115,8 @@ uint32_t *decred_get_nonceptr( uint32_t *work_data )
 // does it need to increment nonce, seems not because gen_work_now always
 // returns true

-void decred_calc_network_diff( struct work* work )
+double decred_calc_network_diff( struct work* work )
+//void decred_calc_network_diff( struct work* work )
 {
   // sample for diff 43.281 : 1c05ea29
   // todo: endian reversed on longpoll could be zr5 specific...
@@ -123,17 +124,18 @@ void decred_calc_network_diff( struct work* work )
   uint32_t bits = ( nbits & 0xffffff );
   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
   int m;
-   net_diff = (double)0x0000ffff / (double)bits;
+   double d = (double)0x0000ffff / (double)bits;

   for ( m = shift; m < 29; m++ )
-       net_diff *= 256.0;
+       d *= 256.0;
   for ( m = 29; m < shift; m++ )
-       net_diff /= 256.0;
+       d /= 256.0;
   if ( shift == 28 )
-       net_diff *= 256.0; // testnet
+       d *= 256.0; // testnet
   if ( opt_debug_diff )
-       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", net_diff,
-               shift, bits);
+       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
+                           shift, bits );
+   return net_diff;
 }

 void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
--- a/algo/drop.c
+++ b/algo/drop.c
@@ -233,11 +233,6 @@ void drop_get_new_work( struct work* work, struct work* g_work, int thr_id,
       ++(*nonceptr);
 }

-void drop_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
- 
 void drop_display_pok( struct work* work ) 
 {
      if ( work->data[0] & 0x00008000 ) 
--- a/algo/lbry.c
+++ b/algo/lbry.c
@@ -142,7 +142,7 @@ int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-void lbry_calc_network_diff(struct work *work)
+double lbry_calc_network_diff( struct work *work )
 {
        // sample for diff 43.281 : 1c05ea29
        // todo: endian reversed on longpoll could be zr5 specific...
@@ -159,7 +159,7 @@ void lbry_calc_network_diff(struct work *work)
   if (opt_debug_diff)
      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);

-   net_diff = d;
+   return d;
 }

 // std_le should work but it doesn't
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -44,171 +44,344 @@
 *
 * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
 */
-int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols)
+
+// Lyra2RE & Lyra2REv2, nRows must be a power of 2
+int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+           const void *salt, uint64_t saltlen, uint64_t timeCost,
+           const uint64_t nRows, const uint64_t nCols )
 {
-	//============================= Basic variables ============================//
-	int64_t row = 2; //index of row to be processed
-	int64_t prev = 1; //index of prev (last row ever computed/modified)
-	int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
-	int64_t tau; //Time Loop iterator
-	int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
-	int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
-	int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-	int64_t i; //auxiliary iteration counter
-	int64_t v64; // 64bit var for memcpy
-	//==========================================================================/
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //auxiliary iteration counter
+   int64_t v64; // 64bit var for memcpy
+   //====================================================================/

-	//========== Initializing the Memory Matrix and pointers to it =============//
-	//Tries to allocate enough space for the whole memory matrix
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix

-	const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-	const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-	// for Lyra2REv2, nCols = 4, v1 was using 8
-	const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;

-	i = (int64_t)ROW_LEN_BYTES * nRows;
-	uint64_t *wholeMatrix = malloc(i);
-	if (wholeMatrix == NULL) {
-		return -1;
-	}
-	memset(wholeMatrix, 0, i);
+   i = (int64_t)ROW_LEN_BYTES * nRows;
+   uint64_t *wholeMatrix = malloc(i);
+   if (wholeMatrix == NULL)
+      return -1;

-	//Allocates pointers to each row of the matrix
-	uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
-	if (memMatrix == NULL) {
-		return -1;
-	}
-	//Places the pointers in the correct positions
-	uint64_t *ptrWord = wholeMatrix;
-	for (i = 0; i < nRows; i++) {
-		memMatrix[i] = ptrWord;
-		ptrWord += ROW_LEN_INT64;
-	}
-	//==========================================================================/
+   memset(wholeMatrix, 0, i);

-	//============= Getting the password + salt + basil padded with 10*1 ===============//
-	//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
-	//but this ensures that the password copied locally will be overwritten as soon as possible
+   //Allocates pointers to each row of the matrix
+   uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
+   if (memMatrix == NULL)
+      return -1;

-	//First, we clean enough blocks for the password, salt, basil and padding
-	int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+   //Places the pointers in the correct positions
+   uint64_t *ptrWord = wholeMatrix;
+   for (i = 0; i < nRows; i++)
+   {
+       memMatrix[i] = ptrWord;
+       ptrWord += ROW_LEN_INT64;
+   }

-	byte *ptrByte = (byte*) wholeMatrix;
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible

-	//Prepends the password
-	memcpy(ptrByte, pwd, pwdlen);
-	ptrByte += pwdlen;
+   //First, we clean enough blocks for the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;

-	//Concatenates the salt
-	memcpy(ptrByte, salt, saltlen);
-	ptrByte += saltlen;
+   byte *ptrByte = (byte*) wholeMatrix;

-	memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen));
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;

-	//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
-	memcpy(ptrByte, &kLen, sizeof(int64_t));
-	ptrByte += sizeof(uint64_t);
-	v64 = pwdlen;
-	memcpy(ptrByte, &v64, sizeof(int64_t));
-	ptrByte += sizeof(uint64_t);
-	v64 = saltlen;
-	memcpy(ptrByte, &v64, sizeof(int64_t));
-	ptrByte += sizeof(uint64_t);
-	v64 = timeCost;
-	memcpy(ptrByte, &v64, sizeof(int64_t));
-	ptrByte += sizeof(uint64_t);
-	v64 = nRows;
-	memcpy(ptrByte, &v64, sizeof(int64_t));
-	ptrByte += sizeof(uint64_t);
-	v64 = nCols;
-	memcpy(ptrByte, &v64, sizeof(int64_t));
-	ptrByte += sizeof(uint64_t);
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;

-	//Now comes the padding
-	*ptrByte = 0x80; //first byte of padding: right after the password
-	ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
-	ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
-	*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
-	//==========================================================================/
+   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+                       - (saltlen + pwdlen) );

-	//======================= Initializing the Sponge State ====================//
-	//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
-	uint64_t _ALIGN(256) state[16];
-	initState(state);
-	//==========================================================================/
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);

-	//================================ Setup Phase =============================//
-	//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
-	ptrWord = wholeMatrix;
-	for (i = 0; i < nBlocksInput; i++) {
-		absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
-		ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
-	}
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the password
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block

-	//Initializes M[0] and M[1]
-	reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
+   //================= Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+   initState(state);

-	reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+   ptrWord = wholeMatrix;
+   for (i = 0; i < nBlocksInput; i++)
+   {
+       absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }

-	do {
-		//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here

-		reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+   reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);

-		//updates the value of row* (deterministically picked during Setup))
-		rowa = (rowa + step) & (window - 1);
-		//update prev: it now points to the last row ever computed
-		prev = row;
-		//updates row: goes to the next row to be computed
-		row++;
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

-		//Checks if all rows in the window where visited.
-		if (rowa == 0) {
-		step = window + gap; //changes the step: approximately doubles its value
-		window *= 2; //doubles the size of the re-visitation window
-		gap = -gap; //inverts the modifier to the step
-	}
+      reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);

-	} while (row < nRows);
-	//==========================================================================/
+      //updates the value of row* (deterministically picked during Setup))
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;

-	//============================ Wandering Phase =============================//
-	row = 0; //Resets the visitation to the first row of the memory matrix
-	for (tau = 1; tau <= timeCost; tau++) {
-		//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
-		step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
-		do {
-			//Selects a pseudorandom index row*
-			//------------------------------------------------------------------------------------------
-			rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
-			//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
-			//------------------------------------------------------------------------------------------
+      //Checks if all rows in the window where visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }

-			//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
-			reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+   } while (row < nRows);

-			//update prev: it now points to the last row ever computed
-			prev = row;
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+       //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+       step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+       do
+       {
+           //Selects a pseudorandom index row*
+           //-----------------------------------------------
+           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //-------------------------------------------

-			//updates row: goes to the next row to be computed
-			//------------------------------------------------------------------------------------------
-			row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
-			//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
-			//------------------------------------------------------------------------------------------
+           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+           reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);

-		} while (row != 0);
-	}
+           //update prev: it now points to the last row ever computed
+           prev = row;

-	//============================ Wrap-up Phase ===============================//
-	//Absorbs the last block of the memory matrix
-	absorbBlock(state, memMatrix[rowa]);
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------

-	//Squeezes the key
-	squeeze(state, K, (unsigned int) kLen);
+       } while (row != 0);
+   }

-	//========================= Freeing the memory =============================//
-	free(memMatrix);
-	free(wholeMatrix);
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock(state, memMatrix[rowa]);

-	return 0;
+   //Squeezes the key
+   squeeze(state, K, (unsigned int) kLen);
+
+   //================== Freeing the memory =============================//
+   free(memMatrix);
+   free(wholeMatrix);
+
+   return 0;
 }
+
+// Zcoin, nRows may be any value
+int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+            const void *salt, uint64_t saltlen, uint64_t timeCost,
+            uint64_t nRows, uint64_t nCols )
+{
+    //========================== Basic variables ============================//
+    uint64_t _ALIGN(256) state[16];
+    int64_t row = 2; //index of row to be processed
+    int64_t prev = 1; //index of prev (last row ever computed/modified)
+    int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+    int64_t tau; //Time Loop iterator
+    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+    int64_t i; //auxiliary iteration counter
+    //=======================================================================/
+
+    //======= Initializing the Memory Matrix and pointers to it =============//
+    //Tries to allocate enough space for the whole memory matrix
+
+    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+    i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
+    uint64_t *wholeMatrix = malloc(i);
+    if (wholeMatrix == NULL) 
+      return -1;
+    
+    memset(wholeMatrix, 0, i);
+    //Allocates pointers to each row of the matrix
+    uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
+    if (memMatrix == NULL) 
+      return -1;
+    
+    //Places the pointers in the correct positions
+    uint64_t *ptrWord = wholeMatrix;
+    for (i = 0; i < nRows; i++)
+    {
+      memMatrix[i] = ptrWord;
+      ptrWord += ROW_LEN_INT64;
+    }
+
+    //==== Getting the password + salt + basil padded with 10*1 ============//
+    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+    //but this ensures that the password copied locally will be overwritten as soon as possible
+
+    //First, we clean enough blocks for the password, salt, basil and padding
+    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) )
+                               / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+    byte *ptrByte = (byte*) wholeMatrix;
+    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
+
+    //Prepends the password
+    memcpy(ptrByte, pwd, pwdlen);
+    ptrByte += pwdlen;
+
+    //Concatenates the salt
+    memcpy(ptrByte, salt, saltlen);
+    ptrByte += saltlen;
+
+    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+    memcpy(ptrByte, &kLen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nRows, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nCols, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+
+    //Now comes the padding
+    *ptrByte = 0x80; //first byte of padding: right after the password
+    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+    //=================== Initializing the Sponge State ====================//
+    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+    initState( state );
+
+    //============================== Setup Phase =============================//
+    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+    ptrWord = wholeMatrix;
+    for ( i = 0; i < nBlocksInput; i++ )
+    {
+      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
+    }
+
+    //Initializes M[0] and M[1]
+    reducedSqueezeRow0( state, memMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+    reducedDuplexRow1( state, memMatrix[0], memMatrix[1], nCols );
+
+    do
+    {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+      reducedDuplexRowSetup( state, memMatrix[prev], memMatrix[rowa],
+                              memMatrix[row], nCols );
+
+      //updates the value of row* (deterministically picked during Setup))
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window where visited.
+      if (rowa == 0)
+      {
+        step = window + gap; //changes the step: approximately doubles its value
+        window *= 2; //doubles the size of the re-visitation window
+        gap = -gap; //inverts the modifier to the step
+      }
+
+    } while (row < nRows);
+
+    //======================== Wandering Phase =============================//
+    row = 0; //Resets the visitation to the first row of the memory matrix
+    for ( tau = 1; tau <= timeCost; tau++ )
+    {
+        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+        do {
+        //Selects a pseudorandom index row*
+        //----------------------------------------------------------------------
+        //rowa = ((unsigned int)state[0]) & (nRows-1);	//(USE THIS IF nRows IS A POWER OF 2)
+        rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+        //-----------------------------------------------------------------
+
+        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+        reducedDuplexRow( state, memMatrix[prev], memMatrix[rowa],
+                          memMatrix[row], nCols );
+
+        //update prev: it now points to the last row ever computed
+        prev = row;
+
+        //updates row: goes to the next row to be computed
+        //---------------------------------------------------------------
+        //row = (row + step) & (nRows-1);	//(USE THIS IF nRows IS A POWER OF 2)
+        row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+        //--------------------------------------------------------------------
+
+      } while (row != 0);
+    }
+
+    //========================= Wrap-up Phase ===============================//
+    //Absorbs the last block of the memory matrix
+    absorbBlock( state, memMatrix[rowa] );
+
+    //Squeezes the key
+    squeeze( state, K, kLen );
+
+    //====================== Freeing the memory =============================//
+    free( memMatrix );
+    free( wholeMatrix );
+
+    return 0;
+}
+
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -37,6 +37,10 @@ typedef unsigned char byte;
        #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8)    //Block length, in bytes
 #endif

-int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
-
+int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+           const void *salt, uint64_t saltlen, uint64_t timeCost,
+           uint64_t nRows, uint64_t nCols );
+int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, 
+           const void *salt, uint64_t saltlen, uint64_t timeCost, 
+           uint64_t nRows, uint64_t nCols );
 #endif /* LYRA2_H_ */
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -23,42 +23,34 @@
 #define SPONGE_H_

 #include <stdint.h>
+#include "avxdefs.h"

-/* Blake2b IV Array */
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+
+/*Blake2b IV Array*/
 static const uint64_t blake2b_IV[8] =
 {
-	0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
-	0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
-	0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
-	0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };

-/* Blake2b's rotation */
-static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
-#ifdef _MSC_VER
-	return _rotr64(w, c);
-#else
-	return ( w >> c ) | ( w << ( 64 - c ) );
-#endif
+/*Blake2b's rotation*/
+static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
+    return ( w >> c ) | ( w << ( 64 - c ) );
 }

 #if defined __AVX2__
 // only available with avx2

-// rotate each uint64 c bits
-// returns _m256i
-#define  mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \
-                                            _mm256_slli_epi64(w, 64 - c))
-
-// Rotate 4 uint64 (256 bits) by one uint64 (64 bits)
-// returns __m256i
-#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 )
-#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 )
-
-// swap hi and lo 128 bits in 256 bit vector
-// returns _m256i
-#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 )
-
 // init vectors from memory
 // returns void, updates defines and inits implicit args a, b, c, d
 #define LYRA_INIT_AVX2 \
@@ -88,15 +80,29 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
   c = _mm256_add_epi64( c, d ); \
   b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 );

-#define LYRA_ROUND_AVX2 \
-   G_4X64( a[0], a[1], a[2], a[3] ); \
-   a[1] = mm256_rotl256_1x64( a[1]); \
-   a[2] = mm256_swap128( a[2] ); \
-   a[3] = mm256_rotr256_1x64( a[3] ); \
-   G_4X64( a[0], a[1], a[2], a[3] ); \
-   a[1] = mm256_rotr256_1x64( a[1] ); \
-   a[2] = mm256_swap128( a[2] ); \
-   a[3] = mm256_rotl256_1x64( a[3] );
+#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   G_4X64( s0, s1, s2, s3 ); \
+   s1 = mm256_rotl256_1x64( s1); \
+   s2 = mm256_swap128( s2 ); \
+   s3 = mm256_rotr256_1x64( s3 ); \
+   G_4X64( s0, s1, s2, s3 ); \
+   s1 = mm256_rotr256_1x64( s1 ); \
+   s2 = mm256_swap128( s2 ); \
+   s3 = mm256_rotl256_1x64( s3 );
+
+#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \

 #else
 // only available with avx
@@ -148,6 +154,7 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {

 #endif // AVX2

+/*
 #if defined __AVX__
 // can coexist with AVX2

@@ -161,7 +168,7 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
 #define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
                              s1 = _mm_xor_si128(s0, s1); \
                              s0 = _mm_xor_si128(s0, s1);
- 
+
 // swap uint64 in 128 bit source vector, equivalent of rotating 128 bits by
 // 64 bits (8 bytes)
 // __m128i
@@ -193,30 +200,33 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
 } while(0)

 #endif   // AVX
+*/

-/* Blake2b's G function */
-#define G(r,i,a,b,c,d) do { \
-	a = a + b; \
-	d = rotr64(d ^ a, 32); \
-	c = c + d; \
-	b = rotr64(b ^ c, 24); \
-	a = a + b; \
-	d = rotr64(d ^ a, 16); \
-	c = c + d; \
-	b = rotr64(b ^ c, 63); \
+// Scalar
+//Blake2b's G function
+#define G(r,i,a,b,c,d) \
+  do { \
+    a = a + b; \
+    d = rotr64(d ^ a, 32); \
+    c = c + d; \
+    b = rotr64(b ^ c, 24); \
+    a = a + b; \
+    d = rotr64(d ^ a, 16); \
+    c = c + d; \
+    b = rotr64(b ^ c, 63); \
  } while(0)


 /*One Round of the Blake2b's compression function*/
-#define ROUND_LYRA(r) \
-	G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
-	G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
-	G(r,2,v[ 2],v[ 6],v[10],v[14]); \
-	G(r,3,v[ 3],v[ 7],v[11],v[15]); \
-	G(r,4,v[ 0],v[ 5],v[10],v[15]); \
-	G(r,5,v[ 1],v[ 6],v[11],v[12]); \
-	G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
-	G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
+#define ROUND_LYRA(r)  \
+    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);


 //---- Housekeeping
@@ -224,18 +234,31 @@ void initState(uint64_t state[/*16*/]);

 //---- Squeezes
 void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
-void reducedSqueezeRow0(uint64_t* state, uint64_t* row, const uint32_t nCols);
+void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);

 //---- Absorbs
 void absorbBlock(uint64_t *state, const uint64_t *in);
 void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);

 //---- Duplexes
-void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols);
-void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols);
-void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols);
+void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

 //---- Misc
 void printArray(unsigned char *array, unsigned int size, char *name);

+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////TESTS////
+//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
+//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+/////////////
+
+
 #endif /* SPONGE_H_ */
--- a/algo/lyra2/zcoin.c
+++ b/algo/lyra2/zcoin.c
@@ -0,0 +1,77 @@
+#include <memory.h>
+#include "miner.h"
+#include "algo-gate-api.h"
+#include "lyra2.h"
+
+void zcoin_hash(void *state, const void *input, uint32_t height)
+{
+
+	uint32_t _ALIGN(256) hash[16];
+
+        LYRA2Z(hash, 32, input, 80, input, 80, 2, height, 256);
+
+	memcpy(state, hash, 32);
+}
+
+//int scanhash_zcoin(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t height)
+int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done )
+{
+	uint32_t _ALIGN(128) hash[8];
+	uint32_t _ALIGN(128) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t Htarg = ptarget[7];
+	const uint32_t first_nonce = pdata[19];
+	uint32_t nonce = first_nonce;
+	if (opt_benchmark)
+		ptarget[7] = 0x0000ff;
+
+	for (int i=0; i < 19; i++) {
+		be32enc(&endiandata[i], pdata[i]);
+	}
+
+	do {
+		be32enc(&endiandata[19], nonce);
+		zcoin_hash( hash, endiandata, work->height );
+
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+			work_set_target_ratio(work, hash);
+			pdata[19] = nonce;
+			*hashes_done = pdata[19] - first_nonce;
+			return 1;
+		}
+		nonce++;
+
+	} while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+	pdata[19] = nonce;
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}
+
+int64_t get_max64_0xffffLL() { return 0xffffLL; };
+
+void zcoin_set_target( struct work* work, double job_diff )
+{
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
+{
+   work->height = sctx->bloc_height;
+   return false;
+}
+
+bool register_zcoin_algo( algo_gate_t* gate )
+{
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->scanhash   = (void*)&scanhash_zcoin;
+  gate->hash       = (void*)&zcoin_hash;
+  gate->hash_alt   = (void*)&zcoin_hash;
+  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  gate->set_target = (void*)&zcoin_set_target;
+  gate->prevent_dupes = (void*)&zcoin_get_work_height;
+  return true;
+};
+
--- a/algo/pluck.c
+++ b/algo/pluck.c
@@ -487,24 +487,19 @@ int64_t pluck_get_max64 ()
  return 0x1ffLL;
 }

-void pluck_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
-bool pluck_miner_thread_init( )
+bool pluck_miner_thread_init( int thr_id )
 { 
  scratchbuf = malloc( 128 * 1024 ); 
  if ( scratchbuf )
    return true;
-  applog( LOG_ERR, "Pluck buffer allocation failed");
+  applog( LOG_ERR, "Thread %u: Pluck buffer allocation failed", thr_id );
  return false;
 }

 bool register_pluck_algo( algo_gate_t* gate )
 {
  algo_not_tested();
-  gate->miner_thread_init = (void*)& pluck_miner_thread_init;
+  gate->miner_thread_init = (void*)&pluck_miner_thread_init;
  gate->scanhash         = (void*)&scanhash_pluck;
  gate->hash             = (void*)&pluck_hash;
  gate->set_target       = (void*)&scrypt_set_target;
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -774,12 +774,12 @@ int64_t scrypt_get_max64()
     return max64;
 }

-bool scrypt_miner_thread_init()
+bool scrypt_miner_thread_init( int thr_id )
 {
 scratchbuf = scrypt_buffer_alloc( opt_scrypt_n );  
 if ( scratchbuf )
   return true;
- applog( LOG_ERR, "Scrypt buffer allocation failed" );
+ applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id );
 return false; 
 }

--- a/algo/scryptjane/scrypt-jane.c
+++ b/algo/scryptjane/scrypt-jane.c
@@ -231,11 +231,6 @@ void scryptjanehash(void *output, const void *input )
 	scrypt_free(&YX);
 }

-void scryptjane_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
 bool register_scryptjane_algo( algo_gate_t* gate )
 {
    gate->scanhash   = (void*)&scanhash_scryptjane;
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -49,19 +49,11 @@ int64_t yescrypt_get_max64 ()
  return 0x1ffLL;
 }

-
-void yescrypt_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
-
 bool register_yescrypt_algo ( algo_gate_t* gate )
 {
   gate->scanhash   = (void*)&scanhash_yescrypt;
   gate->hash       = (void*)&yescrypt_hash;
   gate->hash_alt   = (void*)&yescrypthash;
-//   gate->set_target = (void*)&yescrypt_set_target;
   gate->set_target = (void*)&scrypt_set_target;
   gate->get_max64  = (void*)&yescrypt_get_max64;
   return true;
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -1,7 +1,7 @@
 //   Some tools to help using AVX and AVX2

 #include <inttypes.h>
-//#include <immintrin.h>
+#include <immintrin.h>

 // Use these overlays to access the same data in memory as different types
 //
@@ -15,7 +15,7 @@

 typedef union
 {
-#if defined __AVX__
+#if defined __AVX2__
 __m256i   v256;
 #endif
 __m128i   v128[ 2];
@@ -49,47 +49,85 @@ uint8_t   v8 [16];

 #if defined __AVX2__

-// Rotate bits in 4 uint64
+// Rotate bits in 4 uint64  (3 instructions)
 // __m256i mm256_rotr_64( __256i, int )
-#define  mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \
-                                            _mm256_slli_epi64(w, 64 - c))
-//static inline __m256i mm256_rotr_64 ( __m256i w, int c)
-//{
-//   return _mm256_or_si256( _mm256_srli_epi64( w, c ),
-//                           _mm256_slli_epi64( w, 64 - c ) );
-//}
-// Rotate uint64 by one uint64
-//__m256i mm256_rotl256_1x64( _mm256i, int )
-#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 )
-#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 )
-//static inline __m256i mm256_rotl256_1x64( __m256i s ) 
-//{ 
-//   return _mm256_permute4x64_epi64( s, 0x39 );
-//}
-//static inline __m256i mm256_rotr256_1x64( __m256i s )
-//{
-//   return _mm256_permute4x64_epi64( s, 0x93 );
-//}
+#define  mm256_rotr_64( w, c ) \
+    _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )
+
+#define  mm256_rotl_64( w, c ) \
+    _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )

 // swap hi and lo 128 bits in 256 bit vector
 //  __m256i mm256_swap128( __m256i )
-#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 )
-//static inline __m256i mm256_swap128( __m256i s )
-//{
-//    return _mm256_permute2f128_si256( s, s, 1 );
-//}
+#define mm256_swap128( w ) \
+    _mm256_permute2f128_si256( w, w, 1 )

-#endif
+// Rotate 256 bits by 64 bits (4 uint64 by one uint64)
+//__m256i mm256_rotl256_1x64( _mm256i, int )
+#define mm256_rotl256_1x64( w ) \
+     _mm256_permute4x64_epi64( w, 0x39 )
+
+#define mm256_rotr256_1x64( w ) \
+     _mm256_permute4x64_epi64( w, 0x93 )
+
+// shift 256 bits by n*64 bits (4 uint64 by n uint64)
+#define mm256_slli256_1x64( w ) \
+    _mm256_and_si256( mm256_rotl256_1x64( w ), \
+                      _mm256_set_epi64x( 0, \
+                                         0xffffffffffffffffull, \
+                                         0xffffffffffffffffull, \
+                                         0xffffffffffffffffull ) )
+//                      _mm256_set_epi64x( 0xffffffffffffffffull, \
+//                                         0xffffffffffffffffull, \
+//                                         0xffffffffffffffffull, \
+//                                         0 ) )
+
+
+#define mm256_slli256_2x64( w ) \
+    _mm256_and_si256( mm256_swap128( w ), \
+                      _mm256_set_epi64x( 0xffffffffffffffffull, \
+                                         0xffffffffffffffffull, \
+                                         0, \
+                                         0 ) )
+
+#define mm256_slli256_3x64( w ) \
+    _mm256_and_si256( mm256_rotr256_1x64( w ), \
+                      _mm256_set_epi64x( 0xffffffffffffffffull, \
+                                         0, \
+                                         0, \
+                                         0 ) )
+
+#define mm256_srli256_1x64( w ) \
+    _mm256_and_si256( mm256_rotr256_1x64( w ), \
+                      _mm256_set_epi64x( 0, \
+                                         0xffffffffffffffffull, \
+                                         0xffffffffffffffffull, \
+                                         0xffffffffffffffffull ) )
+
+#define mm256_srli256_2x64( w ) \
+    _mm256_and_si256( mm256_swap128( w ), \
+                      _mm256_set_epi64x( 0, \
+                                         0, \
+                                         0xffffffffffffffffull, \
+                                         0xffffffffffffffffull ))
+
+#define mm256_srli256_3x64( w ) \
+    _mm256_and_si256( mm256_rotl256_1x64( w ), \
+                      _mm256_set_epi64x( 0xffffffffffffffffull, \
+                                         0, \
+                                         0, \
+                                         0 ) )
+//                      _mm256_set_epi64x( 0, \
+//                                         0, \
+//                                         0, \
+//                                         0xffffffffffffffffull ) )
+
+#endif  // AVX2

 // rotate bits in 2 uint64
 // _m128i mm_rotr_64( __m128i, int )
 #define  mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
                                      _mm_slli_epi64(w, 64 - c))
-//static inline __m128i  mm_rotr_64( __m128i w, int c )
-//{
-//      _mm_or_si128( _mm_srli_epi64( w, c ), 
-//                    _mm_slli_epi64( w, 64 - c ) );
-//}

 // swap 128 bit source vectors
 // void mm128_swap128( __m128i, __m128i )
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.4.8-dev])
+AC_INIT([cpuminer-opt], [4.3.8-dev])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -79,7 +79,7 @@ bool opt_debug_diff = false;
 bool opt_protocol = false;
 bool opt_benchmark = false;
 bool opt_redirect = true;
-bool opt_showdiff = false;
+bool opt_showdiff = true;
 bool opt_extranonce = true;
 bool want_longpoll = true;
 bool have_longpoll = false;
@@ -306,8 +306,8 @@ static bool work_decode(const json_t *val, struct work *work)
 {
    if ( !algo_gate.work_decode( val, work ) )
        return false;
-    if ((opt_showdiff || opt_max_diff > 0.) && !allow_mininginfo)
-        algo_gate.calc_network_diff( work );
+    if ( !allow_mininginfo )
+        net_diff = algo_gate.calc_network_diff( work );
    work->targetdiff = target_to_diff(work->target);
    // for api stats, on longpoll pools
    stratum_diff = work->targetdiff;
@@ -781,10 +781,6 @@ static int share_result( int result, struct work *work, const char *reason )
                       (uint32_t)cpu_temp(0) );
 #endif

-//   applog(LOG_NOTICE, "accepted: %lu/%lu (%s%%), %s %sH, %s %sH/s %s",
-//              accepted_count, total_submits, accepted_rate_s,
-//              hc, hc_units, hr, hr_units, sres );
-
   if (reason)
   {
 	applog(LOG_WARNING, "reject reason: %s", reason);
@@ -1467,7 +1463,7 @@ void std_build_extraheader( struct work* work, struct stratum_ctx* sctx )
   work->data[31] = 0x00000280;
 }

-void std_calc_network_diff( struct work* work )
+double std_calc_network_diff( struct work* work )
 {
   // sample for diff 43.281 : 1c05ea29
   // todo: endian reversed on longpoll could be zr5 specific...
@@ -1477,11 +1473,14 @@ void std_calc_network_diff( struct work* work )
   uint32_t bits  = ( nbits & 0xffffff );
   int16_t  shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
   int m;
-   net_diff = (double)0x0000ffff / (double)bits;
+   double d = (double)0x0000ffff / (double)bits;
   for ( m = shift; m < 29; m++ )
-       net_diff *= 256.0;
+       d *= 256.0;
   for ( m = 29; m < shift; m++ )
-       net_diff /= 256.0;
+       d /= 256.0;
+   if ( opt_debug_diff )
+      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
+   return d;
 }

 uint32_t* std_get_nonceptr( uint32_t *work_data )
@@ -1501,7 +1500,8 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
   
   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
-      && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) )
+      && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
+         || ( work->job_id != g_work->job_id ) ) )
   {
     work_free( work );
     work_copy( work, g_work );
@@ -1518,6 +1518,7 @@ void jr2_get_new_work( struct work* work, struct work* g_work, int thr_id,
                     uint32_t *end_nonce_ptr )
 {
   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
+
   // byte data[ 0..38, 43..75 ], skip over misaligned nonce [39..42]
   if ( memcmp( work->data, g_work->data, algo_gate.nonce_index )
     || memcmp( ((uint8_t*) work->data)   + JR2_WORK_CMP_INDEX_2,
@@ -1611,9 +1612,9 @@ static void *miner_thread( void *userdata )
      }
   }

-   if ( !algo_gate.miner_thread_init() )
+   if ( !algo_gate.miner_thread_init( thr_id ) )
   {
-      applog( LOG_ERR, "FAIL: thread %u failed to initialize");
+      applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id );
      exit (1);
   }

@@ -1660,6 +1661,8 @@ static void *miner_thread( void *userdata )
       } // do_this_thread
       algo_gate.resync_threads( &work );

+       // prevent dupes is called on every loop and has useful args so it
+       // is being used by zcoin to pass along the work height.
       if ( algo_gate.prevent_dupes( &work, &stratum, thr_id ) )
          continue;
       // prevent scans before a job is received
@@ -2075,8 +2078,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );

   algo_gate.build_extraheader( g_work, sctx );
-   if ( opt_showdiff || opt_max_diff > 0. )
-       algo_gate.calc_network_diff( g_work );
+   net_diff = algo_gate.calc_network_diff( g_work );
   algo_gate.set_work_data_endian( g_work );
   pthread_mutex_unlock( &sctx->work_lock );

@@ -2161,6 +2163,7 @@ static void *stratum_thread(void *userdata )
 
 	       sleep(opt_fail_pause);
           }
+
 	   if (jsonrpc_2)
           {
 		work_free(&g_work);
@@ -2168,15 +2171,15 @@ static void *stratum_thread(void *userdata )
 	   }
 	}

-	if (stratum.job.job_id &&
-	     (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
+        if (stratum.job.job_id &&
+             (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
 	{
 	   pthread_mutex_lock(&g_work_lock);
           algo_gate.stratum_gen_work( &stratum, &g_work );
 	   time(&g_work_time);
 	   pthread_mutex_unlock(&g_work_lock);

-	   if (stratum.job.clean || jsonrpc_2)
+           if (stratum.job.clean || jsonrpc_2)
           {
 		static uint32_t last_bloc_height;
 		if (!opt_quiet && last_bloc_height != stratum.bloc_height)
@@ -2554,7 +2557,7 @@ void parse_arg(int key, char *arg )
 		opt_extranonce = false;
 		break;
 	case 1013:
-		opt_showdiff = true;
+		opt_showdiff = false;
 		break;
 	case 1016:			/* --coinbase-addr */
 		pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg);
--- a/m4/placeholder-to-prevent-git-from-deleting-this-directory
+++ b/m4/placeholder-to-prevent-git-from-deleting-this-directory
--- a/miner.h
+++ b/miner.h
@@ -492,6 +492,7 @@ enum algos {
        ALGO_LUFFA,       
        ALGO_LYRA2RE,       
        ALGO_LYRA2REV2,   
+        ALGO_LYRA2Z,
        ALGO_M7M,
        ALGO_MYR_GR,      
        ALGO_NEOSCRYPT,
@@ -546,6 +547,7 @@ static const char *algo_names[] = {
        "luffa",
        "lyra2re",
        "lyra2rev2",
+        "lyra2z",
        "m7m",
        "myr-gr",
        "neoscrypt",
@@ -573,6 +575,7 @@ static const char *algo_names[] = {
        "x15",
        "x17",
        "yescrypt",
+        "lyra2z",
        "zr5",
        "\0"
 };
@@ -654,6 +657,7 @@ Options:\n\
                          luffa        Luffa\n\
                          lyra2re      lyra2\n\
                          lyra2rev2    lyrav2\n\
+                          lyra2z       Zcoin (XZC)\n\
                          m7m          Magi (XMG)\n\
                          myr-gr       Myriad-Groestl\n\
                          neoscrypt    NeoScrypt(128, 2, 1)\n\
@@ -700,6 +704,7 @@ Options:\n\
      --randomize       Randomize scan range start to reduce duplicates\n\
  -f, --diff-factor     Divide req. difficulty by this factor (std is 1.0)\n\
  -m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\
+      --hide-diff       Do not display changes in difficulty\n\
  -n, --nfactor         neoscrypt N-Factor\n\
      --coinbase-addr=ADDR  payout address for solo mining\n\
      --coinbase-sig=TEXT  data to insert in the coinbase when possible\n\
@@ -763,6 +768,7 @@ static struct option const options[] = {
        { "diff-factor", 1, NULL, 'f' },
        { "diff", 1, NULL, 'f' }, // deprecated (alias)
        { "diff-multiplier", 1, NULL, 'm' },
+        { "hide-diff", 0, NULL, 1013 },
        { "help", 0, NULL, 'h' },
        { "nfactor", 1, NULL, 'n' },
        { "no-gbt", 0, NULL, 1011 },
@@ -783,7 +789,6 @@ static struct option const options[] = {
        { "retry-pause", 1, NULL, 'R' },
        { "randomize", 0, NULL, 1024 },
        { "scantime", 1, NULL, 's' },
-        { "show-diff", 0, NULL, 1013 },
 #ifdef HAVE_SYSLOG_H
        { "syslog", 0, NULL, 'S' },
 #endif
--- a/util.c
+++ b/util.c
@@ -1630,7 +1630,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
 		      hashrate += thr_hashrates[i];
                   pthread_mutex_unlock(&stats_lock);
 		   double diff = trunc( ( ((double)0xffffffff) / target ) );
-		   if (!opt_quiet)
+		   if ( opt_showdiff )
 		      // xmr pool diff can change a lot...
 		      applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
 		   stratum_diff = diff;