diff --git a/Makefile.am b/Makefile.am
index c5f9a8e..832fe6f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -105,6 +105,7 @@ cpuminer_SOURCES = \
   algo/lyra2/sponge.c \
   algo/lyra2/lyra2rev2.c \
   algo/lyra2/lyra2re.c \
+  algo/lyra2/zcoin.c \
   algo/keccak/sse2/keccak.c \
   algo/m7m.c \
   algo/neoscrypt.c \
diff --git a/RELEASE_ANNOUNCEMENT b/RELEASE_ANNOUNCEMENT
index 87c6f65..efb153a 100644
--- a/RELEASE_ANNOUNCEMENT
+++ b/RELEASE_ANNOUNCEMENT
@@ -12,7 +12,9 @@ comparison below.
 
 New in 3.4.8
 
+- added zcoin support; AVX2 code is included but shows no performance gain yet
 - fixed API display of diff for cryptonight
+- --show-diff is now the default, use "--hide-diff" to disable
 - cleaned up some cpuminer-multi artifacts
 
 Users with non-SSE2 CPUs or who want to mine algos not supported by
diff --git a/algo-gate-api.c b/algo-gate-api.c
index f1d1b01..d161fe2 100644
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -169,6 +169,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_LUFFA:       register_luffa_algo     ( gate ); break;
     case ALGO_LYRA2RE:     register_lyra2re_algo   ( gate ); break;
     case ALGO_LYRA2REV2:   register_lyra2rev2_algo ( gate ); break;
+    case ALGO_LYRA2Z:      register_zcoin_algo     ( gate ); break;
     case ALGO_M7M:         register_m7m_algo       ( gate ); break;
     case ALGO_MYR_GR:      register_myriad_algo    ( gate ); break;
     case ALGO_NEOSCRYPT:   register_neoscrypt_algo ( gate ); break;
@@ -268,6 +269,7 @@ const char* const algo_alias_map[][2] =
   { "sib",    "x11gost"  },
   { "yes",    "yescrypt" },
   { "ziftr",  "zr5"      },
+  { "zcoin",  "lyra2z"   },
   { NULL, NULL }
 };
 
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 4d72b09..bf7bebf 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -115,7 +115,7 @@ void ( *hash_alt ) ( void*, const void*, uint32_t );
 void ( *hash_suw ) ( void*, const void* );
 
 //optional, safe to use default in most cases
-bool ( *miner_thread_init ) ();
+bool ( *miner_thread_init ) ( int );
 
 void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
 void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*, bool );
@@ -129,7 +129,7 @@ bool ( *submit_getwork_result ) ( CURL*, struct work* );
 void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
 void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
 void ( *set_work_data_endian ) ( struct work* );
-void ( *calc_network_diff ) ( struct work* );
+double ( *calc_network_diff ) ( struct work* );
 void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
 bool ( *prevent_dupes ) ( struct work*, struct stratum_ctx*, int );
 void ( *resync_threads ) ( struct work* );
@@ -229,7 +229,7 @@ void jr2_build_stratum_request ( char *req, struct work *work );
 
 // default is do_nothing;
 void swab_work_data( struct work *work );
 
-void std_calc_network_diff( struct work *work );
+double std_calc_network_diff( struct work *work );
 
 void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
diff --git a/algo/blake/decred.c b/algo/blake/decred.c
index 44ac63f..b6684aa 100644
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -115,7 +115,7 @@ uint32_t *decred_get_nonceptr( uint32_t *work_data )
 
 // does it need to increment nonce, seems not because gen_work_now always
 // returns true
-void decred_calc_network_diff( struct work* work )
+double decred_calc_network_diff( struct work* work )
 {
    // sample for diff 43.281 : 1c05ea29
    // todo: endian reversed on longpoll could be zr5 specific...
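Note on the gate changes above: miner_thread_init becomes a per-thread hook (it now receives the thread id), and calc_network_diff now returns the computed difficulty instead of each algo writing the net_diff global itself. A minimal call-site sketch of what that implies; the struct and function names here are illustrative, only the two pointer signatures come from algo-gate-api.h:

    /* Sketch only, not part of the patch. */
    #include <stdbool.h>

    struct work;   /* opaque here; defined in miner.h upstream */

    typedef struct
    {
       bool   (*miner_thread_init)( int thr_id );    /* now takes a thread id */
       double (*calc_network_diff)( struct work* );  /* now returns the diff  */
    } algo_gate_sketch_t;

    static double net_diff = 0.;   /* one assignment point in the caller */

    static void on_new_work( algo_gate_sketch_t *gate, struct work *w )
    {
       net_diff = gate->calc_network_diff( w );
    }
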
@@ -123,17 +123,18 @@ void decred_calc_network_diff( struct work* work )
    uint32_t bits = ( nbits & 0xffffff );
    int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
    int m;
-   net_diff = (double)0x0000ffff / (double)bits;
+   double d = (double)0x0000ffff / (double)bits;
    for ( m = shift; m < 29; m++ )
-       net_diff *= 256.0;
+       d *= 256.0;
    for ( m = 29; m < shift; m++ )
-       net_diff /= 256.0;
+       d /= 256.0;
    if ( shift == 28 )
-       net_diff *= 256.0; // testnet
+       d *= 256.0; // testnet
    if ( opt_debug_diff )
-      applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", net_diff,
-              shift, bits);
+      applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
+              shift, bits );
+   return d;
 }
 
 void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
diff --git a/algo/drop.c b/algo/drop.c
index 2b16fb6..aa3dae3 100644
--- a/algo/drop.c
+++ b/algo/drop.c
@@ -233,11 +233,6 @@ void drop_get_new_work( struct work* work, struct work* g_work, int thr_id,
    ++(*nonceptr);
 }
 
-void drop_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
 void drop_display_pok( struct work* work )
 {
    if ( work->data[0] & 0x00008000 )
diff --git a/algo/lbry.c b/algo/lbry.c
index d7bd8d4..d2f081c 100644
--- a/algo/lbry.c
+++ b/algo/lbry.c
@@ -142,7 +142,7 @@ int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
    return 0;
 }
 
-void lbry_calc_network_diff(struct work *work)
+double lbry_calc_network_diff( struct work *work )
 {
    // sample for diff 43.281 : 1c05ea29
    // todo: endian reversed on longpoll could be zr5 specific...
@@ -159,7 +159,7 @@ void lbry_calc_network_diff(struct work *work)
    if (opt_debug_diff)
       applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
 
-   net_diff = d;
+   return d;
 }
 
 // std_le should work but it doesn't
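Both lbry_calc_network_diff and decred_calc_network_diff decode the compact nbits target and now return the result. A self-contained sketch of that conversion with illustrative names (the real functions extract shift via swab32(nbits), and decred adds a testnet special case):

    /* Sketch only: compact nbits -> network difficulty. */
    #include <stdint.h>
    #include <stdio.h>

    static double nbits_to_diff( uint32_t nbits )
    {
       uint32_t bits  = nbits & 0xffffff;        /* mantissa */
       int      shift = ( nbits >> 24 ) & 0xff;  /* exponent, 0x1c = 28 */
       double   d     = (double)0x0000ffff / (double)bits;
       int m;
       for ( m = shift; m < 29; m++ ) d *= 256.0;
       for ( m = 29; m < shift; m++ ) d /= 256.0;
       return d;
    }

    int main()
    {
       /* the sample quoted in the comments above: 0x1c05ea29 -> ~43.281 */
       printf( "%f\n", nbits_to_diff( 0x1c05ea29 ) );
       return 0;
    }
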
diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c
index 5f1b527..fab6da5 100644
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -44,171 +44,344 @@
  *
  * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
  */
-int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols)
+
+// Lyra2RE & Lyra2REv2, nRows must be a power of 2
+int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+           const void *salt, uint64_t saltlen, uint64_t timeCost,
+           const uint64_t nRows, const uint64_t nCols )
 {
-   //============================= Basic variables ============================//
-   int64_t row = 2; //index of row to be processed
-   int64_t prev = 1; //index of prev (last row ever computed/modified)
-   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
-   int64_t tau; //Time Loop iterator
-   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
-   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
-   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-   int64_t i; //auxiliary iteration counter
-   int64_t v64; // 64bit var for memcpy
-   //==========================================================================/
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //auxiliary iteration counter
+   int64_t v64; // 64bit var for memcpy
+   //====================================================================/

-   //========== Initializing the Memory Matrix and pointers to it =============//
-   //Tries to allocate enough space for the whole memory matrix
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix

-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-   // for Lyra2REv2, nCols = 4, v1 was using 8
-   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;

-   i = (int64_t)ROW_LEN_BYTES * nRows;
-   uint64_t *wholeMatrix = malloc(i);
-   if (wholeMatrix == NULL) {
-      return -1;
-   }
-   memset(wholeMatrix, 0, i);
+   i = (int64_t)ROW_LEN_BYTES * nRows;
+   uint64_t *wholeMatrix = malloc(i);
+   if (wholeMatrix == NULL)
+      return -1;

-   //Allocates pointers to each row of the matrix
-   uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
-   if (memMatrix == NULL) {
-      return -1;
-   }
-   //Places the pointers in the correct positions
-   uint64_t *ptrWord = wholeMatrix;
-   for (i = 0; i < nRows; i++) {
-      memMatrix[i] = ptrWord;
-      ptrWord += ROW_LEN_INT64;
-   }
-   //==========================================================================/
+   memset(wholeMatrix, 0, i);

-   //============= Getting the password + salt + basil padded with 10*1 ===============//
-   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
-   //but this ensures that the password copied locally will be overwritten as soon as possible
+   //Allocates pointers to each row of the matrix
+   uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
+   if (memMatrix == NULL)
+   {  free(wholeMatrix);  return -1;  }

-   //First, we clean enough blocks for the password, salt, basil and padding
-   int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+   //Places the pointers in the correct positions
+   uint64_t *ptrWord = wholeMatrix;
+   for (i = 0; i < nRows; i++)
+   {
+      memMatrix[i] = ptrWord;
+      ptrWord += ROW_LEN_INT64;
+   }

-   byte *ptrByte = (byte*) wholeMatrix;
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible

-   //Prepends the password
-   memcpy(ptrByte, pwd, pwdlen);
-   ptrByte += pwdlen;
+   //First, we clean enough blocks for the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                            / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;

-   //Concatenates the salt
-   memcpy(ptrByte, salt, saltlen);
-   ptrByte += saltlen;
+   byte *ptrByte = (byte*) wholeMatrix;

-   memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen));
+
//Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = pwdlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = saltlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = timeCost; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nRows; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nCols; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - //==========================================================================/ + memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES + - (saltlen + pwdlen) ); - //======================= Initializing the Sponge State ====================// - //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t _ALIGN(256) state[16]; - initState(state); - //==========================================================================/ + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); - //================================ Setup Phase =============================// - //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits - ptrWord = wholeMatrix; - for (i = 0; i < nBlocksInput; i++) { - absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) - } + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + //================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + initState(state); - reducedDuplexRow1(state, memMatrix[0], 
memMatrix[1], nCols);
+
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+   ptrWord = wholeMatrix;
+   for (i = 0; i < nBlocksInput; i++)
+   {
+      absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+      ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }

-   do {
-      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here

-      reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+   reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);

-      //updates the value of row* (deterministically picked during Setup))
-      rowa = (rowa + step) & (window - 1);
-      //update prev: it now points to the last row ever computed
-      prev = row;
-      //updates row: goes to the next row to be computed
-      row++;
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

-      //Checks if all rows in the window where visited.
-      if (rowa == 0) {
-         step = window + gap; //changes the step: approximately doubles its value
-         window *= 2; //doubles the size of the re-visitation window
-         gap = -gap; //inverts the modifier to the step
-      }
+      reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);

-   } while (row < nRows);
-   //==========================================================================/
+      //updates the value of row* (deterministically picked during Setup)
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;

-   //============================ Wandering Phase =============================//
-   row = 0; //Resets the visitation to the first row of the memory matrix
-   for (tau = 1; tau <= timeCost; tau++) {
-      //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
-      step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
-      do {
-         //Selects a pseudorandom index row*
-         //------------------------------------------------------------------------------------------
-         rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
-         //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
-         //------------------------------------------------------------------------------------------
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }

-         //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
-         reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+   } while (row < nRows);

-         //update prev: it now points to the last row ever computed
-         prev = row;
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+      step = (tau % 2 == 0) ?
-1 : nRows / 2 - 1; + do + { + //Selects a pseudorandom index row* + //----------------------------------------------- + rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------- - //updates row: goes to the next row to be computed - //------------------------------------------------------------------------------------------ - row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //------------------------------------------------------------------------------------------ + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); - } while (row != 0); - } + //update prev: it now points to the last row ever computed + prev = row; - //============================ Wrap-up Phase ===============================// - //Absorbs the last block of the memory matrix - absorbBlock(state, memMatrix[rowa]); + //updates row: goes to the next row to be computed + //---------------------------------------------------- + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //---------------------------------------------------- - //Squeezes the key - squeeze(state, K, (unsigned int) kLen); + } while (row != 0); + } - //========================= Freeing the memory =============================// - free(memMatrix); - free(wholeMatrix); + //===================== Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, memMatrix[rowa]); - return 0; + //Squeezes the key + squeeze(state, K, (unsigned int) kLen); + + //================== Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + return 0; } + +// Zcoin, nRows may be any value +int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, + const void *salt, uint64_t saltlen, uint64_t timeCost, + uint64_t nRows, uint64_t nCols ) +{ + //========================== Basic variables ============================// + uint64_t _ALIGN(256) state[16]; + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + //=======================================================================/ + + //======= Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + + i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); + uint64_t *wholeMatrix = malloc(i); + if (wholeMatrix == NULL) + return -1; + + memset(wholeMatrix, 0, i); + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(nRows * 
sizeof (uint64_t*));
+   if (memMatrix == NULL)
+   {  free(wholeMatrix);  return -1;  }
+
+   //Places the pointers in the correct positions
+   uint64_t *ptrWord = wholeMatrix;
+   for (i = 0; i < nRows; i++)
+   {
+      memMatrix[i] = ptrWord;
+      ptrWord += ROW_LEN_INT64;
+   }
+
+   //==== Getting the password + salt + basil padded with 10*1 ============//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible
+
+   //First, we clean enough blocks for the password, salt, basil and padding
+   uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) )
+                             / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+   byte *ptrByte = (byte*) wholeMatrix;
+   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &nRows, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &nCols, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the password
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+   //=================== Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+   initState( state );
+
+   //============================== Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+   ptrWord = wholeMatrix;
+   for ( i = 0; i < nBlocksInput; i++ )
+   {
+      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
+   }
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0( state, memMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+   reducedDuplexRow1( state, memMatrix[0], memMatrix[1], nCols );
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+      reducedDuplexRowSetup( state, memMatrix[prev], memMatrix[rowa],
+                             memMatrix[row], nCols );
+
+      //updates the value of row* (deterministically picked during Setup)
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //======================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for ( tau = 1; tau <= timeCost; tau++ )
+   {
+      //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+      step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+      do {
+         //Selects a pseudorandom index row*
+         //----------------------------------------------------------------------
+         //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+         rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+         //-----------------------------------------------------------------
+
+         //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+         reducedDuplexRow( state, memMatrix[prev], memMatrix[rowa],
+                           memMatrix[row], nCols );
+
+         //update prev: it now points to the last row ever computed
+         prev = row;
+
+         //updates row: goes to the next row to be computed
+         //---------------------------------------------------------------
+         //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+         row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+         //--------------------------------------------------------------------
+
+      } while (row != 0);
+   }
+
+   //========================= Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock( state, memMatrix[rowa] );
+
+   //Squeezes the key
+   squeeze( state, K, kLen );
+
+   //====================== Freeing the memory =============================//
+   free( memMatrix );
+   free( wholeMatrix );
+
+   return 0;
+}
+
diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h
index edf9179..3a9403b 100644
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -37,6 +37,10 @@ typedef unsigned char byte;
 #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
 #endif
 
-int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
-
+int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+           const void *salt, uint64_t saltlen, uint64_t timeCost,
+           uint64_t nRows, uint64_t nCols );
+int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+            const void *salt, uint64_t saltlen, uint64_t timeCost,
+            uint64_t nRows, uint64_t nCols );
 
 #endif /* LYRA2_H_ */
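The functional difference between LYRA2 and LYRA2Z above is confined to row selection in the Wandering phase: LYRA2 assumes nRows is a power of 2 and masks, while LYRA2Z uses the generic modulo so zcoin's row count need not be a power of 2. Side by side, with state0 standing for state[0] after the reduced duplexing (sketch only):

    #include <stdint.h>

    /* Lyra2RE/REv2: nRows is a power of 2, a mask suffices */
    static uint64_t pick_row_pow2( uint64_t state0, uint64_t nRows )
    {
       return state0 & ( nRows - 1 );
    }

    /* Lyra2Z (zcoin): nRows may be arbitrary */
    static uint64_t pick_row_generic( uint64_t state0, uint64_t nRows )
    {
       return state0 % nRows;
    }
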
diff --git a/algo/lyra2/sponge.c b/algo/lyra2/sponge.c
index e746ede..0ab66ba 100644
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -25,6 +25,8 @@
 #include "sponge.h"
 #include "lyra2.h"
 
+
+
 /**
  * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
  * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges
@@ -36,10 +38,8 @@
  *
  * @param state The 1024-bit array to be initialized
  */
-void initState(uint64_t state[/*16*/])
-{
+inline void initState(uint64_t state[/*16*/]) {
 #ifdef __AVX2__
-
    (*(__m256i*)(&state[0])) = _mm256_setzero_si256();
    (*(__m256i*)(&state[4])) = _mm256_setzero_si256();
@@ -56,18 +56,17 @@ void initState(uint64_t state[/*16*/])
 //#elif defined __AVX__
 #else
-
-   //First 512 bis are zeros
-   memset(state, 0, 64);
-   //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
-   state[8] = blake2b_IV[0];
-   state[9] = blake2b_IV[1];
-   state[10] = blake2b_IV[2];
-   state[11] = blake2b_IV[3];
-   state[12] = blake2b_IV[4];
-   state[13] = blake2b_IV[5];
-   state[14] = blake2b_IV[6];
-   state[15] = blake2b_IV[7];
+   //First 512 bits are zeros
+   memset(state, 0, 64);
+   //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
+   state[8] = blake2b_IV[0];
+   state[9] = blake2b_IV[1];
+   state[10] = blake2b_IV[2];
+   state[11] = blake2b_IV[3];
+   state[12] = blake2b_IV[4];
+   state[13] = blake2b_IV[5];
+   state[14] = blake2b_IV[6];
+   state[15] = blake2b_IV[7];
 #endif
 }
@@ -76,23 +75,11 @@ void initState(uint64_t state[/*16*/])
  *
  * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
  */
-__inline static void blake2bLyra( uint64_t *v )
-{
+inline static void blake2bLyra(uint64_t *v) {
 #if defined __AVX2__
-
+// may still be used by squeeze
    LYRA_INIT_AVX2; // defines local a[4]
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
+   LYRA_12_ROUNDS_AVX2( a[0], a[1], a[2], a[3] );
    LYRA_CLOSE_AVX2;
 #elif defined __AVX__
@@ -113,20 +100,18 @@ __inline static void blake2bLyra( uint64_t *v )
    LYRA_CLOSE_AVX;
 #else
-
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-
+   ROUND_LYRA(0);
+   ROUND_LYRA(1);
+   ROUND_LYRA(2);
+   ROUND_LYRA(3);
+   ROUND_LYRA(4);
+   ROUND_LYRA(5);
+   ROUND_LYRA(6);
+   ROUND_LYRA(7);
+   ROUND_LYRA(8);
+   ROUND_LYRA(9);
+   ROUND_LYRA(10);
+   ROUND_LYRA(11);
 #endif
 }
@@ -134,19 +119,8 @@ __inline static void blake2bLyra( uint64_t *v )
  * Executes a reduced version of Blake2b's G function with only one round
  * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
  */
-__inline static void reducedBlake2bLyra(uint64_t *v) {
-
-#if defined __AVX2__
-   LYRA_INIT_AVX2; // defines local a[4]
-   LYRA_ROUND_AVX2;
-   LYRA_CLOSE_AVX2;
-#elif defined __AVX__
-   LYRA_INIT_AVX; // defines locals a0[4], a1[4]
-   LYRA_ROUND_AVX;
-   LYRA_CLOSE_AVX;
-#else
-   ROUND_LYRA(0);
-#endif
+inline static void reducedBlake2bLyra(uint64_t *v) {
+   ROUND_LYRA(0);
 }
 
 /**
@@ -157,21 +131,21 @@ __inline static void reducedBlake2bLyra(uint64_t *v) {
  * @param out Array that will receive the data squeezed
  * @param len The number of bytes to be squeezed into the "out" array
 */
-void squeeze(uint64_t *state, byte *out, unsigned int len)
+inline void squeeze( uint64_t *state, byte *out, unsigned int len )
 {
-   int fullBlocks = len / BLOCK_LEN_BYTES;
-   byte *ptr = out;
-   int i;
+   int fullBlocks = len / BLOCK_LEN_BYTES;
+   byte *ptr = out;
+   int i;
 
-   //Squeezes full blocks
-   for (i = 0; i < fullBlocks; i++) {
-      memcpy(ptr, state, BLOCK_LEN_BYTES);
-      blake2bLyra(state);
-      ptr += BLOCK_LEN_BYTES;
-   }
-
-   //Squeezes remaining bytes
-   memcpy(ptr,
state, (len % BLOCK_LEN_BYTES)); + //Squeezes full blocks + for ( i = 0; i < fullBlocks; i++ ) + { + memcpy(ptr, state, BLOCK_LEN_BYTES); + blake2bLyra(state); + ptr += BLOCK_LEN_BYTES; + } + //Squeezes remaining bytes + memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); } /** @@ -181,12 +155,10 @@ void squeeze(uint64_t *state, byte *out, unsigned int len) * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_INT64 words) */ -void absorbBlock(uint64_t *state, const uint64_t *in) -{ -//XORs the first BLOCK_LEN_INT64 words of "in" with the current state +inline void absorbBlock(uint64_t *state, const uint64_t *in) { #if defined __AVX2__ - __m256i state_v[3], in_v[3]; + __m256i state_v[4], in_v[3]; // only state is guaranteed aligned 256 state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); @@ -195,13 +167,18 @@ void absorbBlock(uint64_t *state, const uint64_t *in) in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&in[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); - _mm256_store_si256( (__m256i*)&state[0], - _mm256_xor_si256( state_v[0], in_v[0] ) ); - _mm256_store_si256( (__m256i*)&state[4], - _mm256_xor_si256( state_v[1], in_v[1] ) ); - _mm256_store_si256( (__m256i*)&state[8], - _mm256_xor_si256( state_v[2], in_v[2] ) ); + state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] ); + state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] ); + state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] ); + + LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); #elif defined __AVX__ @@ -221,6 +198,8 @@ void absorbBlock(uint64_t *state, const uint64_t *in) in_v[4] = _mm_load_si128( (__m128i*)(&in[8]) ); in_v[5] = _mm_load_si128( (__m128i*)(&in[10]) ); +// do blake2bLyra without init +// LYRA_ROUND_AVX2( state_v ) _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], in_v[0] ) ); _mm_store_si128( (__m128i*)(&state[2]), @@ -234,26 +213,28 @@ void absorbBlock(uint64_t *state, const uint64_t *in) _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], in_v[5] ) ); -#else + //Applies the transformation f to the sponge's state + blake2bLyra(state); - state[0] ^= in[0]; - state[1] ^= in[1]; - state[2] ^= in[2]; - state[3] ^= in[3]; - state[4] ^= in[4]; - state[5] ^= in[5]; - state[6] ^= in[6]; - state[7] ^= in[7]; - state[8] ^= in[8]; - state[9] ^= in[9]; - state[10] ^= in[10]; - state[11] ^= in[11]; +#else + //XORs the first BLOCK_LEN_INT64 words of "in" with the current state + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + state[8] ^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); #endif - -//Applies the transformation f to the sponge's state -blake2bLyra(state); - } /** @@ -263,23 +244,28 @@ blake2bLyra(state); * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) -{ - -//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with 
the current state +inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { + //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state #if defined __AVX2__ - __m256i state_v[2], in_v[2]; + __m256i state_v[4], in_v[2]; state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); in_v [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); - _mm256_store_si256( (__m256i*)(&state[0]), - _mm256_xor_si256( state_v[0], in_v[0] ) ); - _mm256_store_si256( (__m256i*)(&state[4]), - _mm256_xor_si256( state_v[1], in_v[1] ) ); + state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] ); + state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] ); + + LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); #elif defined __AVX__ @@ -304,22 +290,24 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], in_v[3] ) ); + //Applies the transformation f to the sponge's state + blake2bLyra(state); + #else - state[0] ^= in[0]; - state[1] ^= in[1]; - state[2] ^= in[2]; - state[3] ^= in[3]; - state[4] ^= in[4]; - state[5] ^= in[5]; - state[6] ^= in[6]; - state[7] ^= in[7]; + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + //Applies the transformation f to the sponge's state + blake2bLyra(state); #endif -//Applies the transformation f to the sponge's state -blake2bLyra(state); - } /** @@ -330,21 +318,34 @@ blake2bLyra(state); * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) +inline void reducedSqueezeRow0( uint64_t* state, uint64_t* rowOut, + uint64_t nCols ) { - uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] - unsigned int i; - //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < nCols; i++) - { + uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + int i; + //M[row][C-1-col] = H.reduced_squeeze() + +#if defined __AVX2__ + __m256i state_v[4]; + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); +#endif + + for ( i = 0; i < nCols; i++ ) + { + #if defined __AVX2__ - _mm256_storeu_si256( (__m256i*)&ptrWord[0], - _mm256_load_si256( (__m256i*)(&state[0]) ) ); - _mm256_storeu_si256( (__m256i*)&ptrWord[4], - _mm256_load_si256( (__m256i*)(&state[4]) ) ); - _mm256_storeu_si256( (__m256i*)&ptrWord[8], - _mm256_load_si256( (__m256i*)(&state[8]) ) ); + _mm256_storeu_si256( (__m256i*)&ptrWord[0], state_v[0] ); + _mm256_storeu_si256( (__m256i*)&ptrWord[4], state_v[1] ); + _mm256_storeu_si256( (__m256i*)&ptrWord[8], state_v[2] ); + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + LYRA_ROUND_AVX2( 
state_v[0], state_v[1], state_v[2], state_v[3] ); #elif defined __AVX__ @@ -361,28 +362,42 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) _mm_store_si128( (__m128i*)(&ptrWord[10]), _mm_load_si128( (__m128i*)(&state[10]) ) ); -#else + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; - ptrWord[0] = state[0]; - ptrWord[1] = state[1]; - ptrWord[2] = state[2]; - ptrWord[3] = state[3]; - ptrWord[4] = state[4]; - ptrWord[5] = state[5]; - ptrWord[6] = state[6]; - ptrWord[7] = state[7]; - ptrWord[8] = state[8]; - ptrWord[9] = state[9]; - ptrWord[10] = state[10]; - ptrWord[11] = state[11]; + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + +#else + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] = state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); #endif - //Goes to next block (column) that will receive the squeezed data - ptrWord -= BLOCK_LEN_INT64; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); } + +#if defined __AVX2__ + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); +#endif + } /** @@ -394,34 +409,43 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols) +inline void reducedDuplexRow1( uint64_t *state, uint64_t *rowIn, + uint64_t *rowOut, uint64_t nCols ) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - unsigned int i; - - for (i = 0; i < nCols; i++) - { - //Absorbing "M[prev][col]" - #if defined __AVX2__ - - __m256i state_v[3], in_v[3]; + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; +#if defined __AVX2__ + __m256i state_v[4], in_v[3]; state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); - in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); - in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); +#endif + + for ( i = 0; i < nCols; i++ ) + { +#if defined __AVX2__ + + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); - _mm256_store_si256( (__m256i*)(&state[0]), + state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] ); + state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] ); + state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] ); + + LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + 
_mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], in_v[0] ) ); - _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], in_v[1] ) ); - _mm256_store_si256( (__m256i*)(&state[8]), + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) ); - #elif defined __AVX__ +#elif defined __AVX__ __m128i state_v[6], in_v[6]; @@ -452,29 +476,31 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], in_v[5] ) ); - #else + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); - state[0] ^= (ptrWordIn[0]); - state[1] ^= (ptrWordIn[1]); - state[2] ^= (ptrWordIn[2]); - state[3] ^= (ptrWordIn[3]); - state[4] ^= (ptrWordIn[4]); - state[5] ^= (ptrWordIn[5]); - state[6] ^= (ptrWordIn[6]); - state[7] ^= (ptrWordIn[7]); - state[8] ^= (ptrWordIn[8]); - state[9] ^= (ptrWordIn[9]); - state[10] ^= (ptrWordIn[10]); - state[11] ^= (ptrWordIn[11]); +#else - #endif + //Absorbing "M[prev][col]" + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= (ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); +#endif - //M[row][C-1-col] = M[prev][col] XOR rand #if defined __AVX2__ - state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); +/* state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); @@ -484,7 +510,7 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const _mm256_xor_si256( state_v[1], in_v[1] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) ); - +*/ #elif defined __AVX__ state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); @@ -509,25 +535,34 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const #else - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; - ptrWordOut[4] = ptrWordIn[4] ^ state[4]; - ptrWordOut[5] = ptrWordIn[5] ^ state[5]; - ptrWordOut[6] = ptrWordIn[6] ^ state[6]; - ptrWordOut[7] = ptrWordIn[7] ^ state[7]; - ptrWordOut[8] = ptrWordIn[8] ^ state[8]; - ptrWordOut[9] = ptrWordIn[9] ^ state[9]; - ptrWordOut[10] = ptrWordIn[10] ^ state[10]; - ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - #endif + //M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; +#endif + + 
//Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } + +#if defined __AVX2__ + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); +#endif - //Input: next column (i.e., next block in sequence) - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } } /** @@ -544,46 +579,86 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const * @param rowOut Row receiving the output * */ -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +inline void reducedDuplexRowSetup( uint64_t *state, uint64_t *rowIn, + uint64_t *rowInOut, uint64_t *rowOut, + uint64_t nCols ) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - unsigned int i; + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; - for (i = 0; i < nCols; i++) - { - //Absorbing "M[prev] [+] M[row*]" - #if defined __AVX2__ +#if defined __AVX2__ + __m256i state_v[4], in_v[3], inout_v[3]; + #define t_state in_v - __m256i state_v[3], in_v[3], inout_v[3]; + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); - state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + for ( i = 0; i < nCols; i++ ) + { in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) ); - state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) ); - state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); - _mm256_store_si256( (__m256i*)(&state[0]), - _mm256_xor_si256( state_v[0], - _mm256_add_epi64( in_v[0], - inout_v[0] ) ) ); - _mm256_store_si256( (__m256i*)(&state[4]), - _mm256_xor_si256( state_v[1], - _mm256_add_epi64( in_v[1], - inout_v[1] ) ) ); - _mm256_store_si256( (__m256i*)(&state[8]), - _mm256_xor_si256( state_v[2], - _mm256_add_epi64( in_v[2], - inout_v[2] ) ) ); - #elif defined __AVX__ + state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0], + inout_v[0] ) ); + state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1], + inout_v[1] ) ); + state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2], + inout_v[2] ) ); + + LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], in_v[2] ) ); + + //M[row*][col] = M[row*][col] 
XOR rotW(rand) + t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 ); + t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 ); + t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 ); + + inout_v[0] = _mm256_xor_si256( inout_v[0], + _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) ); + inout_v[1] = _mm256_xor_si256( inout_v[1], + _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) ); + inout_v[2] = _mm256_xor_si256( inout_v[2], + _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) ); + + _mm256_storeu_si256( (__m256i*)&ptrWordInOut[0], inout_v[0] ); + _mm256_storeu_si256( (__m256i*)&ptrWordInOut[4], inout_v[1] ); + _mm256_storeu_si256( (__m256i*)&ptrWordInOut[8], inout_v[2] ); + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); + + #undef t_state + +#elif defined __AVX__ __m128i state_v[6], in_v[6], inout_v[6]; + for ( i = 0; i < nCols; i++ ) + { + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); @@ -630,39 +705,34 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, _mm_add_epi64( in_v[5], inout_v[5] ) ) ); - #else + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); +#else + for ( i = 0; i < nCols; i++ ) + { - state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); - state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); - state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); - state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); - state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); - state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); - state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); - state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); - state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); - state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); - state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); - state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - #endif + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand +#endif - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - //M[row][col] = M[prev][col] XOR rand #if defined __AVX2__ - state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); - state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); - state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); - - _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), - _mm256_xor_si256( state_v[0], in_v[0] ) ); - _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), - 
_mm256_xor_si256( state_v[1], in_v[1] ) ); - _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), - _mm256_xor_si256( state_v[2], in_v[2] ) ); - #elif defined __AVX__ state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); @@ -687,40 +757,49 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, #else - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; - ptrWordOut[4] = ptrWordIn[4] ^ state[4]; - ptrWordOut[5] = ptrWordIn[5] ^ state[5]; - ptrWordOut[6] = ptrWordIn[6] ^ state[6]; - ptrWordOut[7] = ptrWordIn[7] ^ state[7]; - ptrWordOut[8] = ptrWordIn[8] ^ state[8]; - ptrWordOut[9] = ptrWordIn[9] ^ state[9]; - ptrWordOut[10] = ptrWordIn[10] ^ state[10]; - ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - #endif + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; +#endif - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[11]; - ptrWordInOut[1] ^= state[0]; - ptrWordInOut[2] ^= state[1]; - ptrWordInOut[3] ^= state[2]; - ptrWordInOut[4] ^= state[3]; - ptrWordInOut[5] ^= state[4]; - ptrWordInOut[6] ^= state[5]; - ptrWordInOut[7] ^= state[6]; - ptrWordInOut[8] ^= state[7]; - ptrWordInOut[9] ^= state[8]; - ptrWordInOut[10] ^= state[9]; - ptrWordInOut[11] ^= state[10]; + //M[row*][col] = M[row*][col] XOR rotW(rand) +// Need to fix this before taking state load/store out of loop +#ifdef __AVX2__ + + +#else + + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } + +#endif - //Inputs: next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } } /** @@ -737,22 +816,26 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, * @param rowOut Row receiving the output * */ -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +inline void reducedDuplexRow( uint64_t *state, uint64_t *rowIn, + uint64_t *rowInOut, uint64_t *rowOut, + uint64_t nCols ) { - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - unsigned int i; + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; - for (i = 0; i < nCols; i++) - 
{ - //Absorbing "M[prev] [+] M[row*]" - #if defined __AVX2__ +#if defined __AVX2__ - __m256i state_v[3], in_v[3], inout_v[3]; + for ( i = 0; i < nCols; i++) + { + + //Absorbing "M[prev] [+] M[row*]" + + __m256i state_v[4], in_v[3], inout_v[3]; #define out_v in_v // reuse register in next code block - + #define t_state in_v state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) ); @@ -762,21 +845,82 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); + + state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0], + inout_v[0] ) ); + state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1], + inout_v[1] ) ); + state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2], + inout_v[2] ) ); + + out_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) ); + out_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) ); + out_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) ); + + LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], out_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], out_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], out_v[2] ) ); + +/* + t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 ); + t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 ); + t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 ); + + inout_v[0] = _mm256_xor_si256( inout_v[0], + _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) ); + inout_v[1] = _mm256_xor_si256( inout_v[1], + _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) ); + inout_v[2] = _mm256_xor_si256( inout_v[2], + _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[0]), inout_v[0] ); + _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[4]), inout_v[1] ); + _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[8]), inout_v[2] ); + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); +*/ + #undef out_v + #undef t_state + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } - _mm256_store_si256( (__m256i*)(&state[0]), - _mm256_xor_si256( state_v[0], - _mm256_add_epi64( in_v[0], - inout_v[0] 

 #elif defined __AVX__

+   for ( i = 0; i < nCols; i++)
+   {
+
    __m128i state_v[6], in_v[6], inout_v[6];
    #define out_v in_v    // reuse register in next code block
@@ -826,28 +970,35 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
                                       _mm_add_epi64( in_v[5],
                                                      inout_v[5] ) ) );
-   #else
-
-    state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
-    state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
-    state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
-    state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
-    state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
-    state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
-    state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
-    state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
-    state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
-    state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
-    state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
-    state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
-   #endif
-
    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);

-    //M[rowOut][col] = M[rowOut][col] XOR rand
-   #if defined __AVX2__
+   #else
+   for ( i = 0; i < nCols; i++)
+   {
+
+    state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
+    state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
+    state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
+    state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
+    state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
+    state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
+    state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
+    state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
+    state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
+    state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
+    state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
+    state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
+
+    //Applies the reduced-round transformation f to the sponge's state
+    reducedBlake2bLyra(state);
+#endif
+
+    //M[rowOut][col] = M[rowOut][col] XOR rand
+
+   #if defined __AVX2__
+/*
    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    out_v  [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
@@ -861,7 +1012,7 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
                         _mm256_xor_si256( state_v[1], out_v[1] ) );
    _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                         _mm256_xor_si256( state_v[2], out_v[2] ) );
-
+*/
  #elif defined __AVX__

    state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
@@ -891,22 +1042,39 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
    _mm_store_si128( (__m128i*)(&ptrWordOut[10]),
                     _mm_xor_si128( state_v[5], out_v[5] ) );

-   #else
+   //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+   ptrWordInOut[0]  ^= state[11];
+   ptrWordInOut[1]  ^= state[0];
+   ptrWordInOut[2]  ^= state[1];
+   ptrWordInOut[3]  ^= state[2];
+   ptrWordInOut[4]  ^= state[3];
+   ptrWordInOut[5]  ^= state[4];
+   ptrWordInOut[6]  ^= state[5];
+   ptrWordInOut[7]  ^= state[6];
+   ptrWordInOut[8]  ^= state[7];
+   ptrWordInOut[9]  ^= state[8];
+   ptrWordInOut[10] ^= state[9];
+   ptrWordInOut[11] ^= state[10];

-    ptrWordOut[0] ^= state[0];
-    ptrWordOut[1] ^= state[1];
-    ptrWordOut[2] ^= state[2];
-    ptrWordOut[3] ^= state[3];
-    ptrWordOut[4] ^= state[4];
-    ptrWordOut[5] ^= state[5];
-    ptrWordOut[6] ^= state[6];
-    ptrWordOut[7] ^= state[7];
-    ptrWordOut[8] ^= state[8];
-    ptrWordOut[9] ^= state[9];
-    ptrWordOut[10] ^= state[10];
-    ptrWordOut[11] ^= state[11];
+
+   //Goes to next block
+   ptrWordOut   += BLOCK_LEN_INT64;
+   ptrWordInOut += BLOCK_LEN_INT64;
+   ptrWordIn    += BLOCK_LEN_INT64;
+   }

-   #endif
+#else
+   ptrWordOut[0]  ^= state[0];
+   ptrWordOut[1]  ^= state[1];
+   ptrWordOut[2]  ^= state[2];
+   ptrWordOut[3]  ^= state[3];
+   ptrWordOut[4]  ^= state[4];
+   ptrWordOut[5]  ^= state[5];
+   ptrWordOut[6]  ^= state[6];
+   ptrWordOut[7]  ^= state[7];
+   ptrWordOut[8]  ^= state[8];
+   ptrWordOut[9]  ^= state[9];
+   ptrWordOut[10] ^= state[10];
+   ptrWordOut[11] ^= state[11];

    //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
    ptrWordInOut[0] ^= state[11];
@@ -922,24 +1090,10 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

-    //Goes to next block
-    ptrWordOut += BLOCK_LEN_INT64;
-    ptrWordInOut += BLOCK_LEN_INT64;
-    ptrWordIn += BLOCK_LEN_INT64;
-  }
+   //Goes to next block
+   ptrWordOut   += BLOCK_LEN_INT64;
+   ptrWordInOut += BLOCK_LEN_INT64;
+   ptrWordIn    += BLOCK_LEN_INT64;
+   }
+#endif
 }
-
-/**
- * Prints an array of unsigned chars
- */
-void printArray(unsigned char *array, unsigned int size, char *name)
-{
-       unsigned int i;
-       printf("%s: ", name);
-       for (i = 0; i < size; i++) {
-               printf("%2x|", array[i]);
-       }
-       printf("\n");
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////
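For reference, the per-column work that the AVX2, AVX and scalar paths above all implement is the reduced duplexing step from the Lyra2 reference code. A scalar sketch of one column, as a hypothetical helper that is not part of the patch (BLOCK_LEN_INT64 is 12 words in the real code):

    // One column of reducedDuplexRow, scalar form (illustrative only).
    // state: 16-word sponge state; in/inout/out: 12-word row blocks.
    static void duplex_column( uint64_t state[16], const uint64_t *ptrWordIn,
                               uint64_t *ptrWordInOut, uint64_t *ptrWordOut )
    {
       int j;
       for ( j = 0; j < 12; j++ )              // absorb M[prev] [+] M[row*]
          state[j] ^= ptrWordIn[j] + ptrWordInOut[j];
       reducedBlake2bLyra( state );            // one round of Blake2b's F
       for ( j = 0; j < 12; j++ )              // M[rowOut] ^= rand
          ptrWordOut[j] ^= state[j];
       for ( j = 0; j < 12; j++ )              // M[rowInOut] ^= rotW(rand)
          ptrWordInOut[j] ^= state[ (j + 11) % 12 ];
    }

The rotW feedback is why every path ends with the inout words XORed against state[11], state[0], ..., state[10] in that rotated order.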
diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h
index 5b38dc7..8f6cb4c 100644
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -23,42 +23,34 @@
 #define SPONGE_H_

 #include <stdint.h>
+#include "avxdefs.h"

-/* Blake2b IV Array */
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+
+/*Blake2b IV Array*/
 static const uint64_t blake2b_IV[8] =
 {
-    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
-    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
-    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
-    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };

-/* Blake2b's rotation */
-static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
-#ifdef _MSC_VER
-    return _rotr64(w, c);
-#else
-    return ( w >> c ) | ( w << ( 64 - c ) );
-#endif
+/*Blake2b's rotation*/
+static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
+   return ( w >> c ) | ( w << ( 64 - c ) );
 }

 #if defined __AVX2__
 // only available with avx2

-// rotate each uint64 c bits
-// returns _m256i
-#define mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \
-                                           _mm256_slli_epi64(w, 64 - c))
-
-// Rotate 4 uint64 (256 bits) by one uint64 (64 bits)
-// returns __m256i
-#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 )
-#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 )
-
-// swap hi and lo 128 bits in 256 bit vector
-// returns _m256i
-#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 )
-
 // init vectors from memory
 // returns void, updates defines and inits implicit args a, b, c, d
 #define LYRA_INIT_AVX2 \
@@ -88,15 +80,29 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
    c = _mm256_add_epi64( c, d ); \
    b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 );

-#define LYRA_ROUND_AVX2 \
-   G_4X64( a[0], a[1], a[2], a[3] ); \
-   a[1] = mm256_rotl256_1x64( a[1] ); \
-   a[2] = mm256_swap128( a[2] ); \
-   a[3] = mm256_rotr256_1x64( a[3] ); \
-   G_4X64( a[0], a[1], a[2], a[3] ); \
-   a[1] = mm256_rotr256_1x64( a[1] ); \
-   a[2] = mm256_swap128( a[2] ); \
-   a[3] = mm256_rotl256_1x64( a[3] );
+#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   G_4X64( s0, s1, s2, s3 ); \
+   s1 = mm256_rotl256_1x64( s1 ); \
+   s2 = mm256_swap128( s2 ); \
+   s3 = mm256_rotr256_1x64( s3 ); \
+   G_4X64( s0, s1, s2, s3 ); \
+   s1 = mm256_rotr256_1x64( s1 ); \
+   s2 = mm256_swap128( s2 ); \
+   s3 = mm256_rotl256_1x64( s3 );
+
+#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 )

 #else
 // only available with avx
@@ -148,6 +154,7 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
 #endif // AVX2

+/*
 #if defined __AVX__
 // can coexist with AVX2
@@ -161,7 +168,7 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
 #define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
                               s1 = _mm_xor_si128(s0, s1); \
                               s0 = _mm_xor_si128(s0, s1);
-
+
 // swap uint64 in 128 bit source vector, equivalent of rotating 128 bits by
 // 64 bits (8 bytes)
 // __m128i
@@ -193,30 +200,33 @@
 } while(0)

 #endif // AVX
+*/

-/* Blake2b's G function */
-#define G(r,i,a,b,c,d) do { \
-   a = a + b; \
-   d = rotr64(d ^ a, 32); \
-   c = c + d; \
-   b = rotr64(b ^ c, 24); \
-   a = a + b; \
-   d = rotr64(d ^ a, 16); \
-   c = c + d; \
-   b = rotr64(b ^ c, 63); \
+// Scalar
+//Blake2b's G function
+#define G(r,i,a,b,c,d) \
+  do { \
+   a = a + b; \
+   d = rotr64(d ^ a, 32); \
+   c = c + d; \
+   b = rotr64(b ^ c, 24); \
+   a = a + b; \
+   d = rotr64(d ^ a, 16); \
+   c = c + d; \
+   b = rotr64(b ^ c, 63); \
  } while(0)

 /*One Round of the Blake2b's compression function*/
-#define ROUND_LYRA(r) \
-    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
-    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
-    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
-    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
-    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
-    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
-    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
-    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
+#define ROUND_LYRA(r) \
+   G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+   G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+   G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+   G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+   G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+   G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+   G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+   G(r,7,v[ 3],v[ 4],v[ 9],v[14]);

 //---- Housekeeping
@@ -224,18 +234,31 @@ void initState(uint64_t state[/*16*/]);

 //---- Squeezes
 void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
-void reducedSqueezeRow0(uint64_t* state, uint64_t* row, const uint32_t nCols);
+void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);

 //---- Absorbs
 void absorbBlock(uint64_t *state, const uint64_t *in);
 void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);

 //---- Duplexes
-void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols);
-void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols);
-void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols);
+void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

 //---- Misc
 void printArray(unsigned char *array, unsigned int size, char *name);

+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////TESTS////
+//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
+//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+/////////////
+
+
 #endif /* SPONGE_H_ */
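A note on the parameterized round macros: passing the four 256-bit lanes explicitly (rather than the implicit a[] array the old LYRA_ROUND_AVX2 assumed) lets a caller keep the whole sponge in registers across the permutation instead of bouncing it through memory. A usage sketch under that assumption; the load/store framing here is illustrative, only the macros come from the header above:

    // Run the full 12-round Blake2b permutation on a register-resident state.
    uint64_t state[16] __attribute__ ((aligned (32)));
    __m256i s0 = _mm256_load_si256( (__m256i*)&state[ 0] );
    __m256i s1 = _mm256_load_si256( (__m256i*)&state[ 4] );
    __m256i s2 = _mm256_load_si256( (__m256i*)&state[ 8] );
    __m256i s3 = _mm256_load_si256( (__m256i*)&state[12] );
    LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 );
    _mm256_store_si256( (__m256i*)&state[ 0], s0 );
    _mm256_store_si256( (__m256i*)&state[ 4], s1 );
    _mm256_store_si256( (__m256i*)&state[ 8], s2 );
    _mm256_store_si256( (__m256i*)&state[12], s3 );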
diff --git a/algo/lyra2/zcoin.c b/algo/lyra2/zcoin.c
new file mode 100644
index 0000000..2c7a747
--- /dev/null
+++ b/algo/lyra2/zcoin.c
@@ -0,0 +1,77 @@
+#include <memory.h>
+#include "miner.h"
+#include "algo-gate-api.h"
+#include "lyra2.h"
+
+void zcoin_hash(void *state, const void *input, uint32_t height)
+{
+        uint32_t _ALIGN(256) hash[16];
+
+        LYRA2Z(hash, 32, input, 80, input, 80, 2, height, 256);
+
+        memcpy(state, hash, 32);
+}
+
+//int scanhash_zcoin(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t height)
+int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done )
+{
+        uint32_t _ALIGN(128) hash[8];
+        uint32_t _ALIGN(128) endiandata[20];
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+        const uint32_t Htarg = ptarget[7];
+        const uint32_t first_nonce = pdata[19];
+        uint32_t nonce = first_nonce;
+
+        if (opt_benchmark)
+                ptarget[7] = 0x0000ff;
+
+        for (int i=0; i < 19; i++) {
+                be32enc(&endiandata[i], pdata[i]);
+        }
+
+        do {
+                be32enc(&endiandata[19], nonce);
+                zcoin_hash( hash, endiandata, work->height );
+
+                if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+                        work_set_target_ratio(work, hash);
+                        pdata[19] = nonce;
+                        *hashes_done = pdata[19] - first_nonce;
+                        return 1;
+                }
+                nonce++;
+
+        } while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+        pdata[19] = nonce;
+        *hashes_done = pdata[19] - first_nonce + 1;
+        return 0;
+}
+
+int64_t get_max64_0xffffLL() { return 0xffffLL; }
+
+void zcoin_set_target( struct work* work, double job_diff )
+{
+   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
+{
+   work->height = sctx->bloc_height;
+   return false;
+}
+
+bool register_zcoin_algo( algo_gate_t* gate )
+{
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->scanhash      = (void*)&scanhash_zcoin;
+  gate->hash          = (void*)&zcoin_hash;
+  gate->hash_alt      = (void*)&zcoin_hash;
+  gate->get_max64     = (void*)&get_max64_0xffffLL;
+  gate->set_target    = (void*)&zcoin_set_target;
+  gate->prevent_dupes = (void*)&zcoin_get_work_height;
+  return true;
+}
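Two of these gate hooks are worth a note. zcoin_set_target scales the stratum job difficulty by 256.0 rather than the 65536.0 the scrypt-family target used (which is why the per-algo copies of that function are deleted below), and prevent_dupes does not actually prevent anything here: it is repurposed as a per-loop callback that copies the stratum block height into the work, because zcoin_hash needs the height as a Lyra2 cost parameter. A rough paraphrase of the resulting call sequence in the miner loop; this is illustrative, not the literal cpu-miner.c code:

    // Paraphrased miner-thread flow for lyra2z (illustrative only).
    algo_gate.set_target( &work, stratum.job.diff );          // 256.0 scaling
    if ( algo_gate.prevent_dupes( &work, &stratum, thr_id ) ) // stores height,
       continue;                                              // returns false
    algo_gate.scanhash( thr_id, &work, max_nonce, &hashes_done );
    // ...scanhash_zcoin then passes work->height into zcoin_hash per nonce.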
diff --git a/algo/pluck.c b/algo/pluck.c
index 30224d9..b8de21e 100644
--- a/algo/pluck.c
+++ b/algo/pluck.c
@@ -487,24 +487,19 @@ int64_t pluck_get_max64 ()
   return 0x1ffLL;
 }

-void pluck_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
-bool pluck_miner_thread_init( )
+bool pluck_miner_thread_init( int thr_id )
 {
   scratchbuf = malloc( 128 * 1024 );
   if ( scratchbuf )
      return true;
-  applog( LOG_ERR, "Pluck buffer allocation failed");
+  applog( LOG_ERR, "Thread %u: Pluck buffer allocation failed", thr_id );
   return false;
 }

 bool register_pluck_algo( algo_gate_t* gate )
 {
   algo_not_tested();
-  gate->miner_thread_init = (void*)& pluck_miner_thread_init;
+  gate->miner_thread_init = (void*)&pluck_miner_thread_init;
   gate->scanhash   = (void*)&scanhash_pluck;
   gate->hash       = (void*)&pluck_hash;
   gate->set_target = (void*)&scrypt_set_target;
diff --git a/algo/scrypt.c b/algo/scrypt.c
index 31c6c09..fedc9a7 100644
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -774,12 +774,12 @@ int64_t scrypt_get_max64()
   return max64;
 }

-bool scrypt_miner_thread_init()
+bool scrypt_miner_thread_init( int thr_id )
 {
   scratchbuf = scrypt_buffer_alloc( opt_scrypt_n );
   if ( scratchbuf )
      return true;
-  applog( LOG_ERR, "Scrypt buffer allocation failed" );
+  applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id );
   return false;
 }
diff --git a/algo/scryptjane/scrypt-jane.c b/algo/scryptjane/scrypt-jane.c
index 8b87134..bcab680 100644
--- a/algo/scryptjane/scrypt-jane.c
+++ b/algo/scryptjane/scrypt-jane.c
@@ -231,11 +231,6 @@ void scryptjanehash(void *output, const void *input )
   scrypt_free(&YX);
 }

-void scryptjane_set_target( struct work* work, double job_diff )
-{
-  work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
 bool register_scryptjane_algo( algo_gate_t* gate )
 {
   gate->scanhash = (void*)&scanhash_scryptjane;
diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c
index d6d1f44..a175070 100644
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -49,19 +49,11 @@ int64_t yescrypt_get_max64 ()
   return 0x1ffLL;
 }

-
-void yescrypt_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
-
 bool register_yescrypt_algo ( algo_gate_t* gate )
 {
   gate->scanhash   = (void*)&scanhash_yescrypt;
   gate->hash       = (void*)&yescrypt_hash;
   gate->hash_alt   = (void*)&yescrypthash;
-//  gate->set_target = (void*)&yescrypt_set_target;
   gate->set_target = (void*)&scrypt_set_target;
   gate->get_max64  = (void*)&yescrypt_get_max64;
   return true;
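The miner_thread_init hook now receives the thread id, which the per-thread allocators above use for clearer diagnostics. The pattern for a new algo would look like the following sketch; the names are hypothetical and not part of the patch:

    static __thread char *myalgo_scratchbuf = NULL;   // hypothetical buffer

    bool myalgo_miner_thread_init( int thr_id )
    {
       myalgo_scratchbuf = malloc( 64 * 1024 );
       if ( myalgo_scratchbuf )
          return true;
       applog( LOG_ERR, "Thread %d: myalgo buffer allocation failed", thr_id );
       return false;   // miner_thread exits when init fails
    }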
diff --git a/avxdefs.h b/avxdefs.h
index d3ad5e2..09b077f 100644
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -1,7 +1,7 @@
 // Some tools to help using AVX and AVX2

 #include <stdint.h>
-//#include <immintrin.h>
+#include <immintrin.h>

 // Use these overlays to access the same data in memory as different types
 //
@@ -15,7 +15,7 @@
 typedef union
 {
-#if defined __AVX__
+#if defined __AVX2__
  __m256i v256;
 #endif
  __m128i v128[ 2];
@@ -49,47 +49,85 @@
 uint8_t v8 [16];

 #if defined __AVX2__

-// Rotate bits in 4 uint64
+// Rotate bits in 4 uint64 (3 instructions)
 // __m256i mm256_rotr_64( __256i, int )
-#define mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \
-                                           _mm256_slli_epi64(w, 64 - c))
-//static inline __m256i mm256_rotr_64 ( __m256i w, int c)
-//{
-//   return _mm256_or_si256( _mm256_srli_epi64( w, c ),
-//                           _mm256_slli_epi64( w, 64 - c ) );
-//}

-// Rotate uint64 by one uint64
-//__m256i mm256_rotl256_1x64( _mm256i, int )
-#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 )
-#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 )
-//static inline __m256i mm256_rotl256_1x64( __m256i s )
-//{
-//   return _mm256_permute4x64_epi64( s, 0x39 );
-//}
-//static inline __m256i mm256_rotr256_1x64( __m256i s )
-//{
-//   return _mm256_permute4x64_epi64( s, 0x93 );
-//}
+#define mm256_rotr_64( w, c ) \
+   _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )
+
+#define mm256_rotl_64( w, c ) \
+   _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )

 // swap hi and lo 128 bits in 256 bit vector
 // __m256i mm256_swap128( __m256i )
-#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 )
-//static inline __m256i mm256_swap128( __m256i s )
-//{
-//   return _mm256_permute2f128_si256( s, s, 1 );
-//}
+#define mm256_swap128( w ) \
+   _mm256_permute2f128_si256( w, w, 1 )

-#endif
+// Rotate 256 bits by 64 bits (4 uint64 by one uint64)
+//__m256i mm256_rotl256_1x64( _mm256i, int )
+#define mm256_rotl256_1x64( w ) \
+   _mm256_permute4x64_epi64( w, 0x39 )
+
+#define mm256_rotr256_1x64( w ) \
+   _mm256_permute4x64_epi64( w, 0x93 )
+
+// shift 256 bits by n*64 bits (4 uint64 by n uint64)
+#define mm256_slli256_1x64( w ) \
+   _mm256_and_si256( mm256_rotl256_1x64( w ), \
+                     _mm256_set_epi64x( 0, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull ) )
+//                   _mm256_set_epi64x( 0xffffffffffffffffull, \
+//                                      0xffffffffffffffffull, \
+//                                      0xffffffffffffffffull, \
+//                                      0 ) )
+
+#define mm256_slli256_2x64( w ) \
+   _mm256_and_si256( mm256_swap128( w ), \
+                     _mm256_set_epi64x( 0xffffffffffffffffull, \
+                                        0xffffffffffffffffull, \
+                                        0, \
+                                        0 ) )
+
+#define mm256_slli256_3x64( w ) \
+   _mm256_and_si256( mm256_rotr256_1x64( w ), \
+                     _mm256_set_epi64x( 0xffffffffffffffffull, \
+                                        0, \
+                                        0, \
+                                        0 ) )
+
+#define mm256_srli256_1x64( w ) \
+   _mm256_and_si256( mm256_rotr256_1x64( w ), \
+                     _mm256_set_epi64x( 0, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull ) )
+
+#define mm256_srli256_2x64( w ) \
+   _mm256_and_si256( mm256_swap128( w ), \
+                     _mm256_set_epi64x( 0, \
+                                        0, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull ))
+
+#define mm256_srli256_3x64( w ) \
+   _mm256_and_si256( mm256_rotl256_1x64( w ), \
+                     _mm256_set_epi64x( 0xffffffffffffffffull, \
+                                        0, \
+                                        0, \
+                                        0 ) )
+//                   _mm256_set_epi64x( 0, \
+//                                      0, \
+//                                      0, \
+//                                      0xffffffffffffffffull ) )
+
+#endif  // AVX2

 // rotate bits in 2 uint64
 // _m128i mm_rotr_64( __m128i, int )
 #define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
                                      _mm_slli_epi64(w, 64 - c))
-//static inline __m128i mm_rotr_64( __m128i w, int c )
-//{
-//   _mm_or_si128( _mm_srli_epi64( w, c ),
-//                 _mm_slli_epi64( w, 64 - c ) );
-//}

 // swap 128 bit source vectors
 // void mm128_swap128( __m128i, __m128i )
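Since the rewritten helpers are plain macros over single intrinsics, they are easy to sanity-check in isolation. A throwaway harness, mine and not part of the patch (build with -mavx2); the macro body is copied from avxdefs.h above:

    #include <stdio.h>
    #include <stdint.h>
    #include <immintrin.h>

    // Same definition as mm256_rotr_64 in avxdefs.h above.
    #define mm256_rotr_64( w, c ) \
       _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )

    int main()
    {
       uint64_t in[4] __attribute__ ((aligned (32))) = { 2, 4, 8, 16 };
       uint64_t out[4] __attribute__ ((aligned (32)));
       __m256i v = _mm256_load_si256( (__m256i*)in );
       _mm256_store_si256( (__m256i*)out, mm256_rotr_64( v, 1 ) );
       // each 64-bit lane rotated right by one bit: expect 1 2 4 8
       printf( "%llu %llu %llu %llu\n",
               (unsigned long long)out[0], (unsigned long long)out[1],
               (unsigned long long)out[2], (unsigned long long)out[3] );
       return 0;
    }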
diff --git a/configure.ac b/configure.ac
index a3f77eb..52b1278 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.4.8-dev])
+AC_INIT([cpuminer-opt], [3.4.9-dev])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index f055c25..9a057a4 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -79,7 +79,7 @@ bool opt_debug_diff = false;
 bool opt_protocol = false;
 bool opt_benchmark = false;
 bool opt_redirect = true;
-bool opt_showdiff = false;
+bool opt_showdiff = true;
 bool opt_extranonce = true;
 bool want_longpoll = true;
 bool have_longpoll = false;
@@ -306,8 +306,8 @@ static bool work_decode(const json_t *val, struct work *work)
 {
    if ( !algo_gate.work_decode( val, work ) )
       return false;
-   if ((opt_showdiff || opt_max_diff > 0.) && !allow_mininginfo)
-        algo_gate.calc_network_diff( work );
+   if ( !allow_mininginfo )
+        net_diff = algo_gate.calc_network_diff( work );
    work->targetdiff = target_to_diff(work->target);
    // for api stats, on longpoll pools
    stratum_diff = work->targetdiff;
@@ -781,10 +781,6 @@ static int share_result( int result, struct work *work, const char *reason )
                  (uint32_t)cpu_temp(0) );
 #endif

-//   applog(LOG_NOTICE, "accepted: %lu/%lu (%s%%), %s %sH, %s %sH/s %s",
-//       accepted_count, total_submits, accepted_rate_s,
-//       hc, hc_units, hr, hr_units, sres );
-
    if (reason)
    {
       applog(LOG_WARNING, "reject reason: %s", reason);
@@ -1467,7 +1463,7 @@ void std_build_extraheader( struct work* work, struct stratum_ctx* sctx )
    work->data[31] = 0x00000280;
 }

-void std_calc_network_diff( struct work* work )
+double std_calc_network_diff( struct work* work )
 {
    // sample for diff 43.281 : 1c05ea29
    // todo: endian reversed on longpoll could be zr5 specific...
@@ -1477,11 +1473,14 @@ void std_calc_network_diff( struct work* work )
    uint32_t bits = ( nbits & 0xffffff );
    int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
    int m;
-   net_diff = (double)0x0000ffff / (double)bits;
+   double d = (double)0x0000ffff / (double)bits;
    for ( m = shift; m < 29; m++ )
-       net_diff *= 256.0;
+       d *= 256.0;
    for ( m = 29; m < shift; m++ )
-       net_diff /= 256.0;
+       d /= 256.0;
+   if ( opt_debug_diff )
+      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
+   return d;
 }

 uint32_t* std_get_nonceptr( uint32_t *work_data )
@@ -1501,7 +1500,8 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
    uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );

    if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
-     && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) )
+     && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
+        || ( work->job_id != g_work->job_id ) ) )
    {
      work_free( work );
      work_copy( work, g_work );
@@ -1518,6 +1518,7 @@ void jr2_get_new_work( struct work* work, struct work* g_work, int thr_id,
                        uint32_t *end_nonce_ptr )
 {
    uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
+   // byte data[ 0..38, 43..75 ], skip over misaligned nonce [39..42]
    if ( memcmp( work->data, g_work->data, algo_gate.nonce_index )
      || memcmp( ((uint8_t*) work->data) + JR2_WORK_CMP_INDEX_2,
@@ -1611,9 +1612,9 @@ static void *miner_thread( void *userdata )
       }
    }

-   if ( !algo_gate.miner_thread_init() )
+   if ( !algo_gate.miner_thread_init( thr_id ) )
    {
-      applog( LOG_ERR, "FAIL: thread %u failed to initialize");
+      applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id );
      exit (1);
    }
@@ -1660,6 +1661,8 @@ static void *miner_thread( void *userdata )
      }  // do_this_thread
      algo_gate.resync_threads( &work );

+     // prevent dupes is called on every loop and has useful args so it
+     // is being used by zcoin to pass along the work height.
      if ( algo_gate.prevent_dupes( &work, &stratum, thr_id ) )
         continue;
      // prevent scans before a job is received
@@ -2075,8 +2078,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
    algo_gate.build_extraheader( g_work, sctx );

-   if ( opt_showdiff || opt_max_diff > 0. )
-      algo_gate.calc_network_diff( g_work );
+   net_diff = algo_gate.calc_network_diff( g_work );
    algo_gate.set_work_data_endian( g_work );
    pthread_mutex_unlock( &sctx->work_lock );
@@ -2161,6 +2163,7 @@ static void *stratum_thread(void *userdata )
            sleep(opt_fail_pause);
        }

+
        if (jsonrpc_2)
        {
            work_free(&g_work);
@@ -2168,15 +2171,15 @@ static void *stratum_thread(void *userdata )
        }
    }

-       if (stratum.job.job_id &&
-           (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
+   if (stratum.job.job_id &&
+       (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
    {
        pthread_mutex_lock(&g_work_lock);
        algo_gate.stratum_gen_work( &stratum, &g_work );
        time(&g_work_time);
        pthread_mutex_unlock(&g_work_lock);

-       if (stratum.job.clean || jsonrpc_2)
+       if (stratum.job.clean || jsonrpc_2)
        {
            static uint32_t last_bloc_height;
            if (!opt_quiet && last_bloc_height != stratum.bloc_height)
@@ -2554,7 +2557,7 @@ void parse_arg(int key, char *arg )
        opt_extranonce = false;
        break;
     case 1013:
-       opt_showdiff = true;
+       opt_showdiff = false;
        break;
     case 1016:            /* --coinbase-addr */
        pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg);
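Returning the value from calc_network_diff, instead of having every implementation write the net_diff global, also makes the conversion testable on its own. The following is the same math as std_calc_network_diff, minus the globals and the byte-order handling, taking nbits in the conventional big-endian 0xEEMMMMMM compact form (standalone harness, mine, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    static double nbits_to_diff( uint32_t nbits )
    {
       uint32_t bits  = nbits & 0xffffff;    // mantissa
       int      shift = nbits >> 24;         // exponent, 0x1c = 28 on mainnet
       double   d     = (double)0x0000ffff / (double)bits;
       int m;
       for ( m = shift; m < 29; m++ ) d *= 256.0;
       for ( m = 29; m < shift; m++ ) d /= 256.0;
       return d;
    }

    int main()
    {
       // sample from the code comment: 0x1c05ea29 -> diff 43.281...
       printf( "%f\n", nbits_to_diff( 0x1c05ea29 ) );
       return 0;
    }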
diff --git a/m4/placeholder-to-prevent-git-from-deleting-this-directory b/m4/placeholder-to-prevent-git-from-deleting-this-directory
deleted file mode 100644
index e69de29..0000000
diff --git a/miner.h b/miner.h
index 17ed0bd..b619ef2 100644
--- a/miner.h
+++ b/miner.h
@@ -492,6 +492,7 @@ enum algos {
        ALGO_LUFFA,
        ALGO_LYRA2RE,
        ALGO_LYRA2REV2,
+       ALGO_LYRA2Z,
        ALGO_M7M,
        ALGO_MYR_GR,
        ALGO_NEOSCRYPT,
@@ -546,6 +547,7 @@ static const char *algo_names[] = {
        "luffa",
        "lyra2re",
        "lyra2rev2",
+       "lyra2z",
        "m7m",
        "myr-gr",
        "neoscrypt",
@@ -654,6 +656,7 @@ Options:\n\
                           luffa         Luffa\n\
                           lyra2re       lyra2\n\
                           lyra2rev2     lyrav2\n\
+                          lyra2z        Zcoin (XZC)\n\
                           m7m           Magi (XMG)\n\
                           myr-gr        Myriad-Groestl\n\
                           neoscrypt     NeoScrypt(128, 2, 1)\n\
@@ -700,6 +703,7 @@ Options:\n\
      --randomize       Randomize scan range start to reduce duplicates\n\
  -f, --diff-factor     Divide req. difficulty by this factor (std is 1.0)\n\
  -m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\
+     --hide-diff       Do not display changes in difficulty\n\
  -n, --nfactor         neoscrypt N-Factor\n\
      --coinbase-addr=ADDR  payout address for solo mining\n\
      --coinbase-sig=TEXT   data to insert in the coinbase when possible\n\
@@ -763,6 +767,7 @@ static struct option const options[] = {
        { "diff-factor", 1, NULL, 'f' },
        { "diff", 1, NULL, 'f' }, // deprecated (alias)
        { "diff-multiplier", 1, NULL, 'm' },
+       { "hide-diff", 0, NULL, 1013 },
        { "help", 0, NULL, 'h' },
        { "nfactor", 1, NULL, 'n' },
        { "no-gbt", 0, NULL, 1011 },
@@ -783,7 +788,6 @@ static struct option const options[] = {
        { "retry-pause", 1, NULL, 'R' },
        { "randomize", 0, NULL, 1024 },
        { "scantime", 1, NULL, 's' },
-       { "show-diff", 0, NULL, 1013 },
 #ifdef HAVE_SYSLOG_H
        { "syslog", 0, NULL, 'S' },
 #endif
diff --git a/util.c b/util.c
index a70abc6..07cab73 100644
--- a/util.c
+++ b/util.c
@@ -1630,7 +1630,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
            hashrate += thr_hashrates[i];
        pthread_mutex_unlock(&stats_lock);
        double diff = trunc( ( ((double)0xffffffff) / target ) );
-       if (!opt_quiet)
+       if ( opt_showdiff )   // xmr pool diff can change a lot...
            applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
        stratum_diff = diff;