v3.5.6

2025-09-17 23:44:27 +00:00 · 2017-02-12 12:43:08 -05:00
parent 1ee41348f4
commit 8efab74183
20 changed files with 1891 additions and 1294 deletions
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -3,6 +3,7 @@
 #include "cryptonight.h"
 #include "miner.h"
 #include "crypto/c_keccak.h"
+#include "avxdefs.h"

 void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
 void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
@@ -147,6 +148,11 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	_mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
    }

+//     cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
+//                                          casti_m128i( ctx.state.k, 2 ) );
+//     cast_m128i( ctx.b ) = _mm_xor_si128( casti_m128i( ctx.state.k, 1 ),
+//                                          casti_m128i( ctx.state.k, 3 ) );
+
     ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
     ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
     ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
@@ -196,9 +202,12 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	  a[1] += lo;
 	}
 	uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
+
+//        cast_m128i( dst ) = cast_m128i( a ); 
 	dst[0] = a[0];
 	dst[1] = a[1];

+//        cast_m128i( a ) = _mm_xor_si128( cast_m128i( a ), cast_m128i( b ) );
 	a[0] ^= b[0];
 	a[1] ^= b[1];
 	b_x = c_x;
--- a/algo/luffa/sse2/luffa_for_sse2.c
+++ b/algo/luffa/sse2/luffa_for_sse2.c
@@ -275,7 +275,7 @@ HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
        CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
    for ( i=0; i<10; i++ ) 
 	state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
-//    memset(state->buffer, 0, sizeof state->buffer );
+    memset(state->buffer, 0, sizeof state->buffer );
    return SUCCESS;
 }

--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -21,6 +21,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
+#include <mm_malloc.h>
 #include "compat.h"
 #include "lyra2.h"
 #include "sponge.h"
@@ -45,10 +46,9 @@
 * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
 */

-// Lyra2RE & Lyra2REv2, nRows must be a power of 2
-int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
-           const void *salt, uint64_t saltlen, uint64_t timeCost,
-           const uint64_t nRows, const uint64_t nCols )
+int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
+               uint64_t pwdlen, const void *salt, uint64_t saltlen,
+               uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -71,26 +71,21 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
   // for Lyra2REv2, nCols = 4, v1 was using 8
   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
-
+/*
   i = (int64_t)ROW_LEN_BYTES * nRows;
-   uint64_t *wholeMatrix = malloc(i);
+   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
   if (wholeMatrix == NULL)
      return -1;

+#if defined (__AVX2__)
+   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
+#elif defined(__AVX__)
+   memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
+#else
   memset(wholeMatrix, 0, i);
-
-   //Allocates pointers to each row of the matrix
-   uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
-   if (memMatrix == NULL)
-      return -1;
-
-   //Places the pointers in the correct positions
+#endif
+*/
   uint64_t *ptrWord = wholeMatrix;
-   for (i = 0; i < nRows; i++)
-   {
-       memMatrix[i] = ptrWord;
-       ptrWord += ROW_LEN_INT64;
-   }

   //=== Getting the password + salt + basil padded with 10*1 ==========//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
@@ -140,31 +135,36 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
-   initState(state);
+
+   initState( state );

   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+   
   ptrWord = wholeMatrix;
   for (i = 0; i < nBlocksInput; i++)
   {
-       absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
   }
-
   //Initializes M[0] and M[1]
-   reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
+   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

-   reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
+   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+                      nCols);

   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

-      reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );

      //updates the value of row* (deterministically picked during Setup))
      rowa = (rowa + step) & (window - 1);
      //update prev: it now points to the last row ever computed
+
      prev = row;
      //updates row: goes to the next row to be computed
      row++;
@@ -190,12 +190,14 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
           //Selects a pseudorandom index row*
           //-----------------------------------------------
           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+
           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
           //-------------------------------------------

           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
-           reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
-
+           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
           //update prev: it now points to the last row ever computed
           prev = row;

@@ -210,22 +212,17 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

   //===================== Wrap-up Phase ===============================//
   //Absorbs the last block of the memory matrix
-   absorbBlock(state, memMatrix[rowa]);
-
+   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
   //Squeezes the key
   squeeze(state, K, (unsigned int) kLen);

   //================== Freeing the memory =============================//
-   free(memMatrix);
-   free(wholeMatrix);
+//   free(wholeMatrix);

   return 0;
 }

-// Zcoin, nRows may be any value
-int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
-            const void *salt, uint64_t saltlen, uint64_t timeCost,
-            uint64_t nRows, uint64_t nCols )
+int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols )
 {
    //========================== Basic variables ============================//
    uint64_t _ALIGN(256) state[16];
@@ -244,33 +241,27 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+/*
+   i = (int64_t)ROW_LEN_BYTES * nRows;
+   uint64_t *wholeMatrix = _mm_malloc( i, 64 );

-    i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
-    uint64_t *wholeMatrix = malloc(i);
-    if (wholeMatrix == NULL) 
+    if (wholeMatrix == NULL)
      return -1;
-    
-    memset(wholeMatrix, 0, i);
-    //Allocates pointers to each row of the matrix
-    uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
-    if (memMatrix == NULL) 
-      return -1;
-    
-    //Places the pointers in the correct positions
-    uint64_t *ptrWord = wholeMatrix;
-    for (i = 0; i < nRows; i++)
-    {
-      memMatrix[i] = ptrWord;
-      ptrWord += ROW_LEN_INT64;
-    }

+#if defined (__AVX2__)
+   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
+#elif defined(__AVX__)
+   memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
+#else
+   memset(wholeMatrix, 0, i);
+#endif
+*/
    //==== Getting the password + salt + basil padded with 10*1 ============//
    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
    //but this ensures that the password copied locally will be overwritten as soon as possible

    //First, we clean enough blocks for the password, salt, basil and padding
-    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) )
-                               / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
    byte *ptrByte = (byte*) wholeMatrix;
    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );

@@ -281,7 +272,6 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
    //Concatenates the salt
    memcpy(ptrByte, salt, saltlen);
    ptrByte += saltlen;
-
    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
    memcpy(ptrByte, &kLen, sizeof (uint64_t));
    ptrByte += sizeof (uint64_t);
@@ -304,11 +294,15 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

    //=================== Initializing the Sponge State ====================//
    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+//        uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
+//        if (state == NULL) {
+//                return -1;
+//        }
    initState( state );

    //============================== Setup Phase =============================//
    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
-    ptrWord = wholeMatrix;
+        uint64_t *ptrWord = wholeMatrix;
    for ( i = 0; i < nBlocksInput; i++ )
    {
      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
@@ -316,31 +310,28 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
    }

    //Initializes M[0] and M[1]
-    reducedSqueezeRow0( state, memMatrix[0], nCols ); //The locally copied password is most likely overwritten here
-    reducedDuplexRow1( state, memMatrix[0], memMatrix[1], nCols );
+        reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
+        reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);

-    do
-    {
-      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
-      reducedDuplexRowSetup( state, memMatrix[prev], memMatrix[rowa],
-                              memMatrix[row], nCols );
+        do {
+                //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+                reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);

-      //updates the value of row* (deterministically picked during Setup))
-      rowa = (rowa + step) & (window - 1);
-      //update prev: it now points to the last row ever computed
-      prev = row;
-      //updates row: goes to the next row to be computed
-      row++;
+                //updates the value of row* (deterministically picked during Setup))
+                rowa = (rowa + step) & (window - 1);
+                //update prev: it now points to the last row ever computed
+                prev = row;
+                //updates row: goes to the next row to be computed
+                row++;

-      //Checks if all rows in the window where visited.
-      if (rowa == 0)
-      {
-        step = window + gap; //changes the step: approximately doubles its value
-        window *= 2; //doubles the size of the re-visitation window
-        gap = -gap; //inverts the modifier to the step
-      }
+                //Checks if all rows in the window where visited.
+                if (rowa == 0) {
+                        step = window + gap; //changes the step: approximately doubles its value
+                        window *= 2; //doubles the size of the re-visitation window
+                        gap = -gap; //inverts the modifier to the step
+                }

-    } while (row < nRows);
+        } while (row < nRows);

    //======================== Wandering Phase =============================//
    row = 0; //Resets the visitation to the first row of the memory matrix
@@ -351,20 +342,19 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
        do {
        //Selects a pseudorandom index row*
        //----------------------------------------------------------------------
-        //rowa = ((unsigned int)state[0]) & (nRows-1);	//(USE THIS IF nRows IS A POWER OF 2)
+        //rowa = ((unsigned int)state[0]) & (nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
        rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
        //-----------------------------------------------------------------

        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
-        reducedDuplexRow( state, memMatrix[prev], memMatrix[rowa],
-                          memMatrix[row], nCols );
+                reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);

        //update prev: it now points to the last row ever computed
        prev = row;

        //updates row: goes to the next row to be computed
        //---------------------------------------------------------------
-        //row = (row + step) & (nRows-1);	//(USE THIS IF nRows IS A POWER OF 2)
+        //row = (row + step) & (nRows-1);       //(USE THIS IF nRows IS A POWER OF 2)
        row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
        //--------------------------------------------------------------------

@@ -373,15 +363,190 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

    //========================= Wrap-up Phase ===============================//
    //Absorbs the last block of the memory matrix
-    absorbBlock( state, memMatrix[rowa] );
+        absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);

    //Squeezes the key
    squeeze( state, K, kLen );

    //====================== Freeing the memory =============================//
-    free( memMatrix );
-    free( wholeMatrix );
-
+//        _mm_free(state);
+//        _mm_free( wholeMatrix );
    return 0;
 }

+/*
+ * Lyra2 for lyra2re: unlike LYRA2REV2 and LYRA2Z, which work on a caller
+ * supplied matrix, this variant allocates, zeroes and releases its own memory
+ * matrix on every call. nRows must be a power of 2 (rows are picked by masking).
+ * Returns 0 on success, -1 if the memory matrix cannot be allocated.
+ */
+int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
+             uint64_t pwdlen, const void *salt, uint64_t saltlen,
+             uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //auxiliary iteration counter
+   int64_t v64; // 64bit var for memcpy
+   //====================================================================/
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+
+   i = (int64_t)ROW_LEN_BYTES * nRows;
+   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
+   if (wholeMatrix == NULL)
+      return -1;
+
+#if defined (__AVX2__)
+   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
+#elif defined(__AVX__)
+   memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
+#else
+   memset(wholeMatrix, 0, i);
+#endif
+
+   uint64_t *ptrWord = wholeMatrix;
+
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible
+
+   //Number of input blocks needed for the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   byte *ptrByte = (byte*) wholeMatrix;
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+
+   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+                       - (saltlen + pwdlen) );
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the basil
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+   //================= Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+   initState( state );
+
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+   ptrWord = wholeMatrix;
+   for (i = 0; i < nBlocksInput; i++)
+   {
+       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+                      nCols);
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+
+      //updates the value of row* (deterministically picked during Setup))
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+       //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+       step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+       do
+       {
+           //Selects a pseudorandom index row*
+           //-----------------------------------------------
+           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //-------------------------------------------
+
+           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+   //Squeezes the key
+   squeeze(state, K, (unsigned int) kLen);
+
+   //================== Freeing the memory =============================//
+   _mm_free(wholeMatrix); //must pair with _mm_malloc above; plain free() is not valid for it
+
+   return 0;
+}
+
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -37,10 +37,20 @@ typedef unsigned char byte;
        #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8)    //Block length, in bytes
 #endif

-int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
-           const void *salt, uint64_t saltlen, uint64_t timeCost,
-           uint64_t nRows, uint64_t nCols );
-int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, 
-           const void *salt, uint64_t saltlen, uint64_t timeCost, 
-           uint64_t nRows, uint64_t nCols );
+#define BLOCK_LEN_M256I (BLOCK_LEN_INT64 / 4 )   //Block length, in 256-bit (4 x 64-bit) units
+#define BLOCK_LEN_M128I (BLOCK_LEN_INT64 / 2 )   //Block length, in 128-bit (2 x 64-bit) units
+// LYRA2RE allocates and releases its own memory matrix on every call.
+int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
+             uint64_t pwdlen, const void *salt, uint64_t saltlen,
+             uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+// LYRA2REV2 and LYRA2Z use the caller's matrix (first argument) and never free it.
+// Rows are addressed at row * nCols * BLOCK_LEN_INT64 words; it must hold nRows rows.
+int LYRA2REV2( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+               uint64_t pwdlen, const void *salt, uint64_t saltlen,
+               uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
+int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+            uint64_t pwdlen, const void *salt, uint64_t saltlen,
+            uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
 #endif /* LYRA2_H_ */
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -7,11 +7,14 @@
 #include "algo/keccak/sph_keccak.h"
 #include "lyra2.h"
 #include "algo-gate-api.h"
+#include "avxdefs.h"

 #ifndef NO_AES_NI
  #include "algo/groestl/aes_ni/hash-groestl256.h"
 #endif

+//__thread uint64_t* lyra2re_wholeMatrix;
+
 typedef struct {
        sph_blake256_context     blake;
        sph_keccak256_context    keccak;
@@ -24,6 +27,7 @@ typedef struct {
 } lyra2re_ctx_holder;

 lyra2re_ctx_holder lyra2re_ctx;
+static __thread sph_blake256_context lyra2_blake_mid;

 void init_lyra2re_ctx()
 {
@@ -37,6 +41,12 @@ void init_lyra2re_ctx()
 #endif
 }

+void lyra2_blake256_midstate( const void* input )
+{   // start from the shared initialised context, then absorb the first 64 bytes
+    lyra2_blake_mid = lyra2re_ctx.blake;
+    sph_blake256( &lyra2_blake_mid, input, 64 );
+}
+
 void lyra2re_hash(void *state, const void *input)
 {
        lyra2re_ctx_holder ctx;
@@ -47,13 +57,19 @@ void lyra2re_hash(void *state, const void *input)
        #define hashA hash
        #define hashB hash+16

-	sph_blake256(&ctx.blake, input, 80);
+        const int midlen = 64;            // bytes
+        const int tail   = 80 - midlen;   // 16
+
+        memcpy( &ctx.blake, &lyra2_blake_mid, sizeof lyra2_blake_mid );
+        sph_blake256( &ctx.blake, input + 64, 16 );
+
+//	sph_blake256(&ctx.blake, input, 80);
 	sph_blake256_close(&ctx.blake, hashA);

 	sph_keccak256(&ctx.keccak, hashA, 32);
 	sph_keccak256_close(&ctx.keccak, hashB);

-	LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+	LYRA2RE( hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);

 	sph_skein256(&ctx.skein, hashA, 32);
 	sph_skein256_close(&ctx.skein, hashB);
@@ -81,6 +97,8 @@ int scanhash_lyra2re(int thr_id, struct work *work,

        swab32_array( endiandata, pdata, 20 );

+        lyra2_blake256_midstate( endiandata );
+
 	do {
 		be32enc(&endiandata[19], nonce);
 		lyra2re_hash(hash, endiandata);
@@ -112,10 +130,34 @@ void lyra2re_set_target ( struct work* work, double job_diff )
   work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
 }

+/*
+bool lyra2re_thread_init()
+{
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
+   lyra2re_wholeMatrix = _mm_malloc( i, 64 );
+
+   if ( lyra2re_wholeMatrix == NULL )
+     return false;
+
+#if defined (__AVX2__)
+   memset_zero_m256i( (__m256i*)lyra2re_wholeMatrix, i/32 );
+#elif defined(__AVX__)
+   memset_zero_m128i( (__m128i*)lyra2re_wholeMatrix, i/16 );
+#else
+   memset( lyra2re_wholeMatrix, 0, i );
+#endif
+   return true;
+}
+*/
+
 bool register_lyra2re_algo( algo_gate_t* gate )
 {
  init_lyra2re_ctx();
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+//  gate->miner_thread_init = (void*)&lyra2re_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2re;
  gate->hash       = (void*)&lyra2re_hash;
  gate->hash_alt   = (void*)&lyra2re_hash;
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -8,10 +8,11 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/bmw/sph_bmw.h"
-
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
-
 #include "lyra2.h"
+#include "avxdefs.h"
+
+__thread uint64_t* l2v2_wholeMatrix;

 typedef struct {
        cubehashParam           cube1;
@@ -23,7 +24,8 @@ typedef struct {

 } lyra2v2_ctx_holder;

-lyra2v2_ctx_holder lyra2v2_ctx;
+static lyra2v2_ctx_holder lyra2v2_ctx;
+static __thread sph_blake256_context l2v2_blake_mid;

 void init_lyra2rev2_ctx()
 {
@@ -35,14 +37,23 @@ void init_lyra2rev2_ctx()
        sph_bmw256_init( &lyra2v2_ctx.bmw );
 }

+void l2v2_blake256_midstate( const void* input )
+{   // start from the shared initialised context, then absorb the first 64 bytes
+    l2v2_blake_mid = lyra2v2_ctx.blake;
+    sph_blake256( &l2v2_blake_mid, input, 64 );
+}
+
 void lyra2rev2_hash( void *state, const void *input )
 {
        lyra2v2_ctx_holder ctx;
        memcpy( &ctx, &lyra2v2_ctx, sizeof(lyra2v2_ctx) );
-
 	uint32_t _ALIGN(128) hashA[8], hashB[8];

-	sph_blake256( &ctx.blake, input, 80 );
+        const int midlen = 64;            // bytes
+        const int tail   = 80 - midlen;   // 16
+
+        memcpy( &ctx.blake, &l2v2_blake_mid, sizeof l2v2_blake_mid );
+	sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
 	sph_blake256_close( &ctx.blake, hashA );

 	sph_keccak256( &ctx.keccak, hashA, 32 );
@@ -50,18 +61,14 @@ void lyra2rev2_hash( void *state, const void *input )

        cubehashUpdateDigest( &ctx.cube1, (byte*) hashA,
                              (const byte*) hashB, 32 );
-//        cubehashUpdate( &ctx.cube1, (const byte*) hashB,32 );
-//        cubehashDigest( &ctx.cube1, (byte*)hashA );

-	LYRA2( hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 );
+	LYRA2REV2( l2v2_wholeMatrix, hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 );

 	sph_skein256( &ctx.skein, hashA, 32 );
 	sph_skein256_close( &ctx.skein, hashB );

        cubehashUpdateDigest( &ctx.cube2, (byte*) hashA, 
                              (const byte*) hashB, 32 );
-//        cubehashUpdate( &ctx.cube2, (const byte*) hashB,32 );
-//        cubehashDigest( &ctx.cube2, (byte*)hashA );

 	sph_bmw256( &ctx.bmw, hashA, 32 );
 	sph_bmw256_close( &ctx.bmw, hashB );
@@ -85,6 +92,8 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,

        swab32_array( endiandata, pdata, 20 );

+        l2v2_blake256_midstate( endiandata );
+
 	do {
 		be32enc(&endiandata[19], nonce);
 		lyra2rev2_hash(hash, endiandata);
@@ -112,10 +121,33 @@ void lyra2rev2_set_target( struct work* work, double job_diff )
 work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
 }

+
+bool lyra2rev2_thread_init()
+{
+   // Lyra2REv2 is run with a 4 x 4 matrix of BLOCK_LEN_INT64-word blocks.
+   const int64_t row_words = BLOCK_LEN_INT64 * 4;   // words per row (nCols = 4)
+   const int64_t row_bytes = row_words * 8;
+   int matrix_bytes = (int64_t)row_bytes * 4;        // whole matrix (nRows = 4)
+
+   l2v2_wholeMatrix = _mm_malloc( matrix_bytes, 64 );
+   if ( !l2v2_wholeMatrix )
+      return false;
+
+#if defined (__AVX2__)
+   memset_zero_m256i( (__m256i*)l2v2_wholeMatrix, matrix_bytes/32 );
+#elif defined(__AVX__)
+   memset_zero_m128i( (__m128i*)l2v2_wholeMatrix, matrix_bytes/16 );
+#else
+   memset( l2v2_wholeMatrix, 0, matrix_bytes );
+#endif
+   return true;
+}
+
 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
  init_lyra2rev2_ctx();
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2rev2;
  gate->hash       = (void*)&lyra2rev2_hash;
  gate->hash_alt   = (void*)&lyra2rev2_hash;
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -51,24 +51,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #if defined __AVX2__
 // only available with avx2

-// init vectors from memory
-// returns void, updates defines and inits implicit args a, b, c, d
-#define LYRA_INIT_AVX2 \
-   __m256i a[4]; \
-   a[0] = _mm256_load_si256( (__m256i*)(&v[ 0]) ); \
-   a[1] = _mm256_load_si256( (__m256i*)(&v[ 4]) ); \
-   a[2] = _mm256_load_si256( (__m256i*)(&v[ 8]) ); \
-   a[3] = _mm256_load_si256( (__m256i*)(&v[12]) );
-
-// save to memory
-// returns void
-#define LYRA_CLOSE_AVX2 \
-   _mm256_store_si256( (__m256i*)(&v[ 0]), a[0] ); \
-   _mm256_store_si256( (__m256i*)(&v[ 4]), a[1] ); \
-   _mm256_store_si256( (__m256i*)(&v[ 8]), a[2] ); \
-   _mm256_store_si256( (__m256i*)(&v[12]), a[3] );
-
-// process 4 rows in parallel
+// process 4 columns in parallel
 // returns void, updates all args
 #define G_4X64(a,b,c,d) \
   a = _mm256_add_epi64( a, b ); \
@@ -107,28 +90,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #else
 // only available with avx

-#define LYRA_INIT_AVX \
-   __m128i a0[4], a1[4]; \
-   a0[0] = _mm_load_si128( (__m128i*)(&v[ 0]) ); \
-   a1[0] = _mm_load_si128( (__m128i*)(&v[ 2]) ); \
-   a0[1] = _mm_load_si128( (__m128i*)(&v[ 4]) ); \
-   a1[1] = _mm_load_si128( (__m128i*)(&v[ 6]) ); \
-   a0[2] = _mm_load_si128( (__m128i*)(&v[ 8]) ); \
-   a1[2] = _mm_load_si128( (__m128i*)(&v[10]) ); \
-   a0[3] = _mm_load_si128( (__m128i*)(&v[12]) ); \
-   a1[3] = _mm_load_si128( (__m128i*)(&v[14]) );
-
-#define LYRA_CLOSE_AVX \
-   _mm_store_si128( (__m128i*)(&v[ 0]), a0[0] ); \
-   _mm_store_si128( (__m128i*)(&v[ 2]), a1[0] ); \
-   _mm_store_si128( (__m128i*)(&v[ 4]), a0[1] ); \
-   _mm_store_si128( (__m128i*)(&v[ 6]), a1[1] ); \
-   _mm_store_si128( (__m128i*)(&v[ 8]), a0[2] ); \
-   _mm_store_si128( (__m128i*)(&v[10]), a1[2] ); \
-   _mm_store_si128( (__m128i*)(&v[12]), a0[3] ); \
-   _mm_store_si128( (__m128i*)(&v[14]), a1[3] );
-
-// process 2 rows in parallel
+// process 2 columns in parallel
 // returns void, all args updated
 #define G_2X64(a,b,c,d) \
   a = _mm_add_epi64( a, b ); \
@@ -140,68 +102,35 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   c = _mm_add_epi64( c, d ); \
   b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );

-#define LYRA_ROUND_AVX \
-   G_2X64( a0[0], a0[1], a0[2], a0[3] ); \
-   G_2X64( a1[0], a1[1], a1[2], a1[3] ); \
-   mm128_rotl256_1x64( a0[1], a1[1] ); \
-   mm128_swap128( a0[2], a1[2] ); \
-   mm128_rotr256_1x64( a0[3], a1[3] ); \
-   G_2X64( a0[0], a0[1], a0[2], a0[3] ); \
-   G_2X64( a1[0], a1[1], a1[2], a1[3] ); \
-   mm128_rotr256_1x64( a0[1], a1[1] ); \
-   mm128_swap128( a0[2], a1[2] ); \
-   mm128_rotl256_1x64( a0[3], a1[3] );
+#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   G_2X64( s0, s2, s4, s6 ); \
+   G_2X64( s1, s3, s5, s7 ); \
+   mm128_rotl256_1x64( s2, s3 ); \
+   mm128_swap128( s4, s5 ); \
+   mm128_rotr256_1x64( s6, s7 ); \
+   G_2X64( s0, s2, s4, s6 ); \
+   G_2X64( s1, s3, s5, s7 ); \
+   mm128_rotr256_1x64( s2, s3 ); \
+   mm128_swap128( s4, s5 ); \
+   mm128_rotl256_1x64( s6, s7 );
+
+// Twelve consecutive LYRA_ROUND_AVX rounds applied to the same eight
+// arguments. Expands to a plain statement sequence (each round already ends
+// in ';'), so it is not safe as the unbraced body of an if/else.
+// The final line deliberately has no trailing backslash: a continuation
+// there would splice whatever follows the macro into its definition.
+#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7)
+

 #endif // AVX2

-/*
-#if defined __AVX__
-// can coexist with AVX2
-
-// rotate each uint64 c bits
-// _m128i
-#define  mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
-                                      _mm_slli_epi64(w, 64 - c))
-
-// swap 128 bit source vectors, equivalent of rotating 256 bits by 128 bits
-// void
-#define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
-                              s1 = _mm_xor_si128(s0, s1); \
-                              s0 = _mm_xor_si128(s0, s1);
-
-// swap uint64 in 128 bit source vector, equivalent of rotating 128 bits by
-// 64 bits (8 bytes)
-// __m128i
-#define mm128_swap64(s) _mm_or_si128( _mm_slli_si128( s, 8 ), \
-                                      _mm_srli_si128( s, 8 ) )
-
-// rotate 2 128 bit vectors as one 256 vector by 1 uint64, very inefficient
-// returns void, args updated
-#define mm128_rotl256_1x64(s0, s1) do { \
-   __m128i t; \
-   s0 = mm128_swap64( s0); \
-   s1 = mm128_swap64( s1); \
-   t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
-                     _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
-   s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
-                      _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
-   s0 = t; \
-} while(0)
-
-#define mm128_rotr256_1x64(s0, s1) do { \
-   __m128i t; \
-   s0 = mm128_swap64( s0); \
-   s1 = mm128_swap64( s1); \
-   t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
-                        _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
-   s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
-                      _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
-   s0 = t; \
-} while(0)
-
-#endif   // AVX
-*/
-
 // Scalar
 //Blake2b's G function
 #define G(r,i,a,b,c,d) \
--- a/algo/lyra2/zcoin.c
+++ b/algo/lyra2/zcoin.c
@@ -1,20 +1,40 @@
 #include <memory.h>
+#include <mm_malloc.h>
 #include "miner.h"
 #include "algo-gate-api.h"
 #include "lyra2.h"
+#include "algo/blake/sph_blake.h"
+#include "avxdefs.h"

-void zcoin_hash(void *state, const void *input, uint32_t height)
+__thread uint64_t* zcoin_wholeMatrix;
+
+static __thread sph_blake256_context zcoin_blake_mid;
+
+
+void zcoin_midstate( const void* input )
 {
-
-	uint32_t _ALIGN(256) hash[16];
-
-//        LYRA2Z(hash, 32, input, 80, input, 80, 2, height, 256);
-        LYRA2Z(hash, 32, input, 80, input, 80, 2, 8192, 256);
-
-	memcpy(state, hash, 32);
+       sph_blake256_init( &zcoin_blake_mid );
+       sph_blake256( &zcoin_blake_mid, input, 64 );
+}
+
+// block 2050 new algo, blake plus new lyra parms. new input
+// is power of 2 so normal lyra can be used
+//void zcoin_hash(void *state, const void *input, uint32_t height)
+// Hash the 80 bytes at input and write the first 32 bytes of the result to
+// state. Blake256 is resumed from the midstate cached by zcoin_midstate()
+// (which absorbed bytes 0..63) and fed the remaining 16 bytes; the 32-byte
+// digest is then passed through LYRA2Z using zcoin_wholeMatrix.
+// The calling thread must already have run zcoin_midstate() on this input,
+// since zcoin_blake_mid is thread-local.
+void zcoin_hash(void *state, const void *input )
+{
+        uint32_t _ALIGN(256) hash[16];
+        sph_blake256_context ctx_blake;
+
+        memcpy( &ctx_blake, &zcoin_blake_mid, sizeof zcoin_blake_mid );
+        // Offset through a byte pointer: arithmetic on void* is a GNU
+        // extension, not ISO C.
+        sph_blake256( &ctx_blake, (const char*)input + 64, 16 );
+        sph_blake256_close( &ctx_blake, hash );
+
+        LYRA2Z( zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
+
+        memcpy(state, hash, 32);
 }

-//int scanhash_zcoin(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t height)
 int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done )
 {
@@ -25,6 +45,7 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
+
 	if (opt_benchmark)
 		ptarget[7] = 0x0000ff;

@@ -32,9 +53,11 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

+        zcoin_midstate( endiandata );
+
 	do {
 		be32enc(&endiandata[19], nonce);
-		zcoin_hash( hash, endiandata, work->height );
+                zcoin_hash( hash, endiandata );

 		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			work_set_target_ratio(work, hash);
@@ -57,22 +80,45 @@ void zcoin_set_target( struct work* work, double job_diff )
 {
 work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
 }
-
+/*
 bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
 {
   work->height = sctx->bloc_height;
   return false;
 }
+*/
+
+// Allocate and zero this thread's Lyra2Z matrix (zcoin_wholeMatrix is
+// __thread): 8 rows of 8 columns, each column BLOCK_LEN_INT64 64-bit words,
+// 64-byte aligned. Registered as the gate's miner_thread_init hook.
+// Returns false if the allocation fails, true otherwise.
+bool zcoin_thread_init()
+{
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+   // Keep the byte count in size_t instead of narrowing the 64-bit product
+   // to int before handing it to the allocator.
+   const size_t matrix_bytes = (size_t)ROW_LEN_BYTES * 8; // nRows
+   zcoin_wholeMatrix = _mm_malloc( matrix_bytes, 64 );
+
+   if ( zcoin_wholeMatrix == NULL )
+     return false;
+
+   // The vector helpers take a count of vectors, not bytes.
+#if defined (__AVX2__)
+   memset_zero_m256i( (__m256i*)zcoin_wholeMatrix, matrix_bytes/32 );
+#elif defined(__AVX__)
+   memset_zero_m128i( (__m128i*)zcoin_wholeMatrix, matrix_bytes/16 );
+#else
+   memset( zcoin_wholeMatrix, 0, matrix_bytes );
+#endif
+   return true;
+}

 bool register_zcoin_algo( algo_gate_t* gate )
 {
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->miner_thread_init = (void*)&zcoin_thread_init;
  gate->scanhash   = (void*)&scanhash_zcoin;
  gate->hash       = (void*)&zcoin_hash;
  gate->hash_alt   = (void*)&zcoin_hash;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  gate->set_target = (void*)&zcoin_set_target;
-  gate->prevent_dupes = (void*)&zcoin_get_work_height;
+//  gate->prevent_dupes = (void*)&zcoin_get_work_height;
  return true;
 };

--- a/algo/lyra2/zoin.c
+++ b/algo/lyra2/zoin.c
@@ -2,13 +2,15 @@
 #include "miner.h"
 #include "algo-gate-api.h"
 #include "lyra2.h"
+#include "avxdefs.h"
+
+__thread uint64_t* zoin_wholeMatrix;

 void zoin_hash(void *state, const void *input, uint32_t height)
 {
-
 	uint32_t _ALIGN(256) hash[16];

-        LYRA2Z(hash, 32, input, 80, input, 80, 2, 330, 256);
+        LYRA2Z( zoin_wholeMatrix, hash, 32, input, 80, input, 80, 2, 330, 256);

 	memcpy(state, hash, 32);
 }
@@ -53,22 +55,45 @@ void zoin_set_target( struct work* work, double job_diff )
 {
 work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
 }
-
+/*
 bool zoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
 {
   work->height = sctx->bloc_height;
   return false;
 }
+*/
+
+// Allocate and zero this thread's Lyra2Z matrix (zoin_wholeMatrix is
+// __thread): 330 rows of 256 columns, each column BLOCK_LEN_INT64 64-bit
+// words, 64-byte aligned. Registered as the gate's miner_thread_init hook.
+// Returns false if the allocation fails, true otherwise.
+bool zoin_thread_init()
+{
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 256; // nCols
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+   // Keep the byte count in size_t instead of narrowing the 64-bit product
+   // to int; this is the largest of the Lyra2 matrices allocated this way.
+   const size_t matrix_bytes = (size_t)ROW_LEN_BYTES * 330; // nRows
+   zoin_wholeMatrix = _mm_malloc( matrix_bytes, 64 );
+
+   if ( zoin_wholeMatrix == NULL )
+     return false;
+
+   // The vector helpers take a count of vectors, not bytes.
+#if defined (__AVX2__)
+   memset_zero_m256i( (__m256i*)zoin_wholeMatrix, matrix_bytes/32 );
+#elif defined(__AVX__)
+   memset_zero_m128i( (__m128i*)zoin_wholeMatrix, matrix_bytes/16 );
+#else
+   memset( zoin_wholeMatrix, 0, matrix_bytes );
+#endif
+   return true;
+}

 bool register_zoin_algo( algo_gate_t* gate )
 {
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->miner_thread_init = (void*)&zoin_thread_init;
  gate->scanhash   = (void*)&scanhash_zoin;
  gate->hash       = (void*)&zoin_hash;
  gate->hash_alt   = (void*)&zoin_hash;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  gate->set_target = (void*)&zoin_set_target;
-  gate->prevent_dupes = (void*)&zoin_get_work_height;
+//  gate->prevent_dupes = (void*)&zoin_get_work_height;
  return true;
 };

--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -175,13 +175,13 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

    memcpy(data, pdata, 80);

-    sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
-    sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
-    sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
+    sph_sha256( &ctx1.sha256,       data, M7_MIDSTATE_LEN );
+    sph_sha512( &ctx1.sha512,       data, M7_MIDSTATE_LEN );
+    sph_keccak512( &ctx1.keccak,    data, M7_MIDSTATE_LEN );
    sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
-    sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN );
-    sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN );
-    sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN );
+    sph_haval256_5( &ctx1.haval,    data, M7_MIDSTATE_LEN );
+    sph_tiger( &ctx1.tiger,         data, M7_MIDSTATE_LEN );
+    sph_ripemd160( &ctx1.ripemd,    data, M7_MIDSTATE_LEN );

    mpz_t magipi, magisw, product, bns0, bns1;
    mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;
@@ -228,40 +228,11 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );

-/*
-        ctx2_sha256 = ctx_sha256;
-        sph_sha256 (&ctx2_sha256, data_p64, 80 - M7_MIDSTATE_LEN);
-        sph_sha256_close(&ctx2_sha256, (void*)(bhash[0]));
-
-        ctx2_sha512 = ctx_sha512;
-        sph_sha512 (&ctx2_sha512, data_p64, 80 - M7_MIDSTATE_LEN);
-        sph_sha512_close(&ctx2_sha512, (void*)(bhash[1]));
-        
-        ctx2_keccak = ctx_keccak;
-        sph_keccak512 (&ctx2_keccak, data_p64, 80 - M7_MIDSTATE_LEN);
-        sph_keccak512_close(&ctx2_keccak, (void*)(bhash[2]));
-
-        ctx2_whirlpool = ctx_whirlpool;
-        sph_whirlpool (&ctx2_whirlpool, data_p64, 80 - M7_MIDSTATE_LEN);
-        sph_whirlpool_close(&ctx2_whirlpool, (void*)(bhash[3]));
-        
-        ctx2_haval = ctx_haval;
-        sph_haval256_5 (&ctx2_haval, data_p64, 80 - M7_MIDSTATE_LEN);
-        sph_haval256_5_close(&ctx2_haval, (void*)(bhash[4]));
-
-        ctx2_tiger = ctx_tiger;
-        sph_tiger (&ctx2_tiger, data_p64, 80 - M7_MIDSTATE_LEN);
-        sph_tiger_close(&ctx2_tiger, (void*)(bhash[5]));
-
-        ctx2_ripemd = ctx_ripemd;
-        sph_ripemd160 (&ctx2_ripemd, data_p64, 80 - M7_MIDSTATE_LEN);
-        sph_ripemd160_close(&ctx2_ripemd, (void*)(bhash[6]));
-*/
-
 	mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
        mpz_set(bns1, bns0);
 	mpz_set(product, bns0);
-	for(int i=1; i < 7; i++){
+	for ( i=1; i < 7; i++ )
+        {
 	    mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
 	    mpz_add(bns1, bns1, bns0);
            mpz_mul(product, product, bns0);
@@ -275,11 +246,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        sph_sha256( &ctxf_sha256, bdata, bytes );
        sph_sha256_close( &ctxf_sha256, (void*)(hash) );

-/*
-        sph_sha256 (&ctx_final_sha256, bdata, bytes);
-        sph_sha256_close(&ctx_final_sha256, (void*)(hash));
-*/
-
        digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
        mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
 	mpf_set_prec_raw(magifpi, prec);
@@ -291,7 +257,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
 	mpzscale = 1;
        mpz_set_ui(magisw, usw_);
 	    
-        for(i = 0; i < 5; i++)
+        for ( i = 0; i < 5; i++ )
        {	
            mpf_set_d(mpt1, 0.25*mpzscale);
 	    mpf_sub(mpt1, mpt1, mpt2);
@@ -314,23 +280,22 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

            sph_sha256( &ctxf_sha256, bdata, bytes );
            sph_sha256_close( &ctxf_sha256, (void*)(hash) );
-			
-/*
-            sph_sha256 (&ctx_final_sha256, bdata, bytes);
-            sph_sha256_close(&ctx_final_sha256, (void*)(hash));
-*/
 	}

 	const unsigned char *hash_ = (const unsigned char *)hash;
 	const unsigned char *target_ = (const unsigned char *)ptarget;
-	for (i = 31; i >= 0; i--) {
-	      if (hash_[i] != target_[i]) {
+	for ( i = 31; i >= 0; i-- )
+        {
+	      if ( hash_[i] != target_[i] )
+              {
 		rc = hash_[i] < target_[i];
 		break;
 	      }
 	}
-        if (unlikely(rc)) {
-            if (opt_debug) {
+        if ( unlikely(rc) )
+        {
+            if ( opt_debug )
+            {
                bin2hex(hash_str, (unsigned char *)hash, 32);
                bin2hex(target_str, (unsigned char *)ptarget, 32);
                bin2hex(data_str, (unsigned char *)data, 80);
@@ -343,20 +308,22 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
            goto out;
 	  }
    } while (n < max_nonce && !work_restart[thr_id].restart);
+
     pdata[19] = n;
+
 out:
-	mpf_set_prec_raw(magifpi, prec0);
-	mpf_set_prec_raw(magifpi0, prec0);
-	mpf_set_prec_raw(mptmp, prec0);
-	mpf_set_prec_raw(mpt1, prec0);
-	mpf_set_prec_raw(mpt2, prec0);
-	mpf_clear(magifpi);
-	mpf_clear(magifpi0);
-	mpf_clear(mpten);
-	mpf_clear(mptmp);
-	mpf_clear(mpt1);
-	mpf_clear(mpt2);
-	mpz_clears(magipi, magisw, product, bns0, bns1, NULL);
+     mpf_set_prec_raw(magifpi, prec0);
+     mpf_set_prec_raw(magifpi0, prec0);
+     mpf_set_prec_raw(mptmp, prec0);
+     mpf_set_prec_raw(mpt1, prec0);
+     mpf_set_prec_raw(mpt2, prec0);
+     mpf_clear(magifpi);
+     mpf_clear(magifpi0);
+     mpf_clear(mpten);
+     mpf_clear(mptmp);
+     mpf_clear(mpt1);
+     mpf_clear(mpt2);
+     mpz_clears(magipi, magisw, product, bns0, bns1, NULL);

    *hashes_done = n - first_nonce + 1;
    return rc;
--- a/algo/timetravel.c
+++ b/algo/timetravel.c
@@ -5,6 +5,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
+#include "avxdefs.h"

 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
@@ -99,6 +100,7 @@ typedef struct {
 } tt_ctx_holder;

 tt_ctx_holder tt_ctx;
+__thread tt_ctx_holder tt_mid;

 void init_tt_ctx()
 {
@@ -125,6 +127,8 @@ void timetravel_hash(void *output, const void *input)
   tt_ctx_holder ctx;
   memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
   int i;
+   const int midlen = 64;            // bytes
+   const int tail   = 80 - midlen;   // 16

   for ( i = 0; i < HASH_FUNC_COUNT; i++ )
   {
@@ -140,50 +144,129 @@ void timetravel_hash(void *output, const void *input)
 	}
 	hashB = &hash[16 * i];

-	switch ( permutation[i] )
+    switch ( permutation[i] )
+    {
+      case 0:
+//        if ( i == 0 )
+//        {
+//           memcpy( &ctx.blake, &tt_mid.blake, sizeof tt_mid.blake );
+//           sph_blake256( &ctx.blake, input + midlen, tail );
+//           sph_blake256_close( &ctx.blake, hashB );
+//        }
+//        else
+//        {
+          sph_blake512( &ctx.blake, hashA, dataLen );
+          sph_blake512_close( &ctx.blake, hashB );
+//        }
+        break;
+     case 1:
+        if ( i == 0 )
        {
-	   case 0:
-		sph_blake512( &ctx.blake, hashA, dataLen );
-		sph_blake512_close( &ctx.blake, hashB );
-		break;
-	   case 1:
-		sph_bmw512( &ctx.bmw, hashA, dataLen );
-		sph_bmw512_close( &ctx.bmw, hashB );
-		break;
-	   case 2:
+           memcpy( &ctx.bmw, &tt_mid.bmw, sizeof tt_mid.bmw );
+           sph_bmw512( &ctx.bmw, input + midlen, tail );
+           sph_bmw512_close( &ctx.bmw, hashB );
+        }
+        else
+        {          
+           sph_bmw512( &ctx.bmw, hashA, dataLen );
+           sph_bmw512_close( &ctx.bmw, hashB );
+        }
+        break;
+     case 2:
 #ifdef NO_AES_NI
-		sph_groestl512( &ctx.groestl, hashA, dataLen );
-		sph_groestl512_close( &ctx.groestl, hashB );
+        if ( i == 0 )
+        {
+           memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
+           sph_groestl512( &ctx.groestl, input + midlen, tail );
+           sph_groestl512_close( &ctx.groestl, hashB );
+        }
+        else
+        {
+           sph_groestl512( &ctx.groestl, hashA, dataLen );
+           sph_groestl512_close( &ctx.groestl, hashB );
+        }
 #else
-                update_and_final_groestl( &ctx.groestl, (char*)hashB,
-                                          (char*)hashA, dataLen*8 );
-
+        if ( i == 0 )
+        {
+           memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
+           update_and_final_groestl( &ctx.groestl, (char*)hashB,
+                                    (char*)input + midlen, tail*8 );
+        }
+        else
+        {
+           update_and_final_groestl( &ctx.groestl, (char*)hashB,
+                                    (char*)hashA, dataLen*8 );
+        }
 #endif
-		break;
-	   case 3:
-		sph_skein512( &ctx.skein, hashA, dataLen );
-		sph_skein512_close( &ctx.skein, hashB );
-		break;
-	   case 4:
-		sph_jh512( &ctx.jh, hashA, dataLen );
-		sph_jh512_close( &ctx.jh, hashB);
-		break;
-	   case 5:
-		sph_keccak512( &ctx.keccak, hashA, dataLen );
-		sph_keccak512_close( &ctx.keccak, hashB );
-		break;
-	   case 6:
-                update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
-                                        (const BitSequence*)hashA, dataLen );
-		break;
-	   case 7:
-                cubehashUpdateDigest( &ctx.cube, (byte*)hashB,
-                                      (const byte*) hashA, dataLen );
-		break;
-	   default:
-		break;
-	   }
-	}
+        break;
+     case 3:
+        if ( i == 0 )
+        {
+           memcpy( &ctx.skein, &tt_mid.skein, sizeof tt_mid.skein );
+           sph_skein512( &ctx.skein, input + midlen, tail );
+           sph_skein512_close( &ctx.skein, hashB );
+        }
+        else
+        {
+           sph_skein512( &ctx.skein, hashA, dataLen );
+           sph_skein512_close( &ctx.skein, hashB );
+        }
+        break;
+     case 4:
+        if ( i == 0 )
+        {
+           memcpy( &ctx.jh, &tt_mid.jh, sizeof tt_mid.jh );
+           sph_jh512( &ctx.jh, input + midlen, tail );
+           sph_jh512_close( &ctx.jh, hashB );
+        }
+        else
+        {
+           sph_jh512( &ctx.jh, hashA, dataLen );
+           sph_jh512_close( &ctx.jh, hashB);
+        }
+        break;
+     case 5:
+        if ( i == 0 )
+        {
+           memcpy( &ctx.keccak, &tt_mid.keccak, sizeof tt_mid.keccak );
+           sph_keccak512( &ctx.keccak, input + midlen, tail );
+           sph_keccak512_close( &ctx.keccak, hashB );
+        }
+        else
+        {
+           sph_keccak512( &ctx.keccak, hashA, dataLen );
+           sph_keccak512_close( &ctx.keccak, hashB );
+        }
+        break;
+     case 6:
+//        if ( i == 0 )
+//        {
+//           memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa );
+//           update_and_final_luffa( &ctx.luffa, hashB,
+//                                   input + 64, 16 );
+//        }
+//        else
+//        {
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
+                                   hashA, dataLen );
+//        }
+        break;
+     case 7:
+        if ( i == 0 )
+        {
+           memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube );
+           cubehashUpdateDigest( &ctx.cube, hashB,
+                                 input + midlen, tail );
+        }
+        else
+        {
+           cubehashUpdateDigest( &ctx.cube, hashB, hashA, dataLen );
+        }
+        break;
+     default:
+	break;
+    }
+  }

 	memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
 }
@@ -191,52 +274,98 @@ void timetravel_hash(void *output, const void *input)
 int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done )
 {
-	uint32_t _ALIGN(64) hash[8];
-	uint32_t _ALIGN(64) endiandata[20];
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
+   uint32_t _ALIGN(64) hash[8];
+   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;

-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
-	volatile uint8_t *restart = &(work_restart[thr_id].restart);
-        int i;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   int i;

-	if (opt_benchmark)
-		ptarget[7] = 0x0cff;
+   if (opt_benchmark)
+	ptarget[7] = 0x0cff;

-	for (int k=0; k < 19; k++)
-		be32enc(&endiandata[k], pdata[k]);
+   for (int k=0; k < 19; k++)
+	be32enc(&endiandata[k], pdata[k]);

-        const uint32_t timestamp = endiandata[17];
-        if ( timestamp != s_ntime )
+   const uint32_t timestamp = endiandata[17];
+   if ( timestamp != s_ntime )
+   {
+      const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
+                    % HASH_FUNC_COUNT_PERMUTATIONS;
+      for ( i = 0; i < HASH_FUNC_COUNT; i++ )
+         permutation[i] = i;
+      for ( i = 0; i < steps; i++ )
+         next_permutation( permutation, permutation + HASH_FUNC_COUNT );
+      s_ntime = timestamp;
+
+      // do midstate precalc for first function
+      switch ( permutation[0] )
+      {
+         case 0:
+//           memcpy( &tt_mid.blake, &tt_ctx.blake, sizeof(tt_mid.blake) );
+//           sph_blake256( &tt_mid.blake, endiandata, 64 );
+           break;
+        case 1:
+           memcpy( &tt_mid.bmw, &tt_ctx.bmw, sizeof(tt_mid.bmw) );
+           sph_bmw512( &tt_mid.bmw, endiandata, 64 );
+           break;
+        case 2:
+#ifdef NO_AES_NI
+           memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
+           sph_groestl512( &tt_mid.groestl, endiandata, 64 );
+#else
+           memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
+           update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 );
+#endif
+           break;
+        case 3:
+           memcpy( &tt_mid.skein, &tt_ctx.skein, sizeof(tt_mid.skein ) );
+           sph_skein512( &tt_mid.skein, endiandata, 64 );
+           break;
+        case 4:
+           memcpy( &tt_mid.jh, &tt_ctx.jh, sizeof(tt_mid.jh ) );
+           sph_jh512( &tt_mid.jh, endiandata, 64 );
+           break;
+         case 5:
+           memcpy( &tt_mid.keccak, &tt_ctx.keccak, sizeof(tt_mid.keccak ) );
+           sph_keccak512( &tt_mid.keccak, endiandata, 64 );
+           break;
+        case 6:
+//           init_luffa( &tt_mid.luffa, 512 );
+//           memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) );
+//           update_luffa( &tt_mid.luffa, endiandata, 64 );
+           break;
+        case 7:
+           memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) );
+           cubehashUpdate( &tt_mid.cube, endiandata, 64 );
+           break;
+        default:
+           break;
+      }
+   }
+
+   do {
+        be32enc( &endiandata[19], nonce );
+        timetravel_hash( hash, endiandata );
+
+        if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
        {
-           const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
-                         % HASH_FUNC_COUNT_PERMUTATIONS;
-           for ( i = 0; i < HASH_FUNC_COUNT; i++ )
-              permutation[i] = i;
-           for ( i = 0; i < steps; i++ )
-              next_permutation( permutation, permutation + HASH_FUNC_COUNT );
-           s_ntime = timestamp;
-        }
+              work_set_target_ratio( work, hash );
+              pdata[19] = nonce;
+              *hashes_done = pdata[19] - first_nonce;
+              return 1;
+         }
+         nonce++;

-	do {
-		be32enc(&endiandata[19], nonce);
-		timetravel_hash(hash, endiandata);
+        } while (nonce < max_nonce && !(*restart));

-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
-			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
-		nonce++;
-
-	} while (nonce < max_nonce && !(*restart));
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
+        pdata[19] = nonce;
+        *hashes_done = pdata[19] - first_nonce + 1;
+  return 0;
 }

 void timetravel_set_target( struct work* work, double job_diff )
--- a/algo/veltor.c
+++ b/algo/veltor.c
@@ -95,5 +95,6 @@ bool register_veltor_algo( algo_gate_t* gate )
    gate->hash      = (void*)&veltorhash;
    gate->hash_alt  = (void*)&veltorhash;
    gate->get_max64 = (void*)&get_max64_0x3ffff;
+    return true;
 }

--- a/algo/whirlpool/whirlpool.c
+++ b/algo/whirlpool/whirlpool.c
@@ -7,44 +7,58 @@
 #include <stdio.h>
 #include "sph_whirlpool.h"

+typedef struct {
+   sph_whirlpool_context   whirl1;
+   sph_whirlpool_context   whirl2;
+   sph_whirlpool_context   whirl3;
+   sph_whirlpool_context   whirl4;
+} whirlpool_ctx_holder;
+
+static whirlpool_ctx_holder whirl_ctx;
+static __thread sph_whirlpool_context whirl1_mid_ctx;
+
+// Initialise the four whirlpool contexts held in the file-level whirl_ctx
+// template, in order whirl1..whirl4.
+void init_whirlpool_ctx()
+{
+  sph_whirlpool_context *const stage[] = {
+     &whirl_ctx.whirl1, &whirl_ctx.whirl2,
+     &whirl_ctx.whirl3, &whirl_ctx.whirl4
+  };
+
+  for ( size_t k = 0; k < sizeof stage / sizeof stage[0]; k++ )
+     sph_whirlpool1_init( stage[k] );
+}
+
 void whirlpool_hash(void *state, const void *input)
 {
-	sph_whirlpool_context ctx_whirlpool;
+        whirlpool_ctx_holder ctx;
+        memcpy( &ctx, &whirl_ctx, sizeof(whirl_ctx) );

+        const int midlen = 64;
+        const int tail   = 80 - midlen;
 	unsigned char hash[128]; // uint32_t hashA[16], hashB[16];
 	#define hashB hash+64

-	memset(hash, 0, sizeof hash);
+        // copy cached midstate
+        memcpy( &ctx.whirl1, &whirl1_mid_ctx, sizeof whirl1_mid_ctx );
+        sph_whirlpool1( &ctx.whirl1, input + midlen, tail );
+        sph_whirlpool1_close(&ctx.whirl1, hash);

-	sph_whirlpool1_init(&ctx_whirlpool);
-	sph_whirlpool1(&ctx_whirlpool, input, 80);
-	sph_whirlpool1_close(&ctx_whirlpool, hash);
+	sph_whirlpool1(&ctx.whirl2, hash, 64);
+	sph_whirlpool1_close(&ctx.whirl2, hashB);

-	sph_whirlpool1_init(&ctx_whirlpool);
-	sph_whirlpool1(&ctx_whirlpool, hash, 64);
-	sph_whirlpool1_close(&ctx_whirlpool, hashB);
+	sph_whirlpool1(&ctx.whirl3, hashB, 64);
+	sph_whirlpool1_close(&ctx.whirl3, hash);

-	sph_whirlpool1_init(&ctx_whirlpool);
-	sph_whirlpool1(&ctx_whirlpool, hashB, 64);
-	sph_whirlpool1_close(&ctx_whirlpool, hash);
-
-	sph_whirlpool1_init(&ctx_whirlpool);
-	sph_whirlpool1(&ctx_whirlpool, hash, 64);
-	sph_whirlpool1_close(&ctx_whirlpool, hash);
+	sph_whirlpool1(&ctx.whirl4, hash, 64);
+	sph_whirlpool1_close(&ctx.whirl4, hash);

 	memcpy(state, hash, 32);
 }

-void whirlpool_midstate(void *state, const void *input)
+void whirlpool_midstate( const void* input )
 {
-	sph_whirlpool_context ctx;
-
-	sph_whirlpool1_init(&ctx);
-	sph_whirlpool1(&ctx, input, 64);
-
-	memcpy(state, ctx.state, 64);
+       memcpy( &whirl1_mid_ctx, &whirl_ctx.whirl1, sizeof whirl1_mid_ctx );
+       sph_whirlpool1( &whirl1_mid_ctx, input, 64 );
 }

+
 int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t _ALIGN(128) endiandata[20];
@@ -59,6 +73,8 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign
        for (int i=0; i < 19; i++)
                be32enc(&endiandata[i], pdata[i]);

+        whirlpool_midstate( endiandata );
+
 	do {
 		const uint32_t Htarg = ptarget[7];
 		uint32_t vhash[8];
@@ -82,9 +98,9 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign

 bool register_whirlpool_algo( algo_gate_t* gate )
 {
-  algo_not_tested();
  gate->scanhash  = (void*)&scanhash_whirlpool;
  gate->hash      = (void*)&whirlpool_hash;
+  init_whirlpool_ctx();
  return true;
 };

--- a/algo/x11/x11evo.c
+++ b/algo/x11/x11evo.c
@@ -71,10 +71,18 @@ void init_x11evo_ctx()
     sph_shavite512_init( &x11evo_ctx.shavite );
 }

+/*
 uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
 {
 	return (current_time - base_time) / (60 * 60 * 24);
 }
+*/
+
+// Sequence number for a block time: the offset of current_time from
+// INITIAL_DATE, cast to int, divided by the number of seconds in a day,
+// so the value changes once per day.
+static inline int getCurrentAlgoSeq( uint32_t current_time )
+{
+        const int since_start = (int) (current_time - INITIAL_DATE);
+        return since_start / (60 * 60 * 24);
+}

 // swap_vars doesn't work here
 void evo_swap( uint8_t *a, uint8_t *b )
@@ -136,41 +144,37 @@ void getAlgoString( char *str, uint32_t count )
 	//applog(LOG_DEBUG, "nextPerm %s", str);
 }

-// Broken on Windows
-#if !((defined(__WINDOWS__)) || (defined(__WIN64)))
-static __thread uint32_t saved_ntime = UINT32_MAX;
-#endif
+static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+static __thread uint32_t s_ntime = UINT32_MAX;
+static int s_seq = -1;

-void evocoin_twisted_code( char *result, char *code )
+static void evo_twisted_code(uint32_t ntime, char *permstr)
 {
-    uint32_t h32, *be32 = get_stratum_job_ntime();
-#if !((defined(__WINDOWS__)) || (defined(__WIN64)))
-    if ( *be32 != saved_ntime )
-    {
-#endif
-        h32 = be32toh(*be32);
-	uint32_t count = getCurrentAlgoSeq(h32, INITIAL_DATE);
-	getAlgoString(code, count);
-	sprintf(result, "_%d_%s_", count, code);
-#if !((defined(__WINDOWS__)) || (defined(__WIN64)))
-        saved_ntime = *be32;
-    }
-#endif
+        int seq = getCurrentAlgoSeq(ntime);
+        if (s_seq != seq)
+        {
+                getAlgoString(permstr, seq);
+                s_seq = seq;
+        }
 }

 static inline void x11evo_hash( void *state, const void *input )
 {
   uint32_t hash[16];
-   char completeCode[64];
-   char resultCode[HASH_FUNC_COUNT + 1];
   x11evo_ctx_holder ctx;
   memcpy( &ctx, &x11evo_ctx, sizeof(x11evo_ctx) );
-   evocoin_twisted_code( completeCode, resultCode );
+
+   if ( s_seq == -1 )
+   {
+       uint32_t *data = (uint32_t*) input;
+       const uint32_t ntime = data[17];
+       evo_twisted_code(ntime, hashOrder);
+    }

   int i;
-   for ( i = 0; i < strlen(resultCode); i++ )
+   for ( i = 0; i < strlen(hashOrder); i++ )
   {
-	char elem = resultCode[i];
+	char elem = hashOrder[i];
 	uint8_t idx;
 	if (elem >= 'A')
 		idx = elem - 'A' + 10;
@@ -196,8 +200,6 @@ static inline void x11evo_hash( void *state, const void *input )
 #else
              update_and_final_groestl( &ctx.groestl, (char*)hash,
                                        (const char*)hash, 512 );
-//              update_groestl( &ctx.groestl, (char*)hash, 512 );
-//              final_groestl( &ctx.groestl, (char*)hash );
 #endif
 	      break;
 	    case 3:
@@ -215,14 +217,10 @@ static inline void x11evo_hash( void *state, const void *input )
 	    case 6:
              update_and_final_luffa( &ctx.luffa, (char*)hash,
                                      (const char*)hash, 64 );
-//              update_luffa( &ctx.luffa, (char*)hash, 64 );
-//              final_luffa( &ctx.luffa, (char*)hash );
 	      break;
 	    case 7:
              cubehashUpdateDigest( &ctx.cube, (char*)hash, 
                                    (const char*)hash, 64 );
-//              cubehashUpdate( &ctx.cube, (char*)hash, 64 );
-//              cubehashDigest( &ctx.cube, (char*)hash );
 	      break;
 	    case 8:
 	      sph_shavite512( &ctx.shavite, (char*)hash, size );
@@ -239,8 +237,6 @@ static inline void x11evo_hash( void *state, const void *input )
 #else
              update_final_echo( &ctx.echo, (char*)hash,
                                 (const char*)hash, 512 );
-//              update_echo( &ctx.echo, (char*)hash, 512 );
-//              final_echo( &ctx.echo, (char*)hash );
 #endif
 	      break;
 	}
@@ -263,6 +259,13 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,

        swab32_array( endiandata, pdata, 20 );

+        int ntime = endiandata[17];
+        if ( ntime != s_ntime  ||  s_seq == -1 )
+        {
+            evo_twisted_code( ntime, hashOrder );
+            s_ntime = ntime;
+        }
+
        uint32_t hmask = 0xFFFFFFFF;
        if ( Htarg  > 0 )
        {