diff --git a/Makefile.am b/Makefile.am
index c5f9a8e..832fe6f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -105,6 +105,7 @@ cpuminer_SOURCES = \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2re.c \
+ algo/lyra2/zcoin.c \
algo/keccak/sse2/keccak.c \
algo/m7m.c \
algo/neoscrypt.c \
diff --git a/RELEASE_ANNOUNCEMENT b/RELEASE_ANNOUNCEMENT
index 87c6f65..efb153a 100644
--- a/RELEASE_ANNOUNCEMENT
+++ b/RELEASE_ANNOUNCEMENT
@@ -12,7 +12,9 @@ comparison below.
New in 3.4.8
+- added zcoin support, optimized for AVX2 though with no increase in performance yet
- fixed API display of diff for cryptonight
+- --show-diff is now the default; use "--hide-diff" to disable it
- cleaned up some cpuminer-multi artifacts
Users with non-SSE2 CPUs or who want to mine algos not supported by
diff --git a/algo-gate-api.c b/algo-gate-api.c
index f1d1b01..d161fe2 100644
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -169,6 +169,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
+ case ALGO_LYRA2Z: register_zcoin_algo ( gate ); break;
case ALGO_M7M: register_m7m_algo ( gate ); break;
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
@@ -268,6 +269,7 @@ const char* const algo_alias_map[][2] =
{ "sib", "x11gost" },
{ "yes", "yescrypt" },
{ "ziftr", "zr5" },
+ { "zcoin", "lyra2z" },
{ NULL, NULL }
};
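
The alias only takes effect because the requested algo name is rewritten to its canonical form before register_algo_gate runs, so `-a zcoin` ends up registering the lyra2z gate. A minimal sketch of that resolution step, assuming a helper of this shape (the miner's actual resolver may differ):

    #include <string.h>

    extern const char* const algo_alias_map[][2];   // the table above

    // Illustrative helper (not the miner's actual API): map an alias like
    // "zcoin" to its canonical algo name before gate registration.
    const char* resolve_algo_alias( const char* name )
    {
       for ( int i = 0; algo_alias_map[i][0] != NULL; i++ )
          if ( strcmp( name, algo_alias_map[i][0] ) == 0 )
             return algo_alias_map[i][1];           // "zcoin" -> "lyra2z"
       return name;                                 // already canonical
    }
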
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 4d72b09..bf7bebf 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -115,7 +115,7 @@ void ( *hash_alt ) ( void*, const void*, uint32_t );
void ( *hash_suw ) ( void*, const void* );
//optional, safe to use default in most cases
-bool ( *miner_thread_init ) ();
+bool ( *miner_thread_init ) ( int );
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
bool );
@@ -129,7 +129,7 @@ bool ( *submit_getwork_result ) ( CURL*, struct work* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
void ( *set_work_data_endian ) ( struct work* );
-void ( *calc_network_diff ) ( struct work* );
+double ( *calc_network_diff ) ( struct work* );
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
bool ( *prevent_dupes ) ( struct work*, struct stratum_ctx*, int );
void ( *resync_threads ) ( struct work* );
@@ -229,7 +229,7 @@ void jr2_build_stratum_request ( char *req, struct work *work );
// default is do_nothing;
void swab_work_data( struct work *work );
-void std_calc_network_diff( struct work *work );
+double std_calc_network_diff( struct work *work );
void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
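
The two signature changes above alter the gate contract: miner_thread_init now receives the thread id, and calc_network_diff returns the computed difficulty instead of writing the net_diff global as a side effect, leaving the caller to update it. A hedged sketch of what call sites would look like under the new contract (the mock struct and worker are illustrative, not the miner's actual code):

    #include <stdbool.h>
    #include <stdio.h>

    struct work;                                       // opaque here
    typedef struct
    {
       bool   (*miner_thread_init)( int );             // now takes thr_id
       double (*calc_network_diff)( struct work* );    // now returns the diff
    } algo_gate_t;

    static double net_diff = 0.;

    static void miner_thread( algo_gate_t *gate, struct work *w, int thr_id )
    {
       if ( !gate->miner_thread_init( thr_id ) )       // per-thread setup
       {
          fprintf( stderr, "thread %d failed to initialize\n", thr_id );
          return;
       }
       net_diff = gate->calc_network_diff( w );        // caller owns the global
    }
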
diff --git a/algo/blake/decred.c b/algo/blake/decred.c
index 44ac63f..b6684aa 100644
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -115,7 +115,7 @@ uint32_t *decred_get_nonceptr( uint32_t *work_data )
// does it need to increment nonce, seems not because gen_work_now always
// returns true
-void decred_calc_network_diff( struct work* work )
+double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
@@ -123,17 +123,18 @@ void decred_calc_network_diff( struct work* work )
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
- net_diff = (double)0x0000ffff / (double)bits;
+ double d = (double)0x0000ffff / (double)bits;
for ( m = shift; m < 29; m++ )
- net_diff *= 256.0;
+ d *= 256.0;
for ( m = 29; m < shift; m++ )
- net_diff /= 256.0;
+ d /= 256.0;
if ( shift == 28 )
- net_diff *= 256.0; // testnet
+ d *= 256.0; // testnet
if ( opt_debug_diff )
- applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", net_diff,
- shift, bits);
+ applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
+ shift, bits );
+ return d;
}
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
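
The compact-bits arithmetic is easiest to check against the sample quoted in the comments (diff 43.281 for nbits 1c05ea29). A standalone sketch reproducing it:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
       uint32_t nbits = 0x1c05ea29;         // sample quoted in the comments
       uint32_t bits  = nbits & 0xffffff;   // 0x05ea29 = 387625
       int      shift = nbits >> 24;        // top byte, 0x1c = 28; the same
                                            // value swab32(nbits) & 0xff yields
       double d = (double)0x0000ffff / (double)bits;   // ~0.169069
       for ( int m = shift; m < 29; m++ )
          d *= 256.0;                       // one step for shift 28: ~43.282
       printf( "net diff %.3f\n", d );      // matches the quoted 43.281; the
                                            // shift == 28 testnet branch above
                                            // would scale it once more
       return 0;
    }
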
diff --git a/algo/drop.c b/algo/drop.c
index 2b16fb6..aa3dae3 100644
--- a/algo/drop.c
+++ b/algo/drop.c
@@ -233,11 +233,6 @@ void drop_get_new_work( struct work* work, struct work* g_work, int thr_id,
++(*nonceptr);
}
-void drop_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
void drop_display_pok( struct work* work )
{
if ( work->data[0] & 0x00008000 )
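
The deleted drop_set_target scaled the stratum job diff by 65536 before setting the target. The gate API appears to provide an equivalent default, so presumably the drop gate now points at it instead; the registration change isn't in this diff, so the helper below is an assumption about the gate's standard set:

    // Assumed gate-API helper replacing the deleted drop_set_target (the
    // actual registration isn't shown in this diff). It applies the same
    // 65536 scaling the deleted function had:
    void scrypt_set_target( struct work* work, double job_diff )
    {
       work_set_target( work, job_diff / ( 65536.0 * opt_diff_factor ) );
    }
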
diff --git a/algo/lbry.c b/algo/lbry.c
index d7bd8d4..d2f081c 100644
--- a/algo/lbry.c
+++ b/algo/lbry.c
@@ -142,7 +142,7 @@ int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
-void lbry_calc_network_diff(struct work *work)
+double lbry_calc_network_diff( struct work *work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
@@ -159,7 +159,7 @@ void lbry_calc_network_diff(struct work *work)
if (opt_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
- net_diff = d;
+ return d;
}
// std_le should work but it doesn't
diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c
index 5f1b527..fab6da5 100644
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -44,171 +44,350 @@
*
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
*/
-int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols)
+
+// Lyra2RE & Lyra2REv2, nRows must be a power of 2
+int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+ const void *salt, uint64_t saltlen, uint64_t timeCost,
+ const uint64_t nRows, const uint64_t nCols )
{
- //============================= Basic variables ============================//
- int64_t row = 2; //index of row to be processed
- int64_t prev = 1; //index of prev (last row ever computed/modified)
- int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
- int64_t tau; //Time Loop iterator
- int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
- int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
- int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
- int64_t i; //auxiliary iteration counter
- int64_t v64; // 64bit var for memcpy
- //==========================================================================/
+ //====================== Basic variables ============================//
+ uint64_t _ALIGN(256) state[16];
+ int64_t row = 2; //index of row to be processed
+ int64_t prev = 1; //index of prev (last row ever computed/modified)
+ int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+ int64_t tau; //Time Loop iterator
+ int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+ int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+ int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+ int64_t i; //auxiliary iteration counter
+ int64_t v64; // 64bit var for memcpy
+ //====================================================================/
- //========== Initializing the Memory Matrix and pointers to it =============//
- //Tries to allocate enough space for the whole memory matrix
+ //=== Initializing the Memory Matrix and pointers to it =============//
+ //Tries to allocate enough space for the whole memory matrix
- const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
- const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
- // for Lyra2REv2, nCols = 4, v1 was using 8
- const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+ const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+ const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+ // for Lyra2REv2, nCols = 4, v1 was using 8
+ const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+ : BLOCK_LEN_BLAKE2_SAFE_BYTES;
- i = (int64_t)ROW_LEN_BYTES * nRows;
- uint64_t *wholeMatrix = malloc(i);
- if (wholeMatrix == NULL) {
- return -1;
- }
- memset(wholeMatrix, 0, i);
+ i = (int64_t)ROW_LEN_BYTES * nRows;
+ uint64_t *wholeMatrix = malloc(i);
+ if (wholeMatrix == NULL)
+ return -1;
- //Allocates pointers to each row of the matrix
- uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
- if (memMatrix == NULL) {
- return -1;
- }
- //Places the pointers in the correct positions
- uint64_t *ptrWord = wholeMatrix;
- for (i = 0; i < nRows; i++) {
- memMatrix[i] = ptrWord;
- ptrWord += ROW_LEN_INT64;
- }
- //==========================================================================/
+ memset(wholeMatrix, 0, i);
- //============= Getting the password + salt + basil padded with 10*1 ===============//
- //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
- //but this ensures that the password copied locally will be overwritten as soon as possible
+ //Allocates pointers to each row of the matrix
+ uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
+ if (memMatrix == NULL)
+ {
+ free(wholeMatrix); //don't leak the matrix allocated above
+ return -1;
+ }
- //First, we clean enough blocks for the password, salt, basil and padding
- int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+ //Places the pointers in the correct positions
+ uint64_t *ptrWord = wholeMatrix;
+ for (i = 0; i < nRows; i++)
+ {
+ memMatrix[i] = ptrWord;
+ ptrWord += ROW_LEN_INT64;
+ }
- byte *ptrByte = (byte*) wholeMatrix;
+ //=== Getting the password + salt + basil padded with 10*1 ==========//
+ //OBS.: The memory matrix will temporarily hold the password: not for saving memory,
+ //but this ensures that the password copied locally will be overwritten as soon as possible
- //Prepends the password
- memcpy(ptrByte, pwd, pwdlen);
- ptrByte += pwdlen;
+ //First, we clean enough blocks for the password, salt, basil and padding
+ int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+ / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
- //Concatenates the salt
- memcpy(ptrByte, salt, saltlen);
- ptrByte += saltlen;
+ byte *ptrByte = (byte*) wholeMatrix;
- memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen));
+ //Prepends the password
+ memcpy(ptrByte, pwd, pwdlen);
+ ptrByte += pwdlen;
- //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
- memcpy(ptrByte, &kLen, sizeof(int64_t));
- ptrByte += sizeof(uint64_t);
- v64 = pwdlen;
- memcpy(ptrByte, &v64, sizeof(int64_t));
- ptrByte += sizeof(uint64_t);
- v64 = saltlen;
- memcpy(ptrByte, &v64, sizeof(int64_t));
- ptrByte += sizeof(uint64_t);
- v64 = timeCost;
- memcpy(ptrByte, &v64, sizeof(int64_t));
- ptrByte += sizeof(uint64_t);
- v64 = nRows;
- memcpy(ptrByte, &v64, sizeof(int64_t));
- ptrByte += sizeof(uint64_t);
- v64 = nCols;
- memcpy(ptrByte, &v64, sizeof(int64_t));
- ptrByte += sizeof(uint64_t);
+ //Concatenates the salt
+ memcpy(ptrByte, salt, saltlen);
+ ptrByte += saltlen;
- //Now comes the padding
- *ptrByte = 0x80; //first byte of padding: right after the password
- ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
- ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
- *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
- //==========================================================================/
+ memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+ - (saltlen + pwdlen) );
- //======================= Initializing the Sponge State ====================//
- //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
- uint64_t _ALIGN(256) state[16];
- initState(state);
- //==========================================================================/
+ //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+ memcpy(ptrByte, &kLen, sizeof(int64_t));
+ ptrByte += sizeof(uint64_t);
+ v64 = pwdlen;
+ memcpy(ptrByte, &v64, sizeof(int64_t));
+ ptrByte += sizeof(uint64_t);
+ v64 = saltlen;
+ memcpy(ptrByte, &v64, sizeof(int64_t));
+ ptrByte += sizeof(uint64_t);
+ v64 = timeCost;
+ memcpy(ptrByte, &v64, sizeof(int64_t));
+ ptrByte += sizeof(uint64_t);
+ v64 = nRows;
+ memcpy(ptrByte, &v64, sizeof(int64_t));
+ ptrByte += sizeof(uint64_t);
+ v64 = nCols;
+ memcpy(ptrByte, &v64, sizeof(int64_t));
+ ptrByte += sizeof(uint64_t);
- //================================ Setup Phase =============================//
- //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
- ptrWord = wholeMatrix;
- for (i = 0; i < nBlocksInput; i++) {
- absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
- ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
- }
+ //Now comes the padding
+ *ptrByte = 0x80; //first byte of padding: right after the password
+ ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+ ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+ *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
- //Initializes M[0] and M[1]
- reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
+ //================= Initializing the Sponge State ====================//
+ //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+ initState(state);
- reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
+ //========================= Setup Phase =============================//
+ //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+ ptrWord = wholeMatrix;
+ for (i = 0; i < nBlocksInput; i++)
+ {
+ absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+ ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+ }
- do {
- //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+ //Initializes M[0] and M[1]
+ reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
- reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+ reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
- //updates the value of row* (deterministically picked during Setup))
- rowa = (rowa + step) & (window - 1);
- //update prev: it now points to the last row ever computed
- prev = row;
- //updates row: goes to the next row to be computed
- row++;
+ do
+ {
+ //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
- //Checks if all rows in the window where visited.
- if (rowa == 0) {
- step = window + gap; //changes the step: approximately doubles its value
- window *= 2; //doubles the size of the re-visitation window
- gap = -gap; //inverts the modifier to the step
- }
+ reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
- } while (row < nRows);
- //==========================================================================/
+ //updates the value of row* (deterministically picked during Setup)
+ rowa = (rowa + step) & (window - 1);
+ //update prev: it now points to the last row ever computed
+ prev = row;
+ //updates row: goes to the next row to be computed
+ row++;
- //============================ Wandering Phase =============================//
- row = 0; //Resets the visitation to the first row of the memory matrix
- for (tau = 1; tau <= timeCost; tau++) {
- //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
- step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
- do {
- //Selects a pseudorandom index row*
- //------------------------------------------------------------------------------------------
- rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
- //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
- //------------------------------------------------------------------------------------------
+ //Checks if all rows in the window were visited.
+ if (rowa == 0)
+ {
+ step = window + gap; //changes the step: approximately doubles its value
+ window *= 2; //doubles the size of the re-visitation window
+ gap = -gap; //inverts the modifier to the step
+ }
- //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
- reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+ } while (row < nRows);
- //update prev: it now points to the last row ever computed
- prev = row;
+ //===================== Wandering Phase =============================//
+ row = 0; //Resets the visitation to the first row of the memory matrix
+ for (tau = 1; tau <= timeCost; tau++)
+ {
+ //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+ step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+ do
+ {
+ //Selects a pseudorandom index row*
+ //-----------------------------------------------
+ rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+ //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+ //-------------------------------------------
- //updates row: goes to the next row to be computed
- //------------------------------------------------------------------------------------------
- row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
- //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
- //------------------------------------------------------------------------------------------
+ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+ reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
- } while (row != 0);
- }
+ //update prev: it now points to the last row ever computed
+ prev = row;
- //============================ Wrap-up Phase ===============================//
- //Absorbs the last block of the memory matrix
- absorbBlock(state, memMatrix[rowa]);
+ //updates row: goes to the next row to be computed
+ //----------------------------------------------------
+ row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+ //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+ //----------------------------------------------------
- //Squeezes the key
- squeeze(state, K, (unsigned int) kLen);
+ } while (row != 0);
+ }
- //========================= Freeing the memory =============================//
- free(memMatrix);
- free(wholeMatrix);
+ //===================== Wrap-up Phase ===============================//
+ //Absorbs the last block of the memory matrix
+ absorbBlock(state, memMatrix[rowa]);
- return 0;
+ //Squeezes the key
+ squeeze(state, K, (unsigned int) kLen);
+
+ //================== Freeing the memory =============================//
+ free(memMatrix);
+ free(wholeMatrix);
+
+ return 0;
}
+
+// Zcoin, nRows may be any value
+int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+ const void *salt, uint64_t saltlen, uint64_t timeCost,
+ uint64_t nRows, uint64_t nCols )
+{
+ //========================== Basic variables ============================//
+ uint64_t _ALIGN(256) state[16];
+ int64_t row = 2; //index of row to be processed
+ int64_t prev = 1; //index of prev (last row ever computed/modified)
+ int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+ int64_t tau; //Time Loop iterator
+ int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+ int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+ int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+ int64_t i; //auxiliary iteration counter
+ //=======================================================================/
+
+ //======= Initializing the Memory Matrix and pointers to it =============//
+ //Tries to allocate enough space for the whole memory matrix
+
+ const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+ const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+ i = (int64_t)ROW_LEN_BYTES * nRows;
+ uint64_t *wholeMatrix = malloc(i);
+ if (wholeMatrix == NULL)
+ return -1;
+
+ memset(wholeMatrix, 0, i);
+ //Allocates pointers to each row of the matrix
+ uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
+ if (memMatrix == NULL)
+ {
+ free(wholeMatrix); //don't leak the matrix allocated above
+ return -1;
+ }
+
+ //Places the pointers in the correct positions
+ uint64_t *ptrWord = wholeMatrix;
+ for (i = 0; i < nRows; i++)
+ {
+ memMatrix[i] = ptrWord;
+ ptrWord += ROW_LEN_INT64;
+ }
+
+ //==== Getting the password + salt + basil padded with 10*1 ============//
+ //OBS.: The memory matrix will temporarily hold the password: not for saving memory,
+ //but this ensures that the password copied locally will be overwritten as soon as possible
+
+ //First, we clean enough blocks for the password, salt, basil and padding
+ uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) )
+ / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+ byte *ptrByte = (byte*) wholeMatrix;
+ memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
+
+ //Prepends the password
+ memcpy(ptrByte, pwd, pwdlen);
+ ptrByte += pwdlen;
+
+ //Concatenates the salt
+ memcpy(ptrByte, salt, saltlen);
+ ptrByte += saltlen;
+
+ //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+ memcpy(ptrByte, &kLen, sizeof (uint64_t));
+ ptrByte += sizeof (uint64_t);
+ memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+ ptrByte += sizeof (uint64_t);
+ memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+ ptrByte += sizeof (uint64_t);
+ memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+ ptrByte += sizeof (uint64_t);
+ memcpy(ptrByte, &nRows, sizeof (uint64_t));
+ ptrByte += sizeof (uint64_t);
+ memcpy(ptrByte, &nCols, sizeof (uint64_t));
+ ptrByte += sizeof (uint64_t);
+
+ //Now comes the padding
+ *ptrByte = 0x80; //first byte of padding: right after the password
+ ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+ ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+ *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+ //=================== Initializing the Sponge State ====================//
+ //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+ initState( state );
+
+ //============================== Setup Phase =============================//
+ //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+ ptrWord = wholeMatrix;
+ for ( i = 0; i < nBlocksInput; i++ )
+ {
+ absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+ ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
+ }
+
+ //Initializes M[0] and M[1]
+ reducedSqueezeRow0( state, memMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+ reducedDuplexRow1( state, memMatrix[0], memMatrix[1], nCols );
+
+ do
+ {
+ //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+ reducedDuplexRowSetup( state, memMatrix[prev], memMatrix[rowa],
+ memMatrix[row], nCols );
+
+ //updates the value of row* (deterministically picked during Setup)
+ rowa = (rowa + step) & (window - 1);
+ //update prev: it now points to the last row ever computed
+ prev = row;
+ //updates row: goes to the next row to be computed
+ row++;
+
+ //Checks if all rows in the window were visited.
+ if (rowa == 0)
+ {
+ step = window + gap; //changes the step: approximately doubles its value
+ window *= 2; //doubles the size of the re-visitation window
+ gap = -gap; //inverts the modifier to the step
+ }
+
+ } while (row < nRows);
+
+ //======================== Wandering Phase =============================//
+ row = 0; //Resets the visitation to the first row of the memory matrix
+ for ( tau = 1; tau <= timeCost; tau++ )
+ {
+ //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+ step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+ do {
+ //Selects a pseudorandom index row*
+ //----------------------------------------------------------------------
+ //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+ rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+ //-----------------------------------------------------------------
+
+ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+ reducedDuplexRow( state, memMatrix[prev], memMatrix[rowa],
+ memMatrix[row], nCols );
+
+ //update prev: it now points to the last row ever computed
+ prev = row;
+
+ //updates row: goes to the next row to be computed
+ //---------------------------------------------------------------
+ //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+ row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+ //--------------------------------------------------------------------
+
+ } while (row != 0);
+ }
+
+ //========================= Wrap-up Phase ===============================//
+ //Absorbs the last block of the memory matrix
+ absorbBlock( state, memMatrix[rowa] );
+
+ //Squeezes the key
+ squeeze( state, K, kLen );
+
+ //====================== Freeing the memory =============================//
+ free( memMatrix );
+ free( wholeMatrix );
+
+ return 0;
+}
+
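
The new entry point differs from LYRA2 in using the generic modulo row selection, so nRows need not be a power of 2, and in stepping the absorb loop by BLOCK_LEN_BLAKE2_SAFE_INT64. A sketch of how the zcoin.c added in the Makefile hunk above (but not shown in this diff) would presumably drive it; the Blake-256 first pass and the t = 8, r = 8, c = 8 cost parameters are assumptions based on Zcoin's lyra2z specification:

    #include <string.h>
    #include <stdint.h>
    #include "lyra2.h"
    #include "sph_blake.h"

    void zcoin_hash( void *output, const void *input )
    {
       uint32_t hash[8];                    // 256-bit intermediate
       sph_blake256_context ctx;

       sph_blake256_init( &ctx );
       sph_blake256( &ctx, input, 80 );     // 80-byte block header
       sph_blake256_close( &ctx, hash );

       // the Blake digest serves as password, salt and key buffer
       LYRA2Z( hash, 32, hash, 32, hash, 32, 8, 8, 8 );

       memcpy( output, hash, 32 );
    }
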
diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h
index edf9179..3a9403b 100644
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -37,6 +37,10 @@ typedef unsigned char byte;
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif
-int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
-
+int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+ const void *salt, uint64_t saltlen, uint64_t timeCost,
+ uint64_t nRows, uint64_t nCols );
+int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+ const void *salt, uint64_t saltlen, uint64_t timeCost,
+ uint64_t nRows, uint64_t nCols );
#endif /* LYRA2_H_ */
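
The sponge.c changes that follow vectorize the M[row*] = M[row*] XOR rotW(rand) step as _mm256_permute4x64_epi64( v, 0x93 ) plus _mm256_blend_epi32( a, b, 0x03 ) over the three 256-bit lanes: the permute rotates each 4-word vector right by one 64-bit word, and the blend carries the displaced word across vector boundaries. A scalar model of the operation being reproduced, assuming the 12-word block layout used throughout these files:

    #include <stdint.h>

    // rotW: rotate the 12-word rand output right by one 64-bit word, then
    // XOR it into M[row*]. Equivalent to the scalar state[11], state[0],
    // ..., state[10] sequence in the non-vector paths below.
    static void rotW_xor( uint64_t inout[12], const uint64_t state[12] )
    {
       for ( int i = 0; i < 12; i++ )
          inout[i] ^= state[ (i + 11) % 12 ];   // { s11, s0, s1, ..., s10 }
    }
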
diff --git a/algo/lyra2/sponge.c b/algo/lyra2/sponge.c
index e746ede..0ab66ba 100644
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -25,6 +25,8 @@
#include "sponge.h"
#include "lyra2.h"
+
+
/**
* Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
* receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges
@@ -36,10 +38,8 @@
*
* @param state The 1024-bit array to be initialized
*/
-void initState(uint64_t state[/*16*/])
-{
+inline void initState(uint64_t state[/*16*/]) {
#ifdef __AVX2__
-
(*(__m256i*)(&state[0])) = _mm256_setzero_si256();
(*(__m256i*)(&state[4])) = _mm256_setzero_si256();
@@ -56,18 +56,17 @@ void initState(uint64_t state[/*16*/])
//#elif defined __AVX__
#else
-
- //First 512 bis are zeros
- memset(state, 0, 64);
- //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
- state[8] = blake2b_IV[0];
- state[9] = blake2b_IV[1];
- state[10] = blake2b_IV[2];
- state[11] = blake2b_IV[3];
- state[12] = blake2b_IV[4];
- state[13] = blake2b_IV[5];
- state[14] = blake2b_IV[6];
- state[15] = blake2b_IV[7];
+ //First 512 bits are zeros
+ memset(state, 0, 64);
+ //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
+ state[8] = blake2b_IV[0];
+ state[9] = blake2b_IV[1];
+ state[10] = blake2b_IV[2];
+ state[11] = blake2b_IV[3];
+ state[12] = blake2b_IV[4];
+ state[13] = blake2b_IV[5];
+ state[14] = blake2b_IV[6];
+ state[15] = blake2b_IV[7];
#endif
}
@@ -76,23 +75,11 @@ void initState(uint64_t state[/*16*/])
*
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
*/
-__inline static void blake2bLyra( uint64_t *v )
-{
+inline static void blake2bLyra(uint64_t *v) {
#if defined __AVX2__
-
+// may still be used by squeeze
LYRA_INIT_AVX2; // defines local a[4]
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
- LYRA_ROUND_AVX2;
+ LYRA_12_ROUNDS_AVX2( a[0], a[1], a[2], a[3] );
LYRA_CLOSE_AVX2;
#elif defined __AVX__
@@ -113,20 +100,18 @@ __inline static void blake2bLyra( uint64_t *v )
LYRA_CLOSE_AVX;
#else
-
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
- ROUND_LYRA(0);
-
+ ROUND_LYRA(0);
+ ROUND_LYRA(1);
+ ROUND_LYRA(2);
+ ROUND_LYRA(3);
+ ROUND_LYRA(4);
+ ROUND_LYRA(5);
+ ROUND_LYRA(6);
+ ROUND_LYRA(7);
+ ROUND_LYRA(8);
+ ROUND_LYRA(9);
+ ROUND_LYRA(10);
+ ROUND_LYRA(11);
#endif
}
@@ -134,19 +119,8 @@ __inline static void blake2bLyra( uint64_t *v )
* Executes a reduced version of Blake2b's G function with only one round
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
*/
-__inline static void reducedBlake2bLyra(uint64_t *v) {
-
-#if defined __AVX2__
- LYRA_INIT_AVX2; // defines local a[4]
- LYRA_ROUND_AVX2;
- LYRA_CLOSE_AVX2;
-#elif defined __AVX__
- LYRA_INIT_AVX; // defines locals a0[4], a1[4]
- LYRA_ROUND_AVX;
- LYRA_CLOSE_AVX;
-#else
- ROUND_LYRA(0);
-#endif
+inline static void reducedBlake2bLyra(uint64_t *v) {
+ ROUND_LYRA(0);
}
/**
@@ -157,21 +131,21 @@ __inline static void reducedBlake2bLyra(uint64_t *v) {
* @param out Array that will receive the data squeezed
* @param len The number of bytes to be squeezed into the "out" array
*/
-void squeeze(uint64_t *state, byte *out, unsigned int len)
+inline void squeeze( uint64_t *state, byte *out, unsigned int len )
{
- int fullBlocks = len / BLOCK_LEN_BYTES;
- byte *ptr = out;
- int i;
+ int fullBlocks = len / BLOCK_LEN_BYTES;
+ byte *ptr = out;
+ int i;
- //Squeezes full blocks
- for (i = 0; i < fullBlocks; i++) {
- memcpy(ptr, state, BLOCK_LEN_BYTES);
- blake2bLyra(state);
- ptr += BLOCK_LEN_BYTES;
- }
-
- //Squeezes remaining bytes
- memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
+ //Squeezes full blocks
+ for ( i = 0; i < fullBlocks; i++ )
+ {
+ memcpy(ptr, state, BLOCK_LEN_BYTES);
+ blake2bLyra(state);
+ ptr += BLOCK_LEN_BYTES;
+ }
+ //Squeezes remaining bytes
+ memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
}
/**
@@ -181,12 +155,10 @@ void squeeze(uint64_t *state, byte *out, unsigned int len)
* @param state The current state of the sponge
* @param in The block to be absorbed (BLOCK_LEN_INT64 words)
*/
-void absorbBlock(uint64_t *state, const uint64_t *in)
-{
-//XORs the first BLOCK_LEN_INT64 words of "in" with the current state
+inline void absorbBlock(uint64_t *state, const uint64_t *in) {
#if defined __AVX2__
- __m256i state_v[3], in_v[3];
+ __m256i state_v[4], in_v[3];
// only state is guaranteed aligned 256
state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
@@ -195,13 +167,18 @@ void absorbBlock(uint64_t *state, const uint64_t *in)
in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) );
state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
in_v [2] = _mm256_loadu_si256( (__m256i*)(&in[8]) );
+ state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );
- _mm256_store_si256( (__m256i*)&state[0],
- _mm256_xor_si256( state_v[0], in_v[0] ) );
- _mm256_store_si256( (__m256i*)&state[4],
- _mm256_xor_si256( state_v[1], in_v[1] ) );
- _mm256_store_si256( (__m256i*)&state[8],
- _mm256_xor_si256( state_v[2], in_v[2] ) );
+ state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] );
+ state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] );
+ state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] );
+
+ LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );
+
+ _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
+ _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
+ _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
+ _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
#elif defined __AVX__
@@ -221,6 +198,8 @@ void absorbBlock(uint64_t *state, const uint64_t *in)
in_v[4] = _mm_load_si128( (__m128i*)(&in[8]) );
in_v[5] = _mm_load_si128( (__m128i*)(&in[10]) );
+// do blake2bLyra without init
+// LYRA_ROUND_AVX2( state_v )
_mm_store_si128( (__m128i*)(&state[0]),
_mm_xor_si128( state_v[0], in_v[0] ) );
_mm_store_si128( (__m128i*)(&state[2]),
@@ -234,26 +213,28 @@ void absorbBlock(uint64_t *state, const uint64_t *in)
_mm_store_si128( (__m128i*)(&state[10]),
_mm_xor_si128( state_v[5], in_v[5] ) );
-#else
+ //Applies the transformation f to the sponge's state
+ blake2bLyra(state);
- state[0] ^= in[0];
- state[1] ^= in[1];
- state[2] ^= in[2];
- state[3] ^= in[3];
- state[4] ^= in[4];
- state[5] ^= in[5];
- state[6] ^= in[6];
- state[7] ^= in[7];
- state[8] ^= in[8];
- state[9] ^= in[9];
- state[10] ^= in[10];
- state[11] ^= in[11];
+#else
+ //XORs the first BLOCK_LEN_INT64 words of "in" with the current state
+ state[0] ^= in[0];
+ state[1] ^= in[1];
+ state[2] ^= in[2];
+ state[3] ^= in[3];
+ state[4] ^= in[4];
+ state[5] ^= in[5];
+ state[6] ^= in[6];
+ state[7] ^= in[7];
+ state[8] ^= in[8];
+ state[9] ^= in[9];
+ state[10] ^= in[10];
+ state[11] ^= in[11];
+
+ //Applies the transformation f to the sponge's state
+ blake2bLyra(state);
#endif
-
-//Applies the transformation f to the sponge's state
-blake2bLyra(state);
-
}
/**
@@ -263,23 +244,28 @@ blake2bLyra(state);
* @param state The current state of the sponge
* @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
*/
-void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in)
-{
-
-//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
+inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
+ //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
#if defined __AVX2__
- __m256i state_v[2], in_v[2];
+ __m256i state_v[4], in_v[2];
state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
in_v [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) );
state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) );
+ state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
+ state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );
- _mm256_store_si256( (__m256i*)(&state[0]),
- _mm256_xor_si256( state_v[0], in_v[0] ) );
- _mm256_store_si256( (__m256i*)(&state[4]),
- _mm256_xor_si256( state_v[1], in_v[1] ) );
+ state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] );
+ state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] );
+
+ LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );
+
+ _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
+ _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
+ _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
+ _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
#elif defined __AVX__
@@ -304,22 +290,24 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in)
_mm_store_si128( (__m128i*)(&state[6]),
_mm_xor_si128( state_v[3], in_v[3] ) );
+ //Applies the transformation f to the sponge's state
+ blake2bLyra(state);
+
#else
- state[0] ^= in[0];
- state[1] ^= in[1];
- state[2] ^= in[2];
- state[3] ^= in[3];
- state[4] ^= in[4];
- state[5] ^= in[5];
- state[6] ^= in[6];
- state[7] ^= in[7];
+ state[0] ^= in[0];
+ state[1] ^= in[1];
+ state[2] ^= in[2];
+ state[3] ^= in[3];
+ state[4] ^= in[4];
+ state[5] ^= in[5];
+ state[6] ^= in[6];
+ state[7] ^= in[7];
+ //Applies the transformation f to the sponge's state
+ blake2bLyra(state);
#endif
-//Applies the transformation f to the sponge's state
-blake2bLyra(state);
-
}
/**
@@ -330,21 +318,34 @@ blake2bLyra(state);
* @param state The current state of the sponge
* @param rowOut Row to receive the data squeezed
*/
-void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols)
+inline void reducedSqueezeRow0( uint64_t* state, uint64_t* rowOut,
+ uint64_t nCols )
{
- uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
- unsigned int i;
- //M[row][C-1-col] = H.reduced_squeeze()
- for (i = 0; i < nCols; i++)
- {
+ uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
+ int i;
+ //M[row][C-1-col] = H.reduced_squeeze()
+
+#if defined __AVX2__
+ __m256i state_v[4];
+ state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
+ state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
+ state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
+ state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );
+#endif
+
+ for ( i = 0; i < nCols; i++ )
+ {
+
#if defined __AVX2__
- _mm256_storeu_si256( (__m256i*)&ptrWord[0],
- _mm256_load_si256( (__m256i*)(&state[0]) ) );
- _mm256_storeu_si256( (__m256i*)&ptrWord[4],
- _mm256_load_si256( (__m256i*)(&state[4]) ) );
- _mm256_storeu_si256( (__m256i*)&ptrWord[8],
- _mm256_load_si256( (__m256i*)(&state[8]) ) );
+ _mm256_storeu_si256( (__m256i*)&ptrWord[0], state_v[0] );
+ _mm256_storeu_si256( (__m256i*)&ptrWord[4], state_v[1] );
+ _mm256_storeu_si256( (__m256i*)&ptrWord[8], state_v[2] );
+
+ //Goes to next block (column) that will receive the squeezed data
+ ptrWord -= BLOCK_LEN_INT64;
+
+ LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );
#elif defined __AVX__
@@ -361,28 +362,42 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols)
_mm_store_si128( (__m128i*)(&ptrWord[10]),
_mm_load_si128( (__m128i*)(&state[10]) ) );
-#else
+ //Goes to next block (column) that will receive the squeezed data
+ ptrWord -= BLOCK_LEN_INT64;
- ptrWord[0] = state[0];
- ptrWord[1] = state[1];
- ptrWord[2] = state[2];
- ptrWord[3] = state[3];
- ptrWord[4] = state[4];
- ptrWord[5] = state[5];
- ptrWord[6] = state[6];
- ptrWord[7] = state[7];
- ptrWord[8] = state[8];
- ptrWord[9] = state[9];
- ptrWord[10] = state[10];
- ptrWord[11] = state[11];
+ //Applies the reduced-round transformation f to the sponge's state
+ reducedBlake2bLyra(state);
+
+#else
+ ptrWord[0] = state[0];
+ ptrWord[1] = state[1];
+ ptrWord[2] = state[2];
+ ptrWord[3] = state[3];
+ ptrWord[4] = state[4];
+ ptrWord[5] = state[5];
+ ptrWord[6] = state[6];
+ ptrWord[7] = state[7];
+ ptrWord[8] = state[8];
+ ptrWord[9] = state[9];
+ ptrWord[10] = state[10];
+ ptrWord[11] = state[11];
+
+ //Goes to next block (column) that will receive the squeezed data
+ ptrWord -= BLOCK_LEN_INT64;
+
+ //Applies the reduced-round transformation f to the sponge's state
+ reducedBlake2bLyra(state);
#endif
- //Goes to next block (column) that will receive the squeezed data
- ptrWord -= BLOCK_LEN_INT64;
-
- //Applies the reduced-round transformation f to the sponge's state
- reducedBlake2bLyra(state);
}
+
+#if defined __AVX2__
+ _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
+ _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
+ _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
+ _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
+#endif
+
}
/**
@@ -394,34 +409,43 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols)
* @param rowIn Row to feed the sponge
* @param rowOut Row to receive the sponge's output
*/
-void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols)
+inline void reducedDuplexRow1( uint64_t *state, uint64_t *rowIn,
+ uint64_t *rowOut, uint64_t nCols )
{
- uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
- uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
- unsigned int i;
-
- for (i = 0; i < nCols; i++)
- {
- //Absorbing "M[prev][col]"
- #if defined __AVX2__
-
- __m256i state_v[3], in_v[3];
+ uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
+ uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+ int i;
+#if defined __AVX2__
+ __m256i state_v[4], in_v[3];
state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
- in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
- in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
+ state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );
+#endif
+
+ for ( i = 0; i < nCols; i++ )
+ {
+#if defined __AVX2__
+
+ in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
+ in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
- _mm256_store_si256( (__m256i*)(&state[0]),
+ state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] );
+ state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] );
+ state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] );
+
+ LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );
+
+ _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
_mm256_xor_si256( state_v[0], in_v[0] ) );
- _mm256_store_si256( (__m256i*)(&state[4]),
+ _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
_mm256_xor_si256( state_v[1], in_v[1] ) );
- _mm256_store_si256( (__m256i*)(&state[8]),
+ _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
_mm256_xor_si256( state_v[2], in_v[2] ) );
- #elif defined __AVX__
+#elif defined __AVX__
__m128i state_v[6], in_v[6];
@@ -452,29 +476,31 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const
_mm_store_si128( (__m128i*)(&state[10]),
_mm_xor_si128( state_v[5], in_v[5] ) );
- #else
+ //Applies the reduced-round transformation f to the sponge's state
+ reducedBlake2bLyra(state);
- state[0] ^= (ptrWordIn[0]);
- state[1] ^= (ptrWordIn[1]);
- state[2] ^= (ptrWordIn[2]);
- state[3] ^= (ptrWordIn[3]);
- state[4] ^= (ptrWordIn[4]);
- state[5] ^= (ptrWordIn[5]);
- state[6] ^= (ptrWordIn[6]);
- state[7] ^= (ptrWordIn[7]);
- state[8] ^= (ptrWordIn[8]);
- state[9] ^= (ptrWordIn[9]);
- state[10] ^= (ptrWordIn[10]);
- state[11] ^= (ptrWordIn[11]);
+#else
- #endif
+ //Absorbing "M[prev][col]"
+ state[0] ^= (ptrWordIn[0]);
+ state[1] ^= (ptrWordIn[1]);
+ state[2] ^= (ptrWordIn[2]);
+ state[3] ^= (ptrWordIn[3]);
+ state[4] ^= (ptrWordIn[4]);
+ state[5] ^= (ptrWordIn[5]);
+ state[6] ^= (ptrWordIn[6]);
+ state[7] ^= (ptrWordIn[7]);
+ state[8] ^= (ptrWordIn[8]);
+ state[9] ^= (ptrWordIn[9]);
+ state[10] ^= (ptrWordIn[10]);
+ state[11] ^= (ptrWordIn[11]);
- //Applies the reduced-round transformation f to the sponge's state
- reducedBlake2bLyra(state);
+ //Applies the reduced-round transformation f to the sponge's state
+ reducedBlake2bLyra(state);
+#endif
- //M[row][C-1-col] = M[prev][col] XOR rand
#if defined __AVX2__
- state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
+/* state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
@@ -484,7 +510,7 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const
_mm256_xor_si256( state_v[1], in_v[1] ) );
_mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
_mm256_xor_si256( state_v[2], in_v[2] ) );
-
+*/
#elif defined __AVX__
state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
@@ -509,25 +535,34 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const
#else
- ptrWordOut[0] = ptrWordIn[0] ^ state[0];
- ptrWordOut[1] = ptrWordIn[1] ^ state[1];
- ptrWordOut[2] = ptrWordIn[2] ^ state[2];
- ptrWordOut[3] = ptrWordIn[3] ^ state[3];
- ptrWordOut[4] = ptrWordIn[4] ^ state[4];
- ptrWordOut[5] = ptrWordIn[5] ^ state[5];
- ptrWordOut[6] = ptrWordIn[6] ^ state[6];
- ptrWordOut[7] = ptrWordIn[7] ^ state[7];
- ptrWordOut[8] = ptrWordIn[8] ^ state[8];
- ptrWordOut[9] = ptrWordIn[9] ^ state[9];
- ptrWordOut[10] = ptrWordIn[10] ^ state[10];
- ptrWordOut[11] = ptrWordIn[11] ^ state[11];
- #endif
+ //M[row][C-1-col] = M[prev][col] XOR rand
+ ptrWordOut[0] = ptrWordIn[0] ^ state[0];
+ ptrWordOut[1] = ptrWordIn[1] ^ state[1];
+ ptrWordOut[2] = ptrWordIn[2] ^ state[2];
+ ptrWordOut[3] = ptrWordIn[3] ^ state[3];
+ ptrWordOut[4] = ptrWordIn[4] ^ state[4];
+ ptrWordOut[5] = ptrWordIn[5] ^ state[5];
+ ptrWordOut[6] = ptrWordIn[6] ^ state[6];
+ ptrWordOut[7] = ptrWordIn[7] ^ state[7];
+ ptrWordOut[8] = ptrWordIn[8] ^ state[8];
+ ptrWordOut[9] = ptrWordIn[9] ^ state[9];
+ ptrWordOut[10] = ptrWordIn[10] ^ state[10];
+ ptrWordOut[11] = ptrWordIn[11] ^ state[11];
+#endif
+
+ //Input: next column (i.e., next block in sequence)
+ ptrWordIn += BLOCK_LEN_INT64;
+ //Output: goes to previous column
+ ptrWordOut -= BLOCK_LEN_INT64;
+ }
+
+#if defined __AVX2__
+ _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
+ _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
+ _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
+ _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
+#endif
- //Input: next column (i.e., next block in sequence)
- ptrWordIn += BLOCK_LEN_INT64;
- //Output: goes to previous column
- ptrWordOut -= BLOCK_LEN_INT64;
- }
}
/**
@@ -544,46 +579,86 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const
* @param rowOut Row receiving the output
*
*/
-void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols)
+inline void reducedDuplexRowSetup( uint64_t *state, uint64_t *rowIn,
+ uint64_t *rowInOut, uint64_t *rowOut,
+ uint64_t nCols )
{
- uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
- uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
- uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
- unsigned int i;
+ uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
+ uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
+ uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+ int i;
- for (i = 0; i < nCols; i++)
- {
- //Absorbing "M[prev] [+] M[row*]"
- #if defined __AVX2__
+#if defined __AVX2__
+ __m256i state_v[4], in_v[3], inout_v[3];
+ #define t_state in_v
- __m256i state_v[3], in_v[3], inout_v[3];
+ state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
+ state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
+ state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
+ state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );
- state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
+ for ( i = 0; i < nCols; i++ )
+ {
in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) );
- state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) );
- state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) );
- _mm256_store_si256( (__m256i*)(&state[0]),
- _mm256_xor_si256( state_v[0],
- _mm256_add_epi64( in_v[0],
- inout_v[0] ) ) );
- _mm256_store_si256( (__m256i*)(&state[4]),
- _mm256_xor_si256( state_v[1],
- _mm256_add_epi64( in_v[1],
- inout_v[1] ) ) );
- _mm256_store_si256( (__m256i*)(&state[8]),
- _mm256_xor_si256( state_v[2],
- _mm256_add_epi64( in_v[2],
- inout_v[2] ) ) );
- #elif defined __AVX__
+ state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0],
+ inout_v[0] ) );
+ state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1],
+ inout_v[1] ) );
+ state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2],
+ inout_v[2] ) );
+
+ LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );
+
+ _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
+ _mm256_xor_si256( state_v[0], in_v[0] ) );
+ _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
+ _mm256_xor_si256( state_v[1], in_v[1] ) );
+ _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
+ _mm256_xor_si256( state_v[2], in_v[2] ) );
+
+ //M[row*][col] = M[row*][col] XOR rotW(rand)
+ t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 );
+ t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 );
+ t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 );
+
+ inout_v[0] = _mm256_xor_si256( inout_v[0],
+ _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) );
+ inout_v[1] = _mm256_xor_si256( inout_v[1],
+ _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) );
+ inout_v[2] = _mm256_xor_si256( inout_v[2],
+ _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) );
+
+ _mm256_storeu_si256( (__m256i*)&ptrWordInOut[0], inout_v[0] );
+ _mm256_storeu_si256( (__m256i*)&ptrWordInOut[4], inout_v[1] );
+ _mm256_storeu_si256( (__m256i*)&ptrWordInOut[8], inout_v[2] );
+
+ //Inputs: next column (i.e., next block in sequence)
+ ptrWordInOut += BLOCK_LEN_INT64;
+ ptrWordIn += BLOCK_LEN_INT64;
+ //Output: goes to previous column
+ ptrWordOut -= BLOCK_LEN_INT64;
+ }
+
+ _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
+ _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
+ _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
+ _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
+
+ #undef t_state
+
+#elif defined __AVX__
__m128i state_v[6], in_v[6], inout_v[6];
+ for ( i = 0; i < nCols; i++ )
+ {
+
state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
@@ -630,39 +705,34 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut,
_mm_add_epi64( in_v[5],
inout_v[5] ) ) );
- #else
+ //Applies the reduced-round transformation f to the sponge's state
+ reducedBlake2bLyra(state);
+#else
+ for ( i = 0; i < nCols; i++ )
+ {
- state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
- state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
- state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]);
- state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]);
- state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]);
- state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]);
- state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]);
- state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]);
- state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]);
- state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]);
- state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
- state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
- #endif
+ //Absorbing "M[prev] [+] M[row*]"
+ state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
+ state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
+ state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]);
+ state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]);
+ state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]);
+ state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]);
+ state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]);
+ state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]);
+ state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]);
+ state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]);
+ state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
+ state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
+ //Applies the reduced-round transformation f to the sponge's state
+ reducedBlake2bLyra(state);
+
+ //M[row][col] = M[prev][col] XOR rand
+#endif
- //Applies the reduced-round transformation f to the sponge's state
- reducedBlake2bLyra(state);
- //M[row][col] = M[prev][col] XOR rand
#if defined __AVX2__
- state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
- state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
- state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
-
- _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
- _mm256_xor_si256( state_v[0], in_v[0] ) );
- _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
- _mm256_xor_si256( state_v[1], in_v[1] ) );
- _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
- _mm256_xor_si256( state_v[2], in_v[2] ) );
-
#elif defined __AVX__
state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
@@ -687,40 +757,49 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut,
#else
- ptrWordOut[0] = ptrWordIn[0] ^ state[0];
- ptrWordOut[1] = ptrWordIn[1] ^ state[1];
- ptrWordOut[2] = ptrWordIn[2] ^ state[2];
- ptrWordOut[3] = ptrWordIn[3] ^ state[3];
- ptrWordOut[4] = ptrWordIn[4] ^ state[4];
- ptrWordOut[5] = ptrWordIn[5] ^ state[5];
- ptrWordOut[6] = ptrWordIn[6] ^ state[6];
- ptrWordOut[7] = ptrWordIn[7] ^ state[7];
- ptrWordOut[8] = ptrWordIn[8] ^ state[8];
- ptrWordOut[9] = ptrWordIn[9] ^ state[9];
- ptrWordOut[10] = ptrWordIn[10] ^ state[10];
- ptrWordOut[11] = ptrWordIn[11] ^ state[11];
- #endif
+ ptrWordOut[0] = ptrWordIn[0] ^ state[0];
+ ptrWordOut[1] = ptrWordIn[1] ^ state[1];
+ ptrWordOut[2] = ptrWordIn[2] ^ state[2];
+ ptrWordOut[3] = ptrWordIn[3] ^ state[3];
+ ptrWordOut[4] = ptrWordIn[4] ^ state[4];
+ ptrWordOut[5] = ptrWordIn[5] ^ state[5];
+ ptrWordOut[6] = ptrWordIn[6] ^ state[6];
+ ptrWordOut[7] = ptrWordIn[7] ^ state[7];
+ ptrWordOut[8] = ptrWordIn[8] ^ state[8];
+ ptrWordOut[9] = ptrWordIn[9] ^ state[9];
+ ptrWordOut[10] = ptrWordIn[10] ^ state[10];
+ ptrWordOut[11] = ptrWordIn[11] ^ state[11];
+#endif
- //M[row*][col] = M[row*][col] XOR rotW(rand)
- ptrWordInOut[0] ^= state[11];
- ptrWordInOut[1] ^= state[0];
- ptrWordInOut[2] ^= state[1];
- ptrWordInOut[3] ^= state[2];
- ptrWordInOut[4] ^= state[3];
- ptrWordInOut[5] ^= state[4];
- ptrWordInOut[6] ^= state[5];
- ptrWordInOut[7] ^= state[6];
- ptrWordInOut[8] ^= state[7];
- ptrWordInOut[9] ^= state[8];
- ptrWordInOut[10] ^= state[9];
- ptrWordInOut[11] ^= state[10];
+ //M[row*][col] = M[row*][col] XOR rotW(rand)
+// Need to fix this before taking state load/store out of loop
+#ifdef __AVX2__
+
+
+#else
+
+ ptrWordInOut[0] ^= state[11];
+ ptrWordInOut[1] ^= state[0];
+ ptrWordInOut[2] ^= state[1];
+ ptrWordInOut[3] ^= state[2];
+ ptrWordInOut[4] ^= state[3];
+ ptrWordInOut[5] ^= state[4];
+ ptrWordInOut[6] ^= state[5];
+ ptrWordInOut[7] ^= state[6];
+ ptrWordInOut[8] ^= state[7];
+ ptrWordInOut[9] ^= state[8];
+ ptrWordInOut[10] ^= state[9];
+ ptrWordInOut[11] ^= state[10];
+
+ //Inputs: next column (i.e., next block in sequence)
+ ptrWordInOut += BLOCK_LEN_INT64;
+ ptrWordIn += BLOCK_LEN_INT64;
+ //Output: goes to previous column
+ ptrWordOut -= BLOCK_LEN_INT64;
+ }
+
+#endif
- //Inputs: next column (i.e., next block in sequence)
- ptrWordInOut += BLOCK_LEN_INT64;
- ptrWordIn += BLOCK_LEN_INT64;
- //Output: goes to previous column
- ptrWordOut -= BLOCK_LEN_INT64;
- }
}
/**
@@ -737,22 +816,26 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut,
* @param rowOut Row receiving the output
*
*/
-void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols)
+inline void reducedDuplexRow( uint64_t *state, uint64_t *rowIn,
+ uint64_t *rowInOut, uint64_t *rowOut,
+ uint64_t nCols )
{
- uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
- uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
- uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
- unsigned int i;
+ uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
+ uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
+ uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
+ int i;
- for (i = 0; i < nCols; i++)
- {
- //Absorbing "M[prev] [+] M[row*]"
- #if defined __AVX2__
+#if defined __AVX2__
- __m256i state_v[3], in_v[3], inout_v[3];
+ for ( i = 0; i < nCols; i++)
+ {
+
+ //Absorbing "M[prev] [+] M[row*]"
+
+ __m256i state_v[4], in_v[3], inout_v[3];
#define out_v in_v // reuse register in next code block
-
+ #define t_state in_v
state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) );
@@ -762,21 +845,82 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) );
+ state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );
+
+ state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0],
+ inout_v[0] ) );
+ state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1],
+ inout_v[1] ) );
+ state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2],
+ inout_v[2] ) );
+
+ out_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
+ out_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) );
+ out_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) );
+
+ LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );
+
+ _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
+ _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
+ _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
+ _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
+
+ _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
+ _mm256_xor_si256( state_v[0], out_v[0] ) );
+ _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
+ _mm256_xor_si256( state_v[1], out_v[1] ) );
+ _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
+ _mm256_xor_si256( state_v[2], out_v[2] ) );
+
+/*
+ t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 );
+ t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 );
+ t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 );
+
+ inout_v[0] = _mm256_xor_si256( inout_v[0],
+ _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) );
+ inout_v[1] = _mm256_xor_si256( inout_v[1],
+ _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) );
+ inout_v[2] = _mm256_xor_si256( inout_v[2],
+ _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) );
+
+ _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[0]), inout_v[0] );
+ _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[4]), inout_v[1] );
+ _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[8]), inout_v[2] );
+
+ _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
+ _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
+ _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
+ _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
+*/
+ #undef out_v
+ #undef t_state
+
+ //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+ ptrWordInOut[0] ^= state[11];
+ ptrWordInOut[1] ^= state[0];
+ ptrWordInOut[2] ^= state[1];
+ ptrWordInOut[3] ^= state[2];
+ ptrWordInOut[4] ^= state[3];
+ ptrWordInOut[5] ^= state[4];
+ ptrWordInOut[6] ^= state[5];
+ ptrWordInOut[7] ^= state[6];
+ ptrWordInOut[8] ^= state[7];
+ ptrWordInOut[9] ^= state[8];
+ ptrWordInOut[10] ^= state[9];
+ ptrWordInOut[11] ^= state[10];
+
+ //Goes to next block
+ ptrWordOut += BLOCK_LEN_INT64;
+ ptrWordInOut += BLOCK_LEN_INT64;
+ ptrWordIn += BLOCK_LEN_INT64;
+ }
- _mm256_store_si256( (__m256i*)(&state[0]),
- _mm256_xor_si256( state_v[0],
- _mm256_add_epi64( in_v[0],
- inout_v[0] ) ) );
- _mm256_store_si256( (__m256i*)(&state[4]),
- _mm256_xor_si256( state_v[1],
- _mm256_add_epi64( in_v[1],
- inout_v[1] ) ) );
- _mm256_store_si256( (__m256i*)(&state[8]),
- _mm256_xor_si256( state_v[2],
- _mm256_add_epi64( in_v[2],
- inout_v[2] ) ) );
#elif defined __AVX__
+ for ( i = 0; i < nCols; i++)
+ {
+
__m128i state_v[6], in_v[6], inout_v[6];
#define out_v in_v // reuse register in next code block
@@ -826,28 +970,35 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
_mm_add_epi64( in_v[5],
inout_v[5] ) ) );
- #else
-
- state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
- state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
- state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]);
- state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]);
- state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]);
- state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]);
- state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]);
- state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]);
- state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]);
- state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]);
- state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
- state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
- #endif
-
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
- //M[rowOut][col] = M[rowOut][col] XOR rand
- #if defined __AVX2__
+ #else
+ for ( i = 0; i < nCols; i++)
+ {
+
+ state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
+ state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
+ state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]);
+ state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]);
+ state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]);
+ state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]);
+ state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]);
+ state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]);
+ state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]);
+ state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]);
+ state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
+ state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
+
+ //Applies the reduced-round transformation f to the sponge's state
+ reducedBlake2bLyra(state);
+#endif
+
+ //M[rowOut][col] = M[rowOut][col] XOR rand
+
+ #if defined __AVX2__
+/*
state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
out_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
@@ -861,7 +1012,7 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
_mm256_xor_si256( state_v[1], out_v[1] ) );
_mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
_mm256_xor_si256( state_v[2], out_v[2] ) );
-
+*/
#elif defined __AVX__
state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
@@ -891,22 +1042,39 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
_mm_store_si128( (__m128i*)(&ptrWordOut[10]),
_mm_xor_si128( state_v[5], out_v[5] ) );
- #else
+ //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+ ptrWordInOut[0] ^= state[11];
+ ptrWordInOut[1] ^= state[0];
+ ptrWordInOut[2] ^= state[1];
+ ptrWordInOut[3] ^= state[2];
+ ptrWordInOut[4] ^= state[3];
+ ptrWordInOut[5] ^= state[4];
+ ptrWordInOut[6] ^= state[5];
+ ptrWordInOut[7] ^= state[6];
+ ptrWordInOut[8] ^= state[7];
+ ptrWordInOut[9] ^= state[8];
+ ptrWordInOut[10] ^= state[9];
+ ptrWordInOut[11] ^= state[10];
- ptrWordOut[0] ^= state[0];
- ptrWordOut[1] ^= state[1];
- ptrWordOut[2] ^= state[2];
- ptrWordOut[3] ^= state[3];
- ptrWordOut[4] ^= state[4];
- ptrWordOut[5] ^= state[5];
- ptrWordOut[6] ^= state[6];
- ptrWordOut[7] ^= state[7];
- ptrWordOut[8] ^= state[8];
- ptrWordOut[9] ^= state[9];
- ptrWordOut[10] ^= state[10];
- ptrWordOut[11] ^= state[11];
+ //Goes to next block
+ ptrWordOut += BLOCK_LEN_INT64;
+ ptrWordInOut += BLOCK_LEN_INT64;
+ ptrWordIn += BLOCK_LEN_INT64;
+ }
- #endif
+#else
+ ptrWordOut[0] ^= state[0];
+ ptrWordOut[1] ^= state[1];
+ ptrWordOut[2] ^= state[2];
+ ptrWordOut[3] ^= state[3];
+ ptrWordOut[4] ^= state[4];
+ ptrWordOut[5] ^= state[5];
+ ptrWordOut[6] ^= state[6];
+ ptrWordOut[7] ^= state[7];
+ ptrWordOut[8] ^= state[8];
+ ptrWordOut[9] ^= state[9];
+ ptrWordOut[10] ^= state[10];
+ ptrWordOut[11] ^= state[11];
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[11];
@@ -922,24 +1090,10 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
ptrWordInOut[10] ^= state[9];
ptrWordInOut[11] ^= state[10];
- //Goes to next block
- ptrWordOut += BLOCK_LEN_INT64;
- ptrWordInOut += BLOCK_LEN_INT64;
- ptrWordIn += BLOCK_LEN_INT64;
- }
+ //Goes to next block
+ ptrWordOut += BLOCK_LEN_INT64;
+ ptrWordInOut += BLOCK_LEN_INT64;
+ ptrWordIn += BLOCK_LEN_INT64;
+ }
+#endif
}
-
-/**
- * Prints an array of unsigned chars
- */
-void printArray(unsigned char *array, unsigned int size, char *name)
-{
- unsigned int i;
- printf("%s: ", name);
- for (i = 0; i < size; i++) {
- printf("%2x|", array[i]);
- }
- printf("\n");
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////
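
For reference, the scalar branch above boils down to this standalone sketch of
one duplex column step. BLOCK_LEN_INT64 matches the 12-word blocks used above;
reduced_round is a stand-in for reducedBlake2bLyra (one Blake2b round in the
reference Lyra2 code), not the real sponge.c symbol.

#include <stdint.h>

#define BLOCK_LEN_INT64 12

static void duplex_column_step( uint64_t state[16], uint64_t *prev,
                                uint64_t *rowInOut, uint64_t *rowOut,
                                void (*reduced_round)( uint64_t s[16] ) )
{
   int j;
   // Absorbing "M[prev] [+] M[row*]"
   for ( j = 0; j < BLOCK_LEN_INT64; j++ )
      state[j] ^= prev[j] + rowInOut[j];
   // Applies the reduced-round transformation f to the sponge's state
   reduced_round( state );
   // M[rowOut][col] = M[rowOut][col] XOR rand
   for ( j = 0; j < BLOCK_LEN_INT64; j++ )
      rowOut[j] ^= state[j];
   // M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand), a one-word rotation
   rowInOut[0] ^= state[11];
   for ( j = 1; j < BLOCK_LEN_INT64; j++ )
      rowInOut[j] ^= state[j-1];
}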
diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h
index 5b38dc7..8f6cb4c 100644
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -23,42 +23,34 @@
#define SPONGE_H_
#include <stdint.h>
+#include "avxdefs.h"
-/* Blake2b IV Array */
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+
+/*Blake2b IV Array*/
static const uint64_t blake2b_IV[8] =
{
- 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
- 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
- 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
- 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+ 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+ 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+ 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+ 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
-/* Blake2b's rotation */
-static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
-#ifdef _MSC_VER
- return _rotr64(w, c);
-#else
- return ( w >> c ) | ( w << ( 64 - c ) );
-#endif
+/*Blake2b's rotation*/
+static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
+ return ( w >> c ) | ( w << ( 64 - c ) );
}
#if defined __AVX2__
// only available with avx2
-// rotate each uint64 c bits
-// returns _m256i
-#define mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \
- _mm256_slli_epi64(w, 64 - c))
-
-// Rotate 4 uint64 (256 bits) by one uint64 (64 bits)
-// returns __m256i
-#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 )
-#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 )
-
-// swap hi and lo 128 bits in 256 bit vector
-// returns _m256i
-#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 )
-
// init vectors from memory
// returns void, updates defines and inits implicit args a, b, c, d
#define LYRA_INIT_AVX2 \
@@ -88,15 +80,29 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
c = _mm256_add_epi64( c, d ); \
b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 );
-#define LYRA_ROUND_AVX2 \
- G_4X64( a[0], a[1], a[2], a[3] ); \
- a[1] = mm256_rotl256_1x64( a[1]); \
- a[2] = mm256_swap128( a[2] ); \
- a[3] = mm256_rotr256_1x64( a[3] ); \
- G_4X64( a[0], a[1], a[2], a[3] ); \
- a[1] = mm256_rotr256_1x64( a[1] ); \
- a[2] = mm256_swap128( a[2] ); \
- a[3] = mm256_rotl256_1x64( a[3] );
+#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ G_4X64( s0, s1, s2, s3 ); \
+ s1 = mm256_rotl256_1x64( s1); \
+ s2 = mm256_swap128( s2 ); \
+ s3 = mm256_rotr256_1x64( s3 ); \
+ G_4X64( s0, s1, s2, s3 ); \
+ s1 = mm256_rotr256_1x64( s1 ); \
+ s2 = mm256_swap128( s2 ); \
+ s3 = mm256_rotl256_1x64( s3 );
+
+#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+ LYRA_ROUND_AVX2( s0, s1, s2, s3 )
#else
// only available with avx
@@ -148,6 +154,7 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
#endif // AVX2
+/*
#if defined __AVX__
// can coexist with AVX2
@@ -161,7 +168,7 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
#define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
s1 = _mm_xor_si128(s0, s1); \
s0 = _mm_xor_si128(s0, s1);
-
+
// swap uint64 in 128 bit source vector, equivalent of rotating 128 bits by
// 64 bits (8 bytes)
// __m128i
@@ -193,30 +200,33 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
} while(0)
#endif // AVX
+*/
-/* Blake2b's G function */
-#define G(r,i,a,b,c,d) do { \
- a = a + b; \
- d = rotr64(d ^ a, 32); \
- c = c + d; \
- b = rotr64(b ^ c, 24); \
- a = a + b; \
- d = rotr64(d ^ a, 16); \
- c = c + d; \
- b = rotr64(b ^ c, 63); \
+// Scalar
+//Blake2b's G function
+#define G(r,i,a,b,c,d) \
+ do { \
+ a = a + b; \
+ d = rotr64(d ^ a, 32); \
+ c = c + d; \
+ b = rotr64(b ^ c, 24); \
+ a = a + b; \
+ d = rotr64(d ^ a, 16); \
+ c = c + d; \
+ b = rotr64(b ^ c, 63); \
} while(0)
/*One Round of the Blake2b's compression function*/
-#define ROUND_LYRA(r) \
- G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
- G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
- G(r,2,v[ 2],v[ 6],v[10],v[14]); \
- G(r,3,v[ 3],v[ 7],v[11],v[15]); \
- G(r,4,v[ 0],v[ 5],v[10],v[15]); \
- G(r,5,v[ 1],v[ 6],v[11],v[12]); \
- G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
- G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
+#define ROUND_LYRA(r) \
+ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+ G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+ G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+ G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+ G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+ G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
//---- Housekeeping
@@ -224,18 +234,31 @@ void initState(uint64_t state[/*16*/]);
//---- Squeezes
void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
-void reducedSqueezeRow0(uint64_t* state, uint64_t* row, const uint32_t nCols);
+void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);
//---- Absorbs
void absorbBlock(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
//---- Duplexes
-void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols);
-void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols);
-void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols);
+void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
//---- Misc
void printArray(unsigned char *array, unsigned int size, char *name);
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////TESTS////
+//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
+//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+/////////////
+
+
#endif /* SPONGE_H_ */
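
The scalar rotr64/G/ROUND_LYRA definitions above are self-contained enough to
exercise in isolation. A minimal sketch: the macros are copied verbatim from
this header; the test state values are arbitrary.

#include <stdint.h>
#include <stdio.h>

static inline uint64_t rotr64( const uint64_t w, const unsigned c )
{
   return ( w >> c ) | ( w << ( 64 - c ) );
}

#define G(r,i,a,b,c,d) \
   do { \
      a = a + b; \
      d = rotr64(d ^ a, 32); \
      c = c + d; \
      b = rotr64(b ^ c, 24); \
      a = a + b; \
      d = rotr64(d ^ a, 16); \
      c = c + d; \
      b = rotr64(b ^ c, 63); \
   } while(0)

#define ROUND_LYRA(r) \
   G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
   G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
   G(r,2,v[ 2],v[ 6],v[10],v[14]); \
   G(r,3,v[ 3],v[ 7],v[11],v[15]); \
   G(r,4,v[ 0],v[ 5],v[10],v[15]); \
   G(r,5,v[ 1],v[ 6],v[11],v[12]); \
   G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
   G(r,7,v[ 3],v[ 4],v[ 9],v[14]);

int main(void)
{
   uint64_t v[16];
   for ( int i = 0; i < 16; i++ ) v[i] = i;  // arbitrary test input
   ROUND_LYRA(0);                            // one reduced round
   for ( int i = 0; i < 16; i++ )
      printf( "%016llx\n", (unsigned long long)v[i] );
   return 0;
}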
diff --git a/algo/lyra2/zcoin.c b/algo/lyra2/zcoin.c
new file mode 100644
index 0000000..2c7a747
--- /dev/null
+++ b/algo/lyra2/zcoin.c
@@ -0,0 +1,77 @@
+#include <string.h>
+#include "miner.h"
+#include "algo-gate-api.h"
+#include "lyra2.h"
+
+void zcoin_hash(void *state, const void *input, uint32_t height)
+{
+
+ uint32_t _ALIGN(256) hash[16];
+
+ LYRA2Z(hash, 32, input, 80, input, 80, 2, height, 256);
+
+ memcpy(state, hash, 32);
+}
+
+//int scanhash_zcoin(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t height)
+int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
+ uint64_t *hashes_done )
+{
+ uint32_t _ALIGN(128) hash[8];
+ uint32_t _ALIGN(128) endiandata[20];
+ uint32_t *pdata = work->data;
+ uint32_t *ptarget = work->target;
+ const uint32_t Htarg = ptarget[7];
+ const uint32_t first_nonce = pdata[19];
+ uint32_t nonce = first_nonce;
+ if (opt_benchmark)
+ ptarget[7] = 0x0000ff;
+
+ for (int i=0; i < 19; i++) {
+ be32enc(&endiandata[i], pdata[i]);
+ }
+
+ do {
+ be32enc(&endiandata[19], nonce);
+ zcoin_hash( hash, endiandata, work->height );
+
+ if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+ work_set_target_ratio(work, hash);
+ pdata[19] = nonce;
+ *hashes_done = pdata[19] - first_nonce;
+ return 1;
+ }
+ nonce++;
+
+ } while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+ pdata[19] = nonce;
+ *hashes_done = pdata[19] - first_nonce + 1;
+ return 0;
+}
+
+int64_t get_max64_0xffffLL() { return 0xffffLL; }
+
+void zcoin_set_target( struct work* work, double job_diff )
+{
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
+{
+ work->height = sctx->bloc_height;
+ return false;
+}
+
+bool register_zcoin_algo( algo_gate_t* gate )
+{
+ gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+ gate->scanhash = (void*)&scanhash_zcoin;
+ gate->hash = (void*)&zcoin_hash;
+ gate->hash_alt = (void*)&zcoin_hash;
+ gate->get_max64 = (void*)&get_max64_0xffffLL;
+ gate->set_target = (void*)&zcoin_set_target;
+ gate->prevent_dupes = (void*)&zcoin_get_work_height;
+ return true;
+}
+
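
A standalone mock of the scanhash loop shape above, for illustration only:
dummy_hash and the hard-coded target stand in for zcoin_hash and the real
work target; the actual driver lives in cpu-miner.c.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// stand-in for zcoin_hash(); not the real Lyra2Z
static void dummy_hash( uint32_t hash[8], const uint32_t data[20] )
{
   memset( hash, 0xff, 32 );
   hash[7] = data[19] & 0x00ffffff;
}

int main(void)
{
   uint32_t data[20] = {0}, target[8] = {0}, hash[8];
   target[7] = 0x0000ff;              // benchmark-style target word
   for ( uint32_t nonce = 0; nonce < 0x10000; nonce++ )
   {
      data[19] = nonce;               // nonce lives in data[19]
      dummy_hash( hash, data );
      // cheap word test before the full 256-bit compare (fulltest above)
      if ( hash[7] <= target[7] )
      {
         printf( "solved at nonce %u\n", nonce );  // trivially nonce 0 here
         return 0;
      }
   }
   return 1;
}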
diff --git a/algo/pluck.c b/algo/pluck.c
index 30224d9..b8de21e 100644
--- a/algo/pluck.c
+++ b/algo/pluck.c
@@ -487,24 +487,19 @@ int64_t pluck_get_max64 ()
return 0x1ffLL;
}
-void pluck_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
-bool pluck_miner_thread_init( )
+bool pluck_miner_thread_init( int thr_id )
{
scratchbuf = malloc( 128 * 1024 );
if ( scratchbuf )
return true;
- applog( LOG_ERR, "Pluck buffer allocation failed");
+ applog( LOG_ERR, "Thread %u: Pluck buffer allocation failed", thr_id );
return false;
}
bool register_pluck_algo( algo_gate_t* gate )
{
algo_not_tested();
- gate->miner_thread_init = (void*)& pluck_miner_thread_init;
+ gate->miner_thread_init = (void*)&pluck_miner_thread_init;
gate->scanhash = (void*)&scanhash_pluck;
gate->hash = (void*)&pluck_hash;
gate->set_target = (void*)&scrypt_set_target;
diff --git a/algo/scrypt.c b/algo/scrypt.c
index 31c6c09..fedc9a7 100644
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -774,12 +774,12 @@ int64_t scrypt_get_max64()
return max64;
}
-bool scrypt_miner_thread_init()
+bool scrypt_miner_thread_init( int thr_id )
{
scratchbuf = scrypt_buffer_alloc( opt_scrypt_n );
if ( scratchbuf )
return true;
- applog( LOG_ERR, "Scrypt buffer allocation failed" );
+ applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id );
return false;
}
diff --git a/algo/scryptjane/scrypt-jane.c b/algo/scryptjane/scrypt-jane.c
index 8b87134..bcab680 100644
--- a/algo/scryptjane/scrypt-jane.c
+++ b/algo/scryptjane/scrypt-jane.c
@@ -231,11 +231,6 @@ void scryptjanehash(void *output, const void *input )
scrypt_free(&YX);
}
-void scryptjane_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
bool register_scryptjane_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_scryptjane;
diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c
index d6d1f44..a175070 100644
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -49,19 +49,11 @@ int64_t yescrypt_get_max64 ()
return 0x1ffLL;
}
-
-void yescrypt_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
-
bool register_yescrypt_algo ( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->hash_alt = (void*)&yescrypthash;
-// gate->set_target = (void*)&yescrypt_set_target;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&yescrypt_get_max64;
return true;
diff --git a/avxdefs.h b/avxdefs.h
index d3ad5e2..09b077f 100644
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -1,7 +1,7 @@
// Some tools to help using AVX and AVX2
#include <stdint.h>
-//#include <immintrin.h>
+#include <immintrin.h>
// Use these overlays to access the same data in memory as different types
//
@@ -15,7 +15,7 @@
typedef union
{
-#if defined __AVX__
+#if defined __AVX2__
__m256i v256;
#endif
__m128i v128[ 2];
@@ -49,47 +49,85 @@ uint8_t v8 [16];
#if defined __AVX2__
-// Rotate bits in 4 uint64
+// Rotate bits in 4 uint64 (3 instructions)
// __m256i mm256_rotr_64( __256i, int )
-#define mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \
- _mm256_slli_epi64(w, 64 - c))
-//static inline __m256i mm256_rotr_64 ( __m256i w, int c)
-//{
-// return _mm256_or_si256( _mm256_srli_epi64( w, c ),
-// _mm256_slli_epi64( w, 64 - c ) );
-//}
-// Rotate uint64 by one uint64
-//__m256i mm256_rotl256_1x64( _mm256i, int )
-#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 )
-#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 )
-//static inline __m256i mm256_rotl256_1x64( __m256i s )
-//{
-// return _mm256_permute4x64_epi64( s, 0x39 );
-//}
-//static inline __m256i mm256_rotr256_1x64( __m256i s )
-//{
-// return _mm256_permute4x64_epi64( s, 0x93 );
-//}
+#define mm256_rotr_64( w, c ) \
+ _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )
+
+#define mm256_rotl_64( w, c ) \
+ _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )
// swap hi and lo 128 bits in 256 bit vector
// __m256i mm256_swap128( __m256i )
-#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 )
-//static inline __m256i mm256_swap128( __m256i s )
-//{
-// return _mm256_permute2f128_si256( s, s, 1 );
-//}
+#define mm256_swap128( w ) \
+ _mm256_permute2f128_si256( w, w, 1 )
-#endif
+// Rotate 256 bits by 64 bits (4 uint64 by one uint64)
+//__m256i mm256_rotl256_1x64( _mm256i, int )
+#define mm256_rotl256_1x64( w ) \
+ _mm256_permute4x64_epi64( w, 0x39 )
+
+#define mm256_rotr256_1x64( w ) \
+ _mm256_permute4x64_epi64( w, 0x93 )
+
+// shift 256 bits by n*64 bits (4 uint64 by n uint64)
+#define mm256_slli256_1x64( w ) \
+ _mm256_and_si256( mm256_rotl256_1x64( w ), \
+ _mm256_set_epi64x( 0, \
+ 0xffffffffffffffffull, \
+ 0xffffffffffffffffull, \
+ 0xffffffffffffffffull ) )
+// _mm256_set_epi64x( 0xffffffffffffffffull, \
+// 0xffffffffffffffffull, \
+// 0xffffffffffffffffull, \
+// 0 ) )
+
+
+#define mm256_slli256_2x64( w ) \
+ _mm256_and_si256( mm256_swap128( w ), \
+ _mm256_set_epi64x( 0xffffffffffffffffull, \
+ 0xffffffffffffffffull, \
+ 0, \
+ 0 ) )
+
+#define mm256_slli256_3x64( w ) \
+ _mm256_and_si256( mm256_rotr256_1x64( w ), \
+ _mm256_set_epi64x( 0xffffffffffffffffull, \
+ 0, \
+ 0, \
+ 0 ) )
+
+#define mm256_srli256_1x64( w ) \
+ _mm256_and_si256( mm256_rotr256_1x64( w ), \
+ _mm256_set_epi64x( 0, \
+ 0xffffffffffffffffull, \
+ 0xffffffffffffffffull, \
+ 0xffffffffffffffffull ) )
+
+#define mm256_srli256_2x64( w ) \
+ _mm256_and_si256( mm256_swap128( w ), \
+ _mm256_set_epi64x( 0, \
+ 0, \
+ 0xffffffffffffffffull, \
+ 0xffffffffffffffffull ))
+
+#define mm256_srli256_3x64( w ) \
+ _mm256_and_si256( mm256_rotl256_1x64( w ), \
+ _mm256_set_epi64x( 0xffffffffffffffffull, \
+ 0, \
+ 0, \
+ 0 ) )
+// _mm256_set_epi64x( 0, \
+// 0, \
+// 0, \
+// 0xffffffffffffffffull ) )
+
+#endif // AVX2
// rotate bits in 2 uint64
// _m128i mm_rotr_64( __m128i, int )
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
_mm_slli_epi64(w, 64 - c))
-//static inline __m128i mm_rotr_64( __m128i w, int c )
-//{
-// _mm_or_si128( _mm_srli_epi64( w, c ),
-// _mm_slli_epi64( w, 64 - c ) );
-//}
// swap 128 bit source vectors
// void mm128_swap128( __m128i, __m128i )
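
A quick standalone check of what the permute immediates used above do to the
four 64-bit lanes (lane 0 is the low 64 bits; the values are arbitrary lane
tags). Build with -mavx2.

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

static void show( const char *name, __m256i w )
{
   uint64_t r[4];
   _mm256_storeu_si256( (__m256i*)r, w );
   printf( "%s: %llu %llu %llu %llu\n", name,
           (unsigned long long)r[0], (unsigned long long)r[1],
           (unsigned long long)r[2], (unsigned long long)r[3] );
}

int main(void)
{
   __m256i s = _mm256_set_epi64x( 3, 2, 1, 0 );   // lanes 0,1,2,3
   show( "0x39   ", _mm256_permute4x64_epi64( s, 0x39 ) );  // 1 2 3 0
   show( "0x93   ", _mm256_permute4x64_epi64( s, 0x93 ) );  // 3 0 1 2
   show( "swap128", _mm256_permute2f128_si256( s, s, 1 ) ); // 2 3 0 1
   return 0;
}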
diff --git a/configure.ac b/configure.ac
index a3f77eb..52b1278 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.4.8-dev])
+AC_INIT([cpuminer-opt], [3.4.8])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index f055c25..9a057a4 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -79,7 +79,7 @@ bool opt_debug_diff = false;
bool opt_protocol = false;
bool opt_benchmark = false;
bool opt_redirect = true;
-bool opt_showdiff = false;
+bool opt_showdiff = true;
bool opt_extranonce = true;
bool want_longpoll = true;
bool have_longpoll = false;
@@ -306,8 +306,8 @@ static bool work_decode(const json_t *val, struct work *work)
{
if ( !algo_gate.work_decode( val, work ) )
return false;
- if ((opt_showdiff || opt_max_diff > 0.) && !allow_mininginfo)
- algo_gate.calc_network_diff( work );
+ if ( !allow_mininginfo )
+ net_diff = algo_gate.calc_network_diff( work );
work->targetdiff = target_to_diff(work->target);
// for api stats, on longpoll pools
stratum_diff = work->targetdiff;
@@ -781,10 +781,6 @@ static int share_result( int result, struct work *work, const char *reason )
(uint32_t)cpu_temp(0) );
#endif
-// applog(LOG_NOTICE, "accepted: %lu/%lu (%s%%), %s %sH, %s %sH/s %s",
-// accepted_count, total_submits, accepted_rate_s,
-// hc, hc_units, hr, hr_units, sres );
-
if (reason)
{
applog(LOG_WARNING, "reject reason: %s", reason);
@@ -1467,7 +1463,7 @@ void std_build_extraheader( struct work* work, struct stratum_ctx* sctx )
work->data[31] = 0x00000280;
}
-void std_calc_network_diff( struct work* work )
+double std_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
@@ -1477,11 +1473,14 @@ void std_calc_network_diff( struct work* work )
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
- net_diff = (double)0x0000ffff / (double)bits;
+ double d = (double)0x0000ffff / (double)bits;
for ( m = shift; m < 29; m++ )
- net_diff *= 256.0;
+ d *= 256.0;
for ( m = 29; m < shift; m++ )
- net_diff /= 256.0;
+ d /= 256.0;
+ if ( opt_debug_diff )
+ applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
+ return d;
}
uint32_t* std_get_nonceptr( uint32_t *work_data )
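
The nbits-to-difficulty arithmetic above can be checked standalone against the
sample in the comment (nbits 0x1c05ea29, diff ~43.281). nbits_to_diff mirrors
the loop; the exponent byte is passed explicitly to sidestep the swab32 endian
handling.

#include <stdint.h>
#include <stdio.h>

static double nbits_to_diff( uint32_t nbits, int shift )
{
   uint32_t bits = ( nbits & 0xffffff );
   double d = (double)0x0000ffff / (double)bits;
   for ( int m = shift; m < 29; m++ ) d *= 256.0;
   for ( int m = 29; m < shift; m++ ) d /= 256.0;
   return d;
}

int main(void)
{
   // exponent byte 0x1c = 28, mantissa 0x05ea29
   printf( "%f\n", nbits_to_diff( 0x1c05ea29, 0x1c ) );   // ~43.281
   return 0;
}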
@@ -1501,7 +1500,8 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
- && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) )
+ && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
+ || ( work->job_id != g_work->job_id ) ) )
{
work_free( work );
work_copy( work, g_work );
@@ -1518,6 +1518,7 @@ void jr2_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr )
{
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
+
// byte data[ 0..38, 43..75 ], skip over misaligned nonce [39..42]
if ( memcmp( work->data, g_work->data, algo_gate.nonce_index )
|| memcmp( ((uint8_t*) work->data) + JR2_WORK_CMP_INDEX_2,
@@ -1611,9 +1612,9 @@ static void *miner_thread( void *userdata )
}
}
- if ( !algo_gate.miner_thread_init() )
+ if ( !algo_gate.miner_thread_init( thr_id ) )
{
- applog( LOG_ERR, "FAIL: thread %u failed to initialize");
+ applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id );
exit (1);
}
@@ -1660,6 +1661,8 @@ static void *miner_thread( void *userdata )
} // do_this_thread
algo_gate.resync_threads( &work );
+ // prevent_dupes is called on every loop pass and has useful args, so
+ // zcoin uses it to pass along the work height.
if ( algo_gate.prevent_dupes( &work, &stratum, thr_id ) )
continue;
// prevent scans before a job is received
@@ -2075,8 +2078,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
algo_gate.build_extraheader( g_work, sctx );
- if ( opt_showdiff || opt_max_diff > 0. )
- algo_gate.calc_network_diff( g_work );
+ net_diff = algo_gate.calc_network_diff( g_work );
algo_gate.set_work_data_endian( g_work );
pthread_mutex_unlock( &sctx->work_lock );
@@ -2161,6 +2163,7 @@ static void *stratum_thread(void *userdata )
sleep(opt_fail_pause);
}
+
if (jsonrpc_2)
{
work_free(&g_work);
@@ -2168,15 +2171,15 @@ static void *stratum_thread(void *userdata )
}
}
- if (stratum.job.job_id &&
- (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
+ if (stratum.job.job_id &&
+ (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
{
pthread_mutex_lock(&g_work_lock);
algo_gate.stratum_gen_work( &stratum, &g_work );
time(&g_work_time);
pthread_mutex_unlock(&g_work_lock);
- if (stratum.job.clean || jsonrpc_2)
+ if (stratum.job.clean || jsonrpc_2)
{
static uint32_t last_bloc_height;
if (!opt_quiet && last_bloc_height != stratum.bloc_height)
@@ -2554,7 +2557,7 @@ void parse_arg(int key, char *arg )
opt_extranonce = false;
break;
case 1013:
- opt_showdiff = true;
+ opt_showdiff = false;
break;
case 1016: /* --coinbase-addr */
pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg);
diff --git a/m4/placeholder-to-prevent-git-from-deleting-this-directory b/m4/placeholder-to-prevent-git-from-deleting-this-directory
deleted file mode 100644
index e69de29..0000000
diff --git a/miner.h b/miner.h
index 17ed0bd..b619ef2 100644
--- a/miner.h
+++ b/miner.h
@@ -492,6 +492,7 @@ enum algos {
ALGO_LUFFA,
ALGO_LYRA2RE,
ALGO_LYRA2REV2,
+ ALGO_LYRA2Z,
ALGO_M7M,
ALGO_MYR_GR,
ALGO_NEOSCRYPT,
@@ -546,6 +547,7 @@ static const char *algo_names[] = {
"luffa",
"lyra2re",
"lyra2rev2",
+ "lyra2z",
"m7m",
"myr-gr",
"neoscrypt",
@@ -654,6 +657,7 @@ Options:\n\
luffa Luffa\n\
lyra2re lyra2\n\
lyra2rev2 lyrav2\n\
+ lyra2z Zcoin (XZC)\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
@@ -700,6 +704,7 @@ Options:\n\
--randomize Randomize scan range start to reduce duplicates\n\
-f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\
+ --hide-diff Do not display changes in difficulty\n\
-n, --nfactor neoscrypt N-Factor\n\
--coinbase-addr=ADDR payout address for solo mining\n\
--coinbase-sig=TEXT data to insert in the coinbase when possible\n\
@@ -763,6 +768,7 @@ static struct option const options[] = {
{ "diff-factor", 1, NULL, 'f' },
{ "diff", 1, NULL, 'f' }, // deprecated (alias)
{ "diff-multiplier", 1, NULL, 'm' },
+ { "hide-diff", 0, NULL, 1013 },
{ "help", 0, NULL, 'h' },
{ "nfactor", 1, NULL, 'n' },
{ "no-gbt", 0, NULL, 1011 },
@@ -783,7 +789,6 @@ static struct option const options[] = {
{ "retry-pause", 1, NULL, 'R' },
{ "randomize", 0, NULL, 1024 },
{ "scantime", 1, NULL, 's' },
- { "show-diff", 0, NULL, 1013 },
#ifdef HAVE_SYSLOG_H
{ "syslog", 0, NULL, 'S' },
#endif
diff --git a/util.c b/util.c
index a70abc6..07cab73 100644
--- a/util.c
+++ b/util.c
@@ -1630,7 +1630,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
hashrate += thr_hashrates[i];
pthread_mutex_unlock(&stats_lock);
double diff = trunc( ( ((double)0xffffffff) / target ) );
- if (!opt_quiet)
+ if ( opt_showdiff )
// xmr pool diff can change a lot...
applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
stratum_diff = diff;