diff --git a/Makefile.am b/Makefile.am
index c5f9a8e..832fe6f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -105,6 +105,7 @@ cpuminer_SOURCES = \
   algo/lyra2/sponge.c \
   algo/lyra2/lyra2rev2.c \
   algo/lyra2/lyra2re.c \
+  algo/lyra2/zcoin.c \
   algo/keccak/sse2/keccak.c \
   algo/m7m.c \
   algo/neoscrypt.c \
diff --git a/RELEASE_ANNOUNCEMENT b/RELEASE_ANNOUNCEMENT
index 87c6f65..efb153a 100644
--- a/RELEASE_ANNOUNCEMENT
+++ b/RELEASE_ANNOUNCEMENT
@@ -12,7 +12,9 @@ comparison below.
 
 New in 3.4.8
 
+- added zcoin support; AVX2 code is included but shows no performance gain yet
 - fixed API display of diff for cryptonight
+- --show-diff is now the default, use "--hide-diff" to disable
 - cleaned up some cpuminer-multi artifacts
 
 Users with non-SSE2 CPUs or who want to mine algos not supported by
diff --git a/algo-gate-api.c b/algo-gate-api.c
index f1d1b01..d161fe2 100644
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -169,6 +169,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_LUFFA:       register_luffa_algo     ( gate ); break;
     case ALGO_LYRA2RE:     register_lyra2re_algo   ( gate ); break;
     case ALGO_LYRA2REV2:   register_lyra2rev2_algo ( gate ); break;
+    case ALGO_LYRA2Z:      register_zcoin_algo     ( gate ); break;
     case ALGO_M7M:         register_m7m_algo       ( gate ); break;
     case ALGO_MYR_GR:      register_myriad_algo    ( gate ); break;
     case ALGO_NEOSCRYPT:   register_neoscrypt_algo ( gate ); break;
@@ -268,6 +269,7 @@ const char* const algo_alias_map[][2] =
   { "sib",    "x11gost"  },
   { "yes",    "yescrypt" },
   { "ziftr",  "zr5"      },
+  { "zcoin",  "lyra2z"   },
   { NULL, NULL }
 };
 
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 4d72b09..bf7bebf 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -115,7 +115,7 @@ void ( *hash_alt ) ( void*, const void*, uint32_t );
 void ( *hash_suw ) ( void*, const void* );
 
 //optional, safe to use default in most cases
-bool ( *miner_thread_init ) ();
+bool ( *miner_thread_init ) ( int );
 
 void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
 void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*, bool );
@@ -129,7 +129,7 @@ bool ( *submit_getwork_result ) ( CURL*, struct work* );
 void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
 void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
 void ( *set_work_data_endian ) ( struct work* );
-void ( *calc_network_diff ) ( struct work* );
+double ( *calc_network_diff ) ( struct work* );
 void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
 bool ( *prevent_dupes ) ( struct work*, struct stratum_ctx*, int );
 void ( *resync_threads ) ( struct work* );
@@ -229,7 +229,7 @@ void jr2_build_stratum_request ( char *req, struct work *work );
 
 // default is do_nothing;
 void swab_work_data( struct work *work );
 
-void std_calc_network_diff( struct work *work );
+double std_calc_network_diff( struct work *work );
 
 void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
diff --git a/algo/blake/decred.c b/algo/blake/decred.c
index 44ac63f..b6684aa 100644
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -115,7 +115,7 @@ uint32_t *decred_get_nonceptr( uint32_t *work_data )
 
 // does it need to increment nonce, seems not because gen_work_now always
 // returns true
-void decred_calc_network_diff( struct work* work )
+double decred_calc_network_diff( struct work* work )
 {
    // sample for diff 43.281 : 1c05ea29
    // todo: endian reversed on longpoll could be zr5 specific...
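Note on the gate changes above: miner_thread_init becomes a per-thread hook (it now receives the thread id), and calc_network_diff now returns the computed difficulty instead of each algo writing the net_diff global itself. A minimal call-site sketch of what that implies; the struct and function names here are illustrative, only the two pointer signatures come from algo-gate-api.h:

    /* Sketch only, not part of the patch. */
    #include <stdbool.h>

    struct work;   /* opaque here; defined in miner.h upstream */

    typedef struct
    {
       bool   (*miner_thread_init)( int thr_id );    /* now takes a thread id */
       double (*calc_network_diff)( struct work* );  /* now returns the diff  */
    } algo_gate_sketch_t;

    static double net_diff = 0.;   /* one assignment point in the caller */

    static void on_new_work( algo_gate_sketch_t *gate, struct work *w )
    {
       net_diff = gate->calc_network_diff( w );
    }
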
@@ -123,17 +123,18 @@ void decred_calc_network_diff( struct work* work )
    uint32_t bits = ( nbits & 0xffffff );
    int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
    int m;
-   net_diff = (double)0x0000ffff / (double)bits;
+   double d = (double)0x0000ffff / (double)bits;
    for ( m = shift; m < 29; m++ )
-       net_diff *= 256.0;
+       d *= 256.0;
    for ( m = 29; m < shift; m++ )
-       net_diff /= 256.0;
+       d /= 256.0;
    if ( shift == 28 )
-       net_diff *= 256.0; // testnet
+       d *= 256.0; // testnet
    if ( opt_debug_diff )
-      applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", net_diff,
-              shift, bits);
+      applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
+              shift, bits );
+   return d;
 }
 
 void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
diff --git a/algo/drop.c b/algo/drop.c
index 2b16fb6..aa3dae3 100644
--- a/algo/drop.c
+++ b/algo/drop.c
@@ -233,11 +233,6 @@ void drop_get_new_work( struct work* work, struct work* g_work, int thr_id,
    ++(*nonceptr);
 }
 
-void drop_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
 void drop_display_pok( struct work* work )
 {
    if ( work->data[0] & 0x00008000 )
diff --git a/algo/lbry.c b/algo/lbry.c
index d7bd8d4..d2f081c 100644
--- a/algo/lbry.c
+++ b/algo/lbry.c
@@ -142,7 +142,7 @@ int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
    return 0;
 }
 
-void lbry_calc_network_diff(struct work *work)
+double lbry_calc_network_diff( struct work *work )
 {
    // sample for diff 43.281 : 1c05ea29
    // todo: endian reversed on longpoll could be zr5 specific...
@@ -159,7 +159,7 @@ void lbry_calc_network_diff(struct work *work)
    if (opt_debug_diff)
       applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
 
-   net_diff = d;
+   return d;
 }
 
 // std_le should work but it doesn't
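Both lbry_calc_network_diff and decred_calc_network_diff decode the compact nbits target and now return the result. A self-contained sketch of that conversion with illustrative names (the real functions extract shift via swab32(nbits), and decred adds a testnet special case):

    /* Sketch only: compact nbits -> network difficulty. */
    #include <stdint.h>
    #include <stdio.h>

    static double nbits_to_diff( uint32_t nbits )
    {
       uint32_t bits  = nbits & 0xffffff;        /* mantissa */
       int      shift = ( nbits >> 24 ) & 0xff;  /* exponent, 0x1c = 28 */
       double   d     = (double)0x0000ffff / (double)bits;
       int m;
       for ( m = shift; m < 29; m++ ) d *= 256.0;
       for ( m = 29; m < shift; m++ ) d /= 256.0;
       return d;
    }

    int main()
    {
       /* the sample quoted in the comments above: 0x1c05ea29 -> ~43.281 */
       printf( "%f\n", nbits_to_diff( 0x1c05ea29 ) );
       return 0;
    }
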
diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c
index 5f1b527..fab6da5 100644
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -44,171 +44,344 @@
  *
  * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
  */
-int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols)
+
+// Lyra2RE & Lyra2REv2, nRows must be a power of 2
+int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+           const void *salt, uint64_t saltlen, uint64_t timeCost,
+           const uint64_t nRows, const uint64_t nCols )
 {
-   //============================= Basic variables ============================//
-   int64_t row = 2; //index of row to be processed
-   int64_t prev = 1; //index of prev (last row ever computed/modified)
-   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
-   int64_t tau; //Time Loop iterator
-   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
-   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
-   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-   int64_t i; //auxiliary iteration counter
-   int64_t v64; // 64bit var for memcpy
-   //==========================================================================/
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //auxiliary iteration counter
+   int64_t v64; // 64bit var for memcpy
+   //====================================================================/

-   //========== Initializing the Memory Matrix and pointers to it =============//
-   //Tries to allocate enough space for the whole memory matrix
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix

-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-   // for Lyra2REv2, nCols = 4, v1 was using 8
-   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;

-   i = (int64_t)ROW_LEN_BYTES * nRows;
-   uint64_t *wholeMatrix = malloc(i);
-   if (wholeMatrix == NULL) {
-      return -1;
-   }
-   memset(wholeMatrix, 0, i);
+   i = (int64_t)ROW_LEN_BYTES * nRows;
+   uint64_t *wholeMatrix = malloc(i);
+   if (wholeMatrix == NULL)
+      return -1;

-   //Allocates pointers to each row of the matrix
-   uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
-   if (memMatrix == NULL) {
-      return -1;
-   }
-   //Places the pointers in the correct positions
-   uint64_t *ptrWord = wholeMatrix;
-   for (i = 0; i < nRows; i++) {
-      memMatrix[i] = ptrWord;
-      ptrWord += ROW_LEN_INT64;
-   }
-   //==========================================================================/
+   memset(wholeMatrix, 0, i);

-   //============= Getting the password + salt + basil padded with 10*1 ===============//
-   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
-   //but this ensures that the password copied locally will be overwritten as soon as possible
+   //Allocates pointers to each row of the matrix
+   uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
+   if (memMatrix == NULL)
+   {  free(wholeMatrix);  return -1;  }

-   //First, we clean enough blocks for the password, salt, basil and padding
-   int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+   //Places the pointers in the correct positions
+   uint64_t *ptrWord = wholeMatrix;
+   for (i = 0; i < nRows; i++)
+   {
+      memMatrix[i] = ptrWord;
+      ptrWord += ROW_LEN_INT64;
+   }

-   byte *ptrByte = (byte*) wholeMatrix;
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible

-   //Prepends the password
-   memcpy(ptrByte, pwd, pwdlen);
-   ptrByte += pwdlen;
+   //First, we clean enough blocks for the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                            / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;

-   //Concatenates the salt
-   memcpy(ptrByte, salt, saltlen);
-   ptrByte += saltlen;
+   byte *ptrByte = (byte*) wholeMatrix;

-   memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen));
+
//Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = pwdlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = saltlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = timeCost; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nRows; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nCols; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - //==========================================================================/ + memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES + - (saltlen + pwdlen) ); - //======================= Initializing the Sponge State ====================// - //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t _ALIGN(256) state[16]; - initState(state); - //==========================================================================/ + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); - //================================ Setup Phase =============================// - //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits - ptrWord = wholeMatrix; - for (i = 0; i < nBlocksInput; i++) { - absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) - } + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + //================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + initState(state); - reducedDuplexRow1(state, memMatrix[0], 
memMatrix[1], nCols);
+
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+   ptrWord = wholeMatrix;
+   for (i = 0; i < nBlocksInput; i++)
+   {
+      absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+      ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }

-   do {
-      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here

-      reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+   reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);

-      //updates the value of row* (deterministically picked during Setup))
-      rowa = (rowa + step) & (window - 1);
-      //update prev: it now points to the last row ever computed
-      prev = row;
-      //updates row: goes to the next row to be computed
-      row++;
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

-      //Checks if all rows in the window where visited.
-      if (rowa == 0) {
-         step = window + gap; //changes the step: approximately doubles its value
-         window *= 2; //doubles the size of the re-visitation window
-         gap = -gap; //inverts the modifier to the step
-      }
+      reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);

-   } while (row < nRows);
-   //==========================================================================/
+      //updates the value of row* (deterministically picked during Setup)
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;

-   //============================ Wandering Phase =============================//
-   row = 0; //Resets the visitation to the first row of the memory matrix
-   for (tau = 1; tau <= timeCost; tau++) {
-      //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
-      step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
-      do {
-         //Selects a pseudorandom index row*
-         //------------------------------------------------------------------------------------------
-         rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
-         //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
-         //------------------------------------------------------------------------------------------
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }

-         //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
-         reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+   } while (row < nRows);

-         //update prev: it now points to the last row ever computed
-         prev = row;
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+      step = (tau % 2 == 0) ?
-1 : nRows / 2 - 1; + do + { + //Selects a pseudorandom index row* + //----------------------------------------------- + rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------- - //updates row: goes to the next row to be computed - //------------------------------------------------------------------------------------------ - row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //------------------------------------------------------------------------------------------ + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); - } while (row != 0); - } + //update prev: it now points to the last row ever computed + prev = row; - //============================ Wrap-up Phase ===============================// - //Absorbs the last block of the memory matrix - absorbBlock(state, memMatrix[rowa]); + //updates row: goes to the next row to be computed + //---------------------------------------------------- + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //---------------------------------------------------- - //Squeezes the key - squeeze(state, K, (unsigned int) kLen); + } while (row != 0); + } - //========================= Freeing the memory =============================// - free(memMatrix); - free(wholeMatrix); + //===================== Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, memMatrix[rowa]); - return 0; + //Squeezes the key + squeeze(state, K, (unsigned int) kLen); + + //================== Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + return 0; } + +// Zcoin, nRows may be any value +int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, + const void *salt, uint64_t saltlen, uint64_t timeCost, + uint64_t nRows, uint64_t nCols ) +{ + //========================== Basic variables ============================// + uint64_t _ALIGN(256) state[16]; + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + //=======================================================================/ + + //======= Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + + i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); + uint64_t *wholeMatrix = malloc(i); + if (wholeMatrix == NULL) + return -1; + + memset(wholeMatrix, 0, i); + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(nRows * 
sizeof (uint64_t*));
+   if (memMatrix == NULL)
+   {  free(wholeMatrix);  return -1;  }
+
+   //Places the pointers in the correct positions
+   uint64_t *ptrWord = wholeMatrix;
+   for (i = 0; i < nRows; i++)
+   {
+      memMatrix[i] = ptrWord;
+      ptrWord += ROW_LEN_INT64;
+   }
+
+   //==== Getting the password + salt + basil padded with 10*1 ============//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible
+
+   //First, we clean enough blocks for the password, salt, basil and padding
+   uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) )
+                             / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+   byte *ptrByte = (byte*) wholeMatrix;
+   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &nRows, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+   memcpy(ptrByte, &nCols, sizeof (uint64_t));
+   ptrByte += sizeof (uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the password
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+   //=================== Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+   initState( state );
+
+   //============================== Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+   ptrWord = wholeMatrix;
+   for ( i = 0; i < nBlocksInput; i++ )
+   {
+      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
+   }
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0( state, memMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+   reducedDuplexRow1( state, memMatrix[0], memMatrix[1], nCols );
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+      reducedDuplexRowSetup( state, memMatrix[prev], memMatrix[rowa],
+                             memMatrix[row], nCols );
+
+      //updates the value of row* (deterministically picked during Setup)
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //======================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for ( tau = 1; tau <= timeCost; tau++ )
+   {
+      //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+      step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+      do {
+         //Selects a pseudorandom index row*
+         //----------------------------------------------------------------------
+         //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+         rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+         //-----------------------------------------------------------------
+
+         //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+         reducedDuplexRow( state, memMatrix[prev], memMatrix[rowa],
+                           memMatrix[row], nCols );
+
+         //update prev: it now points to the last row ever computed
+         prev = row;
+
+         //updates row: goes to the next row to be computed
+         //---------------------------------------------------------------
+         //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+         row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+         //--------------------------------------------------------------------
+
+      } while (row != 0);
+   }
+
+   //========================= Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock( state, memMatrix[rowa] );
+
+   //Squeezes the key
+   squeeze( state, K, kLen );
+
+   //====================== Freeing the memory =============================//
+   free( memMatrix );
+   free( wholeMatrix );
+
+   return 0;
+}
+
diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h
index edf9179..3a9403b 100644
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -37,6 +37,10 @@ typedef unsigned char byte;
 #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
 #endif
 
-int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
-
+int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+           const void *salt, uint64_t saltlen, uint64_t timeCost,
+           uint64_t nRows, uint64_t nCols );
+int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+            const void *salt, uint64_t saltlen, uint64_t timeCost,
+            uint64_t nRows, uint64_t nCols );
 
 #endif /* LYRA2_H_ */
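The functional difference between LYRA2 and LYRA2Z above is confined to row selection in the Wandering phase: LYRA2 assumes nRows is a power of 2 and masks, while LYRA2Z uses the generic modulo so zcoin's row count need not be a power of 2. Side by side, with state0 standing for state[0] after the reduced duplexing (sketch only):

    #include <stdint.h>

    /* Lyra2RE/REv2: nRows is a power of 2, a mask suffices */
    static uint64_t pick_row_pow2( uint64_t state0, uint64_t nRows )
    {
       return state0 & ( nRows - 1 );
    }

    /* Lyra2Z (zcoin): nRows may be arbitrary */
    static uint64_t pick_row_generic( uint64_t state0, uint64_t nRows )
    {
       return state0 % nRows;
    }
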
diff --git a/algo/lyra2/sponge.c b/algo/lyra2/sponge.c
index e746ede..0ab66ba 100644
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -25,6 +25,8 @@
 #include "sponge.h"
 #include "lyra2.h"
 
+
+
 /**
  * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
  * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges
@@ -36,10 +38,8 @@
  *
  * @param state The 1024-bit array to be initialized
  */
-void initState(uint64_t state[/*16*/])
-{
+inline void initState(uint64_t state[/*16*/]) {
 #ifdef __AVX2__
-
    (*(__m256i*)(&state[0])) = _mm256_setzero_si256();
    (*(__m256i*)(&state[4])) = _mm256_setzero_si256();
@@ -56,18 +56,17 @@ void initState(uint64_t state[/*16*/])
 //#elif defined __AVX__
 #else
-
-   //First 512 bis are zeros
-   memset(state, 0, 64);
-   //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
-   state[8] = blake2b_IV[0];
-   state[9] = blake2b_IV[1];
-   state[10] = blake2b_IV[2];
-   state[11] = blake2b_IV[3];
-   state[12] = blake2b_IV[4];
-   state[13] = blake2b_IV[5];
-   state[14] = blake2b_IV[6];
-   state[15] = blake2b_IV[7];
+   //First 512 bits are zeros
+   memset(state, 0, 64);
+   //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
+   state[8] = blake2b_IV[0];
+   state[9] = blake2b_IV[1];
+   state[10] = blake2b_IV[2];
+   state[11] = blake2b_IV[3];
+   state[12] = blake2b_IV[4];
+   state[13] = blake2b_IV[5];
+   state[14] = blake2b_IV[6];
+   state[15] = blake2b_IV[7];
 #endif
 }
@@ -76,23 +75,11 @@ void initState(uint64_t state[/*16*/])
  *
  * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
  */
-__inline static void blake2bLyra( uint64_t *v )
-{
+inline static void blake2bLyra(uint64_t *v) {
 #if defined __AVX2__
-
+// may still be used by squeeze
    LYRA_INIT_AVX2; // defines local a[4]
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
-   LYRA_ROUND_AVX2;
+   LYRA_12_ROUNDS_AVX2( a[0], a[1], a[2], a[3] );
    LYRA_CLOSE_AVX2;
 #elif defined __AVX__
@@ -113,20 +100,18 @@ __inline static void blake2bLyra( uint64_t *v )
    LYRA_CLOSE_AVX;
 #else
-
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-   ROUND_LYRA(0);
-
+   ROUND_LYRA(0);
+   ROUND_LYRA(1);
+   ROUND_LYRA(2);
+   ROUND_LYRA(3);
+   ROUND_LYRA(4);
+   ROUND_LYRA(5);
+   ROUND_LYRA(6);
+   ROUND_LYRA(7);
+   ROUND_LYRA(8);
+   ROUND_LYRA(9);
+   ROUND_LYRA(10);
+   ROUND_LYRA(11);
 #endif
 }
@@ -134,19 +119,8 @@ __inline static void blake2bLyra( uint64_t *v )
  * Executes a reduced version of Blake2b's G function with only one round
  * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
  */
-__inline static void reducedBlake2bLyra(uint64_t *v) {
-
-#if defined __AVX2__
-   LYRA_INIT_AVX2; // defines local a[4]
-   LYRA_ROUND_AVX2;
-   LYRA_CLOSE_AVX2;
-#elif defined __AVX__
-   LYRA_INIT_AVX; // defines locals a0[4], a1[4]
-   LYRA_ROUND_AVX;
-   LYRA_CLOSE_AVX;
-#else
-   ROUND_LYRA(0);
-#endif
+inline static void reducedBlake2bLyra(uint64_t *v) {
+   ROUND_LYRA(0);
 }
 
 /**
@@ -157,21 +131,21 @@ __inline static void reducedBlake2bLyra(uint64_t *v) {
  * @param out Array that will receive the data squeezed
  * @param len The number of bytes to be squeezed into the "out" array
 */
-void squeeze(uint64_t *state, byte *out, unsigned int len)
+inline void squeeze( uint64_t *state, byte *out, unsigned int len )
 {
-   int fullBlocks = len / BLOCK_LEN_BYTES;
-   byte *ptr = out;
-   int i;
+   int fullBlocks = len / BLOCK_LEN_BYTES;
+   byte *ptr = out;
+   int i;
 
-   //Squeezes full blocks
-   for (i = 0; i < fullBlocks; i++) {
-      memcpy(ptr, state, BLOCK_LEN_BYTES);
-      blake2bLyra(state);
-      ptr += BLOCK_LEN_BYTES;
-   }
-
-   //Squeezes remaining bytes
-   memcpy(ptr,
state, (len % BLOCK_LEN_BYTES)); + //Squeezes full blocks + for ( i = 0; i < fullBlocks; i++ ) + { + memcpy(ptr, state, BLOCK_LEN_BYTES); + blake2bLyra(state); + ptr += BLOCK_LEN_BYTES; + } + //Squeezes remaining bytes + memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); } /** @@ -181,12 +155,10 @@ void squeeze(uint64_t *state, byte *out, unsigned int len) * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_INT64 words) */ -void absorbBlock(uint64_t *state, const uint64_t *in) -{ -//XORs the first BLOCK_LEN_INT64 words of "in" with the current state +inline void absorbBlock(uint64_t *state, const uint64_t *in) { #if defined __AVX2__ - __m256i state_v[3], in_v[3]; + __m256i state_v[4], in_v[3]; // only state is guaranteed aligned 256 state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); @@ -195,13 +167,18 @@ void absorbBlock(uint64_t *state, const uint64_t *in) in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&in[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); - _mm256_store_si256( (__m256i*)&state[0], - _mm256_xor_si256( state_v[0], in_v[0] ) ); - _mm256_store_si256( (__m256i*)&state[4], - _mm256_xor_si256( state_v[1], in_v[1] ) ); - _mm256_store_si256( (__m256i*)&state[8], - _mm256_xor_si256( state_v[2], in_v[2] ) ); + state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] ); + state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] ); + state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] ); + + LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); #elif defined __AVX__ @@ -221,6 +198,8 @@ void absorbBlock(uint64_t *state, const uint64_t *in) in_v[4] = _mm_load_si128( (__m128i*)(&in[8]) ); in_v[5] = _mm_load_si128( (__m128i*)(&in[10]) ); +// do blake2bLyra without init +// LYRA_ROUND_AVX2( state_v ) _mm_store_si128( (__m128i*)(&state[0]), _mm_xor_si128( state_v[0], in_v[0] ) ); _mm_store_si128( (__m128i*)(&state[2]), @@ -234,26 +213,28 @@ void absorbBlock(uint64_t *state, const uint64_t *in) _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], in_v[5] ) ); -#else + //Applies the transformation f to the sponge's state + blake2bLyra(state); - state[0] ^= in[0]; - state[1] ^= in[1]; - state[2] ^= in[2]; - state[3] ^= in[3]; - state[4] ^= in[4]; - state[5] ^= in[5]; - state[6] ^= in[6]; - state[7] ^= in[7]; - state[8] ^= in[8]; - state[9] ^= in[9]; - state[10] ^= in[10]; - state[11] ^= in[11]; +#else + //XORs the first BLOCK_LEN_INT64 words of "in" with the current state + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + state[8] ^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); #endif - -//Applies the transformation f to the sponge's state -blake2bLyra(state); - } /** @@ -263,23 +244,28 @@ blake2bLyra(state); * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) -{ - -//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with 
the current state +inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { + //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state #if defined __AVX2__ - __m256i state_v[2], in_v[2]; + __m256i state_v[4], in_v[2]; state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); in_v [0] = _mm256_loadu_si256( (__m256i*)(&in[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&in[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); - _mm256_store_si256( (__m256i*)(&state[0]), - _mm256_xor_si256( state_v[0], in_v[0] ) ); - _mm256_store_si256( (__m256i*)(&state[4]), - _mm256_xor_si256( state_v[1], in_v[1] ) ); + state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] ); + state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] ); + + LYRA_12_ROUNDS_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); #elif defined __AVX__ @@ -304,22 +290,24 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) _mm_store_si128( (__m128i*)(&state[6]), _mm_xor_si128( state_v[3], in_v[3] ) ); + //Applies the transformation f to the sponge's state + blake2bLyra(state); + #else - state[0] ^= in[0]; - state[1] ^= in[1]; - state[2] ^= in[2]; - state[3] ^= in[3]; - state[4] ^= in[4]; - state[5] ^= in[5]; - state[6] ^= in[6]; - state[7] ^= in[7]; + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + //Applies the transformation f to the sponge's state + blake2bLyra(state); #endif -//Applies the transformation f to the sponge's state -blake2bLyra(state); - } /** @@ -330,21 +318,34 @@ blake2bLyra(state); * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) +inline void reducedSqueezeRow0( uint64_t* state, uint64_t* rowOut, + uint64_t nCols ) { - uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] - unsigned int i; - //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < nCols; i++) - { + uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + int i; + //M[row][C-1-col] = H.reduced_squeeze() + +#if defined __AVX2__ + __m256i state_v[4]; + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); +#endif + + for ( i = 0; i < nCols; i++ ) + { + #if defined __AVX2__ - _mm256_storeu_si256( (__m256i*)&ptrWord[0], - _mm256_load_si256( (__m256i*)(&state[0]) ) ); - _mm256_storeu_si256( (__m256i*)&ptrWord[4], - _mm256_load_si256( (__m256i*)(&state[4]) ) ); - _mm256_storeu_si256( (__m256i*)&ptrWord[8], - _mm256_load_si256( (__m256i*)(&state[8]) ) ); + _mm256_storeu_si256( (__m256i*)&ptrWord[0], state_v[0] ); + _mm256_storeu_si256( (__m256i*)&ptrWord[4], state_v[1] ); + _mm256_storeu_si256( (__m256i*)&ptrWord[8], state_v[2] ); + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + LYRA_ROUND_AVX2( 
state_v[0], state_v[1], state_v[2], state_v[3] ); #elif defined __AVX__ @@ -361,28 +362,42 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) _mm_store_si128( (__m128i*)(&ptrWord[10]), _mm_load_si128( (__m128i*)(&state[10]) ) ); -#else + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; - ptrWord[0] = state[0]; - ptrWord[1] = state[1]; - ptrWord[2] = state[2]; - ptrWord[3] = state[3]; - ptrWord[4] = state[4]; - ptrWord[5] = state[5]; - ptrWord[6] = state[6]; - ptrWord[7] = state[7]; - ptrWord[8] = state[8]; - ptrWord[9] = state[9]; - ptrWord[10] = state[10]; - ptrWord[11] = state[11]; + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + +#else + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] = state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); #endif - //Goes to next block (column) that will receive the squeezed data - ptrWord -= BLOCK_LEN_INT64; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); } + +#if defined __AVX2__ + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); +#endif + } /** @@ -394,34 +409,43 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols) +inline void reducedDuplexRow1( uint64_t *state, uint64_t *rowIn, + uint64_t *rowOut, uint64_t nCols ) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - unsigned int i; - - for (i = 0; i < nCols; i++) - { - //Absorbing "M[prev][col]" - #if defined __AVX2__ - - __m256i state_v[3], in_v[3]; + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; +#if defined __AVX2__ + __m256i state_v[4], in_v[3]; state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); - in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); - in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); +#endif + + for ( i = 0; i < nCols; i++ ) + { +#if defined __AVX2__ + + in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); + in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); - _mm256_store_si256( (__m256i*)(&state[0]), + state_v[0] = _mm256_xor_si256( state_v[0], in_v[0] ); + state_v[1] = _mm256_xor_si256( state_v[1], in_v[1] ); + state_v[2] = _mm256_xor_si256( state_v[2], in_v[2] ); + + LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + 
_mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), _mm256_xor_si256( state_v[0], in_v[0] ) ); - _mm256_store_si256( (__m256i*)(&state[4]), + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), _mm256_xor_si256( state_v[1], in_v[1] ) ); - _mm256_store_si256( (__m256i*)(&state[8]), + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) ); - #elif defined __AVX__ +#elif defined __AVX__ __m128i state_v[6], in_v[6]; @@ -452,29 +476,31 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const _mm_store_si128( (__m128i*)(&state[10]), _mm_xor_si128( state_v[5], in_v[5] ) ); - #else + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); - state[0] ^= (ptrWordIn[0]); - state[1] ^= (ptrWordIn[1]); - state[2] ^= (ptrWordIn[2]); - state[3] ^= (ptrWordIn[3]); - state[4] ^= (ptrWordIn[4]); - state[5] ^= (ptrWordIn[5]); - state[6] ^= (ptrWordIn[6]); - state[7] ^= (ptrWordIn[7]); - state[8] ^= (ptrWordIn[8]); - state[9] ^= (ptrWordIn[9]); - state[10] ^= (ptrWordIn[10]); - state[11] ^= (ptrWordIn[11]); +#else - #endif + //Absorbing "M[prev][col]" + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= (ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); +#endif - //M[row][C-1-col] = M[prev][col] XOR rand #if defined __AVX2__ - state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); +/* state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); @@ -484,7 +510,7 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const _mm256_xor_si256( state_v[1], in_v[1] ) ); _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), _mm256_xor_si256( state_v[2], in_v[2] ) ); - +*/ #elif defined __AVX__ state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); @@ -509,25 +535,34 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const #else - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; - ptrWordOut[4] = ptrWordIn[4] ^ state[4]; - ptrWordOut[5] = ptrWordIn[5] ^ state[5]; - ptrWordOut[6] = ptrWordIn[6] ^ state[6]; - ptrWordOut[7] = ptrWordIn[7] ^ state[7]; - ptrWordOut[8] = ptrWordIn[8] ^ state[8]; - ptrWordOut[9] = ptrWordIn[9] ^ state[9]; - ptrWordOut[10] = ptrWordIn[10] ^ state[10]; - ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - #endif + //M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; +#endif + + 
//Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } + +#if defined __AVX2__ + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); +#endif - //Input: next column (i.e., next block in sequence) - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } } /** @@ -544,46 +579,86 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const * @param rowOut Row receiving the output * */ -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +inline void reducedDuplexRowSetup( uint64_t *state, uint64_t *rowIn, + uint64_t *rowInOut, uint64_t *rowOut, + uint64_t nCols ) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - unsigned int i; + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; - for (i = 0; i < nCols; i++) - { - //Absorbing "M[prev] [+] M[row*]" - #if defined __AVX2__ +#if defined __AVX2__ + __m256i state_v[4], in_v[3], inout_v[3]; + #define t_state in_v - __m256i state_v[3], in_v[3], inout_v[3]; + state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); + state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); - state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); + for ( i = 0; i < nCols; i++ ) + { in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) ); - state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); in_v [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) ); inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) ); - state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); - _mm256_store_si256( (__m256i*)(&state[0]), - _mm256_xor_si256( state_v[0], - _mm256_add_epi64( in_v[0], - inout_v[0] ) ) ); - _mm256_store_si256( (__m256i*)(&state[4]), - _mm256_xor_si256( state_v[1], - _mm256_add_epi64( in_v[1], - inout_v[1] ) ) ); - _mm256_store_si256( (__m256i*)(&state[8]), - _mm256_xor_si256( state_v[2], - _mm256_add_epi64( in_v[2], - inout_v[2] ) ) ); - #elif defined __AVX__ + state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0], + inout_v[0] ) ); + state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1], + inout_v[1] ) ); + state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2], + inout_v[2] ) ); + + LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], in_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], in_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], in_v[2] ) ); + + //M[row*][col] = M[row*][col] 
XOR rotW(rand) + t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 ); + t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 ); + t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 ); + + inout_v[0] = _mm256_xor_si256( inout_v[0], + _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) ); + inout_v[1] = _mm256_xor_si256( inout_v[1], + _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) ); + inout_v[2] = _mm256_xor_si256( inout_v[2], + _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) ); + + _mm256_storeu_si256( (__m256i*)&ptrWordInOut[0], inout_v[0] ); + _mm256_storeu_si256( (__m256i*)&ptrWordInOut[4], inout_v[1] ); + _mm256_storeu_si256( (__m256i*)&ptrWordInOut[8], inout_v[2] ); + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); + + #undef t_state + +#elif defined __AVX__ __m128i state_v[6], in_v[6], inout_v[6]; + for ( i = 0; i < nCols; i++ ) + { + state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) ); state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) ); @@ -630,39 +705,34 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, _mm_add_epi64( in_v[5], inout_v[5] ) ) ); - #else + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); +#else + for ( i = 0; i < nCols; i++ ) + { - state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); - state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); - state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); - state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); - state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); - state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); - state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); - state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); - state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); - state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); - state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); - state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - #endif + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand +#endif - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - //M[row][col] = M[prev][col] XOR rand #if defined __AVX2__ - state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); - state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) ); - state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); - - _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), - _mm256_xor_si256( state_v[0], in_v[0] ) ); - _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), - 
_mm256_xor_si256( state_v[1], in_v[1] ) ); - _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), - _mm256_xor_si256( state_v[2], in_v[2] ) ); - #elif defined __AVX__ state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) ); @@ -687,40 +757,49 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, #else - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; - ptrWordOut[4] = ptrWordIn[4] ^ state[4]; - ptrWordOut[5] = ptrWordIn[5] ^ state[5]; - ptrWordOut[6] = ptrWordIn[6] ^ state[6]; - ptrWordOut[7] = ptrWordIn[7] ^ state[7]; - ptrWordOut[8] = ptrWordIn[8] ^ state[8]; - ptrWordOut[9] = ptrWordIn[9] ^ state[9]; - ptrWordOut[10] = ptrWordIn[10] ^ state[10]; - ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - #endif + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; +#endif - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[11]; - ptrWordInOut[1] ^= state[0]; - ptrWordInOut[2] ^= state[1]; - ptrWordInOut[3] ^= state[2]; - ptrWordInOut[4] ^= state[3]; - ptrWordInOut[5] ^= state[4]; - ptrWordInOut[6] ^= state[5]; - ptrWordInOut[7] ^= state[6]; - ptrWordInOut[8] ^= state[7]; - ptrWordInOut[9] ^= state[8]; - ptrWordInOut[10] ^= state[9]; - ptrWordInOut[11] ^= state[10]; + //M[row*][col] = M[row*][col] XOR rotW(rand) +// Need to fix this before taking state load/store out of loop +#ifdef __AVX2__ + + +#else + + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } + +#endif - //Inputs: next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } } /** @@ -737,22 +816,26 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, * @param rowOut Row receiving the output * */ -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +inline void reducedDuplexRow( uint64_t *state, uint64_t *rowIn, + uint64_t *rowInOut, uint64_t *rowOut, + uint64_t nCols ) { - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - unsigned int i; + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; - for (i = 0; i < nCols; i++) - 
{ - //Absorbing "M[prev] [+] M[row*]" - #if defined __AVX2__ +#if defined __AVX2__ - __m256i state_v[3], in_v[3], inout_v[3]; + for ( i = 0; i < nCols; i++) + { + + //Absorbing "M[prev] [+] M[row*]" + + __m256i state_v[4], in_v[3], inout_v[3]; #define out_v in_v // reuse register in next code block - + #define t_state in_v state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) ); in_v [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) ); inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) ); @@ -762,21 +845,82 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) ); in_v [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) ); inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) ); + state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) ); + + state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0], + inout_v[0] ) ); + state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1], + inout_v[1] ) ); + state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2], + inout_v[2] ) ); + + out_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) ); + out_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) ); + out_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) ); + + LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] ); + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]), + _mm256_xor_si256( state_v[0], out_v[0] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]), + _mm256_xor_si256( state_v[1], out_v[1] ) ); + _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]), + _mm256_xor_si256( state_v[2], out_v[2] ) ); + +/* + t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 ); + t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 ); + t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 ); + + inout_v[0] = _mm256_xor_si256( inout_v[0], + _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) ); + inout_v[1] = _mm256_xor_si256( inout_v[1], + _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) ); + inout_v[2] = _mm256_xor_si256( inout_v[2], + _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) ); + + _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[0]), inout_v[0] ); + _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[4]), inout_v[1] ); + _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[8]), inout_v[2] ); + + _mm256_store_si256( (__m256i*)&state[0], state_v[0] ); + _mm256_store_si256( (__m256i*)&state[4], state_v[1] ); + _mm256_store_si256( (__m256i*)&state[8], state_v[2] ); + _mm256_store_si256( (__m256i*)&state[12], state_v[3] ); +*/ + #undef out_v + #undef t_state + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } - _mm256_store_si256( (__m256i*)(&state[0]), - _mm256_xor_si256( state_v[0], - _mm256_add_epi64( in_v[0], - inout_v[0] 

 #elif defined __AVX__

+   for ( i = 0; i < nCols; i++)
+   {
+
    __m128i state_v[6], in_v[6], inout_v[6];
    #define out_v in_v    // reuse register in next code block
@@ -826,28 +970,35 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
                                       _mm_add_epi64( in_v[5],
                                                      inout_v[5] ) ) );
-   #else
-
-    state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
-    state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
-    state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
-    state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
-    state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
-    state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
-    state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
-    state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
-    state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
-    state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
-    state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
-    state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
-   #endif
-
    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);

-    //M[rowOut][col] = M[rowOut][col] XOR rand
-   #if defined __AVX2__
+   #else
+   for ( i = 0; i < nCols; i++)
+   {
+
+    state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
+    state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
+    state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
+    state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
+    state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
+    state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
+    state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
+    state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
+    state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
+    state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
+    state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
+    state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
+
+    //Applies the reduced-round transformation f to the sponge's state
+    reducedBlake2bLyra(state);
+#endif
+
+    //M[rowOut][col] = M[rowOut][col] XOR rand
+
+   #if defined __AVX2__
+/*
    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    out_v  [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
@@ -861,7 +1012,7 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
                         _mm256_xor_si256( state_v[1], out_v[1] ) );
    _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                         _mm256_xor_si256( state_v[2], out_v[2] ) );
-
+*/
  #elif defined __AVX__

    state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
@@ -891,22 +1042,39 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
    _mm_store_si128( (__m128i*)(&ptrWordOut[10]),
                     _mm_xor_si128( state_v[5], out_v[5] ) );

-   #else
+   //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+   ptrWordInOut[0]  ^= state[11];
+   ptrWordInOut[1]  ^= state[0];
+   ptrWordInOut[2]  ^= state[1];
+   ptrWordInOut[3]  ^= state[2];
+   ptrWordInOut[4]  ^= state[3];
+   ptrWordInOut[5]  ^= state[4];
+   ptrWordInOut[6]  ^= state[5];
+   ptrWordInOut[7]  ^= state[6];
+   ptrWordInOut[8]  ^= state[7];
+   ptrWordInOut[9]  ^= state[8];
+   ptrWordInOut[10] ^= state[9];
+   ptrWordInOut[11] ^= state[10];

-    ptrWordOut[0] ^= state[0];
-    ptrWordOut[1] ^= state[1];
-    ptrWordOut[2] ^= state[2];
-    ptrWordOut[3] ^= state[3];
-    ptrWordOut[4] ^= state[4];
-    ptrWordOut[5] ^= state[5];
-    ptrWordOut[6] ^= state[6];
-    ptrWordOut[7] ^= state[7];
-    ptrWordOut[8] ^= state[8];
-    ptrWordOut[9] ^= state[9];
-    ptrWordOut[10] ^= state[10];
-    ptrWordOut[11] ^= state[11];
+
+   //Goes to next block
+   ptrWordOut   += BLOCK_LEN_INT64;
+   ptrWordInOut += BLOCK_LEN_INT64;
+   ptrWordIn    += BLOCK_LEN_INT64;
+   }

-   #endif
+#else
+   ptrWordOut[0]  ^= state[0];
+   ptrWordOut[1]  ^= state[1];
+   ptrWordOut[2]  ^= state[2];
+   ptrWordOut[3]  ^= state[3];
+   ptrWordOut[4]  ^= state[4];
+   ptrWordOut[5]  ^= state[5];
+   ptrWordOut[6]  ^= state[6];
+   ptrWordOut[7]  ^= state[7];
+   ptrWordOut[8]  ^= state[8];
+   ptrWordOut[9]  ^= state[9];
+   ptrWordOut[10] ^= state[10];
+   ptrWordOut[11] ^= state[11];

    //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
    ptrWordInOut[0] ^= state[11];
@@ -922,24 +1090,10 @@ void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

-    //Goes to next block
-    ptrWordOut += BLOCK_LEN_INT64;
-    ptrWordInOut += BLOCK_LEN_INT64;
-    ptrWordIn += BLOCK_LEN_INT64;
-  }
+   //Goes to next block
+   ptrWordOut   += BLOCK_LEN_INT64;
+   ptrWordInOut += BLOCK_LEN_INT64;
+   ptrWordIn    += BLOCK_LEN_INT64;
+   }
+#endif
 }
-
-/**
- * Prints an array of unsigned chars
- */
-void printArray(unsigned char *array, unsigned int size, char *name)
-{
-       unsigned int i;
-       printf("%s: ", name);
-       for (i = 0; i < size; i++) {
-               printf("%2x|", array[i]);
-       }
-       printf("\n");
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////
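For reference, the per-column work that the AVX2, AVX and scalar paths above all implement is the reduced duplexing step from the Lyra2 reference code. A scalar sketch of one column, as a hypothetical helper that is not part of the patch (BLOCK_LEN_INT64 is 12 words in the real code):

    // One column of reducedDuplexRow, scalar form (illustrative only).
    // state: 16-word sponge state; in/inout/out: 12-word row blocks.
    static void duplex_column( uint64_t state[16], const uint64_t *ptrWordIn,
                               uint64_t *ptrWordInOut, uint64_t *ptrWordOut )
    {
       int j;
       for ( j = 0; j < 12; j++ )              // absorb M[prev] [+] M[row*]
          state[j] ^= ptrWordIn[j] + ptrWordInOut[j];
       reducedBlake2bLyra( state );            // one round of Blake2b's F
       for ( j = 0; j < 12; j++ )              // M[rowOut] ^= rand
          ptrWordOut[j] ^= state[j];
       for ( j = 0; j < 12; j++ )              // M[rowInOut] ^= rotW(rand)
          ptrWordInOut[j] ^= state[ (j + 11) % 12 ];
    }

The rotW feedback is why every path ends with the inout words XORed against state[11], state[0], ..., state[10] in that rotated order.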
diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h
index 5b38dc7..8f6cb4c 100644
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -23,42 +23,34 @@
 #define SPONGE_H_

 #include <stdint.h>
+#include "avxdefs.h"

-/* Blake2b IV Array */
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+
+/*Blake2b IV Array*/
 static const uint64_t blake2b_IV[8] =
 {
-    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
-    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
-    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
-    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };

-/* Blake2b's rotation */
-static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
-#ifdef _MSC_VER
-    return _rotr64(w, c);
-#else
-    return ( w >> c ) | ( w << ( 64 - c ) );
-#endif
+/*Blake2b's rotation*/
+static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
+   return ( w >> c ) | ( w << ( 64 - c ) );
 }

 #if defined __AVX2__
 // only available with avx2

-// rotate each uint64 c bits
-// returns _m256i
-#define mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \
-                                           _mm256_slli_epi64(w, 64 - c))
-
-// Rotate 4 uint64 (256 bits) by one uint64 (64 bits)
-// returns __m256i
-#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 )
-#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 )
-
-// swap hi and lo 128 bits in 256 bit vector
-// returns _m256i
-#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 )
-
 // init vectors from memory
 // returns void, updates defines and inits implicit args a, b, c, d
 #define LYRA_INIT_AVX2 \
@@ -88,15 +80,29 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
    c = _mm256_add_epi64( c, d ); \
    b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 );

-#define LYRA_ROUND_AVX2 \
-   G_4X64( a[0], a[1], a[2], a[3] ); \
-   a[1] = mm256_rotl256_1x64( a[1] ); \
-   a[2] = mm256_swap128( a[2] ); \
-   a[3] = mm256_rotr256_1x64( a[3] ); \
-   G_4X64( a[0], a[1], a[2], a[3] ); \
-   a[1] = mm256_rotr256_1x64( a[1] ); \
-   a[2] = mm256_swap128( a[2] ); \
-   a[3] = mm256_rotl256_1x64( a[3] );
+#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   G_4X64( s0, s1, s2, s3 ); \
+   s1 = mm256_rotl256_1x64( s1 ); \
+   s2 = mm256_swap128( s2 ); \
+   s3 = mm256_rotr256_1x64( s3 ); \
+   G_4X64( s0, s1, s2, s3 ); \
+   s1 = mm256_rotr256_1x64( s1 ); \
+   s2 = mm256_swap128( s2 ); \
+   s3 = mm256_rotl256_1x64( s3 );
+
+#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 )

 #else
 // only available with avx
@@ -148,6 +154,7 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
 #endif // AVX2

+/*
 #if defined __AVX__
 // can coexist with AVX2
@@ -161,7 +168,7 @@ static __inline uint64_t rotr64(const uint64_t w, const unsigned c) {
 #define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
                               s1 = _mm_xor_si128(s0, s1); \
                               s0 = _mm_xor_si128(s0, s1);
-
+
 // swap uint64 in 128 bit source vector, equivalent of rotating 128 bits by
 // 64 bits (8 bytes)
 // __m128i
@@ -193,30 +200,33 @@
 } while(0)

 #endif // AVX
+*/

-/* Blake2b's G function */
-#define G(r,i,a,b,c,d) do { \
-   a = a + b; \
-   d = rotr64(d ^ a, 32); \
-   c = c + d; \
-   b = rotr64(b ^ c, 24); \
-   a = a + b; \
-   d = rotr64(d ^ a, 16); \
-   c = c + d; \
-   b = rotr64(b ^ c, 63); \
+// Scalar
+//Blake2b's G function
+#define G(r,i,a,b,c,d) \
+  do { \
+   a = a + b; \
+   d = rotr64(d ^ a, 32); \
+   c = c + d; \
+   b = rotr64(b ^ c, 24); \
+   a = a + b; \
+   d = rotr64(d ^ a, 16); \
+   c = c + d; \
+   b = rotr64(b ^ c, 63); \
  } while(0)

 /*One Round of the Blake2b's compression function*/
-#define ROUND_LYRA(r) \
-    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
-    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
-    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
-    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
-    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
-    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
-    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
-    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
+#define ROUND_LYRA(r) \
+   G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+   G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+   G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+   G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+   G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+   G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+   G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+   G(r,7,v[ 3],v[ 4],v[ 9],v[14]);

 //---- Housekeeping
@@ -224,18 +234,31 @@ void initState(uint64_t state[/*16*/]);

 //---- Squeezes
 void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
-void reducedSqueezeRow0(uint64_t* state, uint64_t* row, const uint32_t nCols);
+void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);

 //---- Absorbs
 void absorbBlock(uint64_t *state, const uint64_t *in);
 void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);

 //---- Duplexes
-void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols);
-void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols);
-void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols);
+void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

 //---- Misc
 void printArray(unsigned char *array, unsigned int size, char *name);

+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////TESTS////
+//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
+//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+/////////////
+
+
 #endif /* SPONGE_H_ */
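A note on the parameterized round macros: passing the four 256-bit lanes explicitly (rather than the implicit a[] array the old LYRA_ROUND_AVX2 assumed) lets a caller keep the whole sponge in registers across the permutation instead of bouncing it through memory. A usage sketch under that assumption; the load/store framing here is illustrative, only the macros come from the header above:

    // Run the full 12-round Blake2b permutation on a register-resident state.
    uint64_t state[16] __attribute__ ((aligned (32)));
    __m256i s0 = _mm256_load_si256( (__m256i*)&state[ 0] );
    __m256i s1 = _mm256_load_si256( (__m256i*)&state[ 4] );
    __m256i s2 = _mm256_load_si256( (__m256i*)&state[ 8] );
    __m256i s3 = _mm256_load_si256( (__m256i*)&state[12] );
    LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 );
    _mm256_store_si256( (__m256i*)&state[ 0], s0 );
    _mm256_store_si256( (__m256i*)&state[ 4], s1 );
    _mm256_store_si256( (__m256i*)&state[ 8], s2 );
    _mm256_store_si256( (__m256i*)&state[12], s3 );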
diff --git a/algo/lyra2/zcoin.c b/algo/lyra2/zcoin.c
new file mode 100644
index 0000000..2c7a747
--- /dev/null
+++ b/algo/lyra2/zcoin.c
@@ -0,0 +1,77 @@
+#include <memory.h>
+#include "miner.h"
+#include "algo-gate-api.h"
+#include "lyra2.h"
+
+void zcoin_hash(void *state, const void *input, uint32_t height)
+{
+        uint32_t _ALIGN(256) hash[16];
+
+        LYRA2Z(hash, 32, input, 80, input, 80, 2, height, 256);
+
+        memcpy(state, hash, 32);
+}
+
+//int scanhash_zcoin(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t height)
+int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done )
+{
+        uint32_t _ALIGN(128) hash[8];
+        uint32_t _ALIGN(128) endiandata[20];
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+        const uint32_t Htarg = ptarget[7];
+        const uint32_t first_nonce = pdata[19];
+        uint32_t nonce = first_nonce;
+
+        if (opt_benchmark)
+                ptarget[7] = 0x0000ff;
+
+        for (int i=0; i < 19; i++) {
+                be32enc(&endiandata[i], pdata[i]);
+        }
+
+        do {
+                be32enc(&endiandata[19], nonce);
+                zcoin_hash( hash, endiandata, work->height );
+
+                if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+                        work_set_target_ratio(work, hash);
+                        pdata[19] = nonce;
+                        *hashes_done = pdata[19] - first_nonce;
+                        return 1;
+                }
+                nonce++;
+
+        } while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+        pdata[19] = nonce;
+        *hashes_done = pdata[19] - first_nonce + 1;
+        return 0;
+}
+
+int64_t get_max64_0xffffLL() { return 0xffffLL; }
+
+void zcoin_set_target( struct work* work, double job_diff )
+{
+   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
+{
+   work->height = sctx->bloc_height;
+   return false;
+}
+
+bool register_zcoin_algo( algo_gate_t* gate )
+{
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->scanhash      = (void*)&scanhash_zcoin;
+  gate->hash          = (void*)&zcoin_hash;
+  gate->hash_alt      = (void*)&zcoin_hash;
+  gate->get_max64     = (void*)&get_max64_0xffffLL;
+  gate->set_target    = (void*)&zcoin_set_target;
+  gate->prevent_dupes = (void*)&zcoin_get_work_height;
+  return true;
+}
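Two of these gate hooks are worth a note. zcoin_set_target scales the stratum job difficulty by 256.0 rather than the 65536.0 the scrypt-family target used (which is why the per-algo copies of that function are deleted below), and prevent_dupes does not actually prevent anything here: it is repurposed as a per-loop callback that copies the stratum block height into the work, because zcoin_hash needs the height as a Lyra2 cost parameter. A rough paraphrase of the resulting call sequence in the miner loop; this is illustrative, not the literal cpu-miner.c code:

    // Paraphrased miner-thread flow for lyra2z (illustrative only).
    algo_gate.set_target( &work, stratum.job.diff );          // 256.0 scaling
    if ( algo_gate.prevent_dupes( &work, &stratum, thr_id ) ) // stores height,
       continue;                                              // returns false
    algo_gate.scanhash( thr_id, &work, max_nonce, &hashes_done );
    // ...scanhash_zcoin then passes work->height into zcoin_hash per nonce.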
diff --git a/algo/pluck.c b/algo/pluck.c
index 30224d9..b8de21e 100644
--- a/algo/pluck.c
+++ b/algo/pluck.c
@@ -487,24 +487,19 @@ int64_t pluck_get_max64 ()
   return 0x1ffLL;
 }

-void pluck_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
-bool pluck_miner_thread_init( )
+bool pluck_miner_thread_init( int thr_id )
 {
   scratchbuf = malloc( 128 * 1024 );
   if ( scratchbuf )
      return true;
-  applog( LOG_ERR, "Pluck buffer allocation failed");
+  applog( LOG_ERR, "Thread %u: Pluck buffer allocation failed", thr_id );
   return false;
 }

 bool register_pluck_algo( algo_gate_t* gate )
 {
   algo_not_tested();
-  gate->miner_thread_init = (void*)& pluck_miner_thread_init;
+  gate->miner_thread_init = (void*)&pluck_miner_thread_init;
   gate->scanhash   = (void*)&scanhash_pluck;
   gate->hash       = (void*)&pluck_hash;
   gate->set_target = (void*)&scrypt_set_target;
diff --git a/algo/scrypt.c b/algo/scrypt.c
index 31c6c09..fedc9a7 100644
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -774,12 +774,12 @@ int64_t scrypt_get_max64()
   return max64;
 }

-bool scrypt_miner_thread_init()
+bool scrypt_miner_thread_init( int thr_id )
 {
   scratchbuf = scrypt_buffer_alloc( opt_scrypt_n );
   if ( scratchbuf )
      return true;
-  applog( LOG_ERR, "Scrypt buffer allocation failed" );
+  applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id );
   return false;
 }
diff --git a/algo/scryptjane/scrypt-jane.c b/algo/scryptjane/scrypt-jane.c
index 8b87134..bcab680 100644
--- a/algo/scryptjane/scrypt-jane.c
+++ b/algo/scryptjane/scrypt-jane.c
@@ -231,11 +231,6 @@ void scryptjanehash(void *output, const void *input )
   scrypt_free(&YX);
 }

-void scryptjane_set_target( struct work* work, double job_diff )
-{
-  work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
 bool register_scryptjane_algo( algo_gate_t* gate )
 {
   gate->scanhash = (void*)&scanhash_scryptjane;
diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c
index d6d1f44..a175070 100644
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -49,19 +49,11 @@ int64_t yescrypt_get_max64 ()
   return 0x1ffLL;
 }

-
-void yescrypt_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
-}
-
-
 bool register_yescrypt_algo ( algo_gate_t* gate )
 {
   gate->scanhash   = (void*)&scanhash_yescrypt;
   gate->hash       = (void*)&yescrypt_hash;
   gate->hash_alt   = (void*)&yescrypthash;
-//  gate->set_target = (void*)&yescrypt_set_target;
   gate->set_target = (void*)&scrypt_set_target;
   gate->get_max64  = (void*)&yescrypt_get_max64;
   return true;
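The miner_thread_init hook now receives the thread id, which the per-thread allocators above use for clearer diagnostics. The pattern for a new algo would look like the following sketch; the names are hypothetical and not part of the patch:

    static __thread char *myalgo_scratchbuf = NULL;   // hypothetical buffer

    bool myalgo_miner_thread_init( int thr_id )
    {
       myalgo_scratchbuf = malloc( 64 * 1024 );
       if ( myalgo_scratchbuf )
          return true;
       applog( LOG_ERR, "Thread %d: myalgo buffer allocation failed", thr_id );
       return false;   // miner_thread exits when init fails
    }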
diff --git a/avxdefs.h b/avxdefs.h
index d3ad5e2..09b077f 100644
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -1,7 +1,7 @@
 // Some tools to help using AVX and AVX2

 #include <stdint.h>
-//#include <immintrin.h>
+#include <immintrin.h>

 // Use these overlays to access the same data in memory as different types
 //
@@ -15,7 +15,7 @@
 typedef union
 {
-#if defined __AVX__
+#if defined __AVX2__
  __m256i v256;
 #endif
  __m128i v128[ 2];
@@ -49,47 +49,85 @@
 uint8_t v8 [16];

 #if defined __AVX2__

-// Rotate bits in 4 uint64
+// Rotate bits in 4 uint64 (3 instructions)
 // __m256i mm256_rotr_64( __256i, int )
-#define mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \
-                                           _mm256_slli_epi64(w, 64 - c))
-//static inline __m256i mm256_rotr_64 ( __m256i w, int c)
-//{
-//   return _mm256_or_si256( _mm256_srli_epi64( w, c ),
-//                           _mm256_slli_epi64( w, 64 - c ) );
-//}

-// Rotate uint64 by one uint64
-//__m256i mm256_rotl256_1x64( _mm256i, int )
-#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 )
-#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 )
-//static inline __m256i mm256_rotl256_1x64( __m256i s )
-//{
-//   return _mm256_permute4x64_epi64( s, 0x39 );
-//}
-//static inline __m256i mm256_rotr256_1x64( __m256i s )
-//{
-//   return _mm256_permute4x64_epi64( s, 0x93 );
-//}
+#define mm256_rotr_64( w, c ) \
+   _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )
+
+#define mm256_rotl_64( w, c ) \
+   _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )

 // swap hi and lo 128 bits in 256 bit vector
 // __m256i mm256_swap128( __m256i )
-#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 )
-//static inline __m256i mm256_swap128( __m256i s )
-//{
-//   return _mm256_permute2f128_si256( s, s, 1 );
-//}
+#define mm256_swap128( w ) \
+   _mm256_permute2f128_si256( w, w, 1 )

-#endif
+// Rotate 256 bits by 64 bits (4 uint64 by one uint64)
+//__m256i mm256_rotl256_1x64( _mm256i, int )
+#define mm256_rotl256_1x64( w ) \
+   _mm256_permute4x64_epi64( w, 0x39 )
+
+#define mm256_rotr256_1x64( w ) \
+   _mm256_permute4x64_epi64( w, 0x93 )
+
+// shift 256 bits by n*64 bits (4 uint64 by n uint64)
+#define mm256_slli256_1x64( w ) \
+   _mm256_and_si256( mm256_rotl256_1x64( w ), \
+                     _mm256_set_epi64x( 0, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull ) )
+//                   _mm256_set_epi64x( 0xffffffffffffffffull, \
+//                                      0xffffffffffffffffull, \
+//                                      0xffffffffffffffffull, \
+//                                      0 ) )
+
+#define mm256_slli256_2x64( w ) \
+   _mm256_and_si256( mm256_swap128( w ), \
+                     _mm256_set_epi64x( 0xffffffffffffffffull, \
+                                        0xffffffffffffffffull, \
+                                        0, \
+                                        0 ) )
+
+#define mm256_slli256_3x64( w ) \
+   _mm256_and_si256( mm256_rotr256_1x64( w ), \
+                     _mm256_set_epi64x( 0xffffffffffffffffull, \
+                                        0, \
+                                        0, \
+                                        0 ) )
+
+#define mm256_srli256_1x64( w ) \
+   _mm256_and_si256( mm256_rotr256_1x64( w ), \
+                     _mm256_set_epi64x( 0, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull ) )
+
+#define mm256_srli256_2x64( w ) \
+   _mm256_and_si256( mm256_swap128( w ), \
+                     _mm256_set_epi64x( 0, \
+                                        0, \
+                                        0xffffffffffffffffull, \
+                                        0xffffffffffffffffull ))
+
+#define mm256_srli256_3x64( w ) \
+   _mm256_and_si256( mm256_rotl256_1x64( w ), \
+                     _mm256_set_epi64x( 0xffffffffffffffffull, \
+                                        0, \
+                                        0, \
+                                        0 ) )
+//                   _mm256_set_epi64x( 0, \
+//                                      0, \
+//                                      0, \
+//                                      0xffffffffffffffffull ) )
+
+#endif  // AVX2

 // rotate bits in 2 uint64
 // _m128i mm_rotr_64( __m128i, int )
 #define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
                                      _mm_slli_epi64(w, 64 - c))
-//static inline __m128i mm_rotr_64( __m128i w, int c )
-//{
-//   _mm_or_si128( _mm_srli_epi64( w, c ),
-//                 _mm_slli_epi64( w, 64 - c ) );
-//}

 // swap 128 bit source vectors
 // void mm128_swap128( __m128i, __m128i )
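Since the rewritten helpers are plain macros over single intrinsics, they are easy to sanity-check in isolation. A throwaway harness, mine and not part of the patch (build with -mavx2); the macro body is copied from avxdefs.h above:

    #include <stdio.h>
    #include <stdint.h>
    #include <immintrin.h>

    // Same definition as mm256_rotr_64 in avxdefs.h above.
    #define mm256_rotr_64( w, c ) \
       _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )

    int main()
    {
       uint64_t in[4] __attribute__ ((aligned (32))) = { 2, 4, 8, 16 };
       uint64_t out[4] __attribute__ ((aligned (32)));
       __m256i v = _mm256_load_si256( (__m256i*)in );
       _mm256_store_si256( (__m256i*)out, mm256_rotr_64( v, 1 ) );
       // each 64-bit lane rotated right by one bit: expect 1 2 4 8
       printf( "%llu %llu %llu %llu\n",
               (unsigned long long)out[0], (unsigned long long)out[1],
               (unsigned long long)out[2], (unsigned long long)out[3] );
       return 0;
    }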
diff --git a/configure.ac b/configure.ac
index a3f77eb..52b1278 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.4.8-dev])
+AC_INIT([cpuminer-opt], [3.4.9-dev])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index f055c25..9a057a4 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -79,7 +79,7 @@ bool opt_debug_diff = false;
 bool opt_protocol = false;
 bool opt_benchmark = false;
 bool opt_redirect = true;
-bool opt_showdiff = false;
+bool opt_showdiff = true;
 bool opt_extranonce = true;
 bool want_longpoll = true;
 bool have_longpoll = false;
@@ -306,8 +306,8 @@ static bool work_decode(const json_t *val, struct work *work)
 {
    if ( !algo_gate.work_decode( val, work ) )
       return false;
-   if ((opt_showdiff || opt_max_diff > 0.) && !allow_mininginfo)
-        algo_gate.calc_network_diff( work );
+   if ( !allow_mininginfo )
+        net_diff = algo_gate.calc_network_diff( work );
    work->targetdiff = target_to_diff(work->target);
    // for api stats, on longpoll pools
    stratum_diff = work->targetdiff;
@@ -781,10 +781,6 @@ static int share_result( int result, struct work *work, const char *reason )
                  (uint32_t)cpu_temp(0) );
 #endif

-//   applog(LOG_NOTICE, "accepted: %lu/%lu (%s%%), %s %sH, %s %sH/s %s",
-//       accepted_count, total_submits, accepted_rate_s,
-//       hc, hc_units, hr, hr_units, sres );
-
    if (reason)
    {
       applog(LOG_WARNING, "reject reason: %s", reason);
@@ -1467,7 +1463,7 @@ void std_build_extraheader( struct work* work, struct stratum_ctx* sctx )
    work->data[31] = 0x00000280;
 }

-void std_calc_network_diff( struct work* work )
+double std_calc_network_diff( struct work* work )
 {
    // sample for diff 43.281 : 1c05ea29
    // todo: endian reversed on longpoll could be zr5 specific...
@@ -1477,11 +1473,14 @@ void std_calc_network_diff( struct work* work )
    uint32_t bits = ( nbits & 0xffffff );
    int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
    int m;
-   net_diff = (double)0x0000ffff / (double)bits;
+   double d = (double)0x0000ffff / (double)bits;
    for ( m = shift; m < 29; m++ )
-       net_diff *= 256.0;
+       d *= 256.0;
    for ( m = 29; m < shift; m++ )
-       net_diff /= 256.0;
+       d /= 256.0;
+   if ( opt_debug_diff )
+      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
+   return d;
 }

 uint32_t* std_get_nonceptr( uint32_t *work_data )
@@ -1501,7 +1500,8 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
    uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );

    if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
-     && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) )
+     && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
+        || ( work->job_id != g_work->job_id ) ) )
    {
      work_free( work );
      work_copy( work, g_work );
@@ -1518,6 +1518,7 @@ void jr2_get_new_work( struct work* work, struct work* g_work, int thr_id,
                        uint32_t *end_nonce_ptr )
 {
    uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
+   // byte data[ 0..38, 43..75 ], skip over misaligned nonce [39..42]
    if ( memcmp( work->data, g_work->data, algo_gate.nonce_index )
      || memcmp( ((uint8_t*) work->data) + JR2_WORK_CMP_INDEX_2,
@@ -1611,9 +1612,9 @@ static void *miner_thread( void *userdata )
       }
    }

-   if ( !algo_gate.miner_thread_init() )
+   if ( !algo_gate.miner_thread_init( thr_id ) )
    {
-      applog( LOG_ERR, "FAIL: thread %u failed to initialize");
+      applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id );
      exit (1);
    }
@@ -1660,6 +1661,8 @@ static void *miner_thread( void *userdata )
      }  // do_this_thread
      algo_gate.resync_threads( &work );

+     // prevent dupes is called on every loop and has useful args so it
+     // is being used by zcoin to pass along the work height.
      if ( algo_gate.prevent_dupes( &work, &stratum, thr_id ) )
         continue;
      // prevent scans before a job is received
@@ -2075,8 +2078,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
    algo_gate.build_extraheader( g_work, sctx );

-   if ( opt_showdiff || opt_max_diff > 0. )
-      algo_gate.calc_network_diff( g_work );
+   net_diff = algo_gate.calc_network_diff( g_work );
    algo_gate.set_work_data_endian( g_work );
    pthread_mutex_unlock( &sctx->work_lock );
@@ -2161,6 +2163,7 @@ static void *stratum_thread(void *userdata )
            sleep(opt_fail_pause);
        }

+
        if (jsonrpc_2)
        {
            work_free(&g_work);
@@ -2168,15 +2171,15 @@ static void *stratum_thread(void *userdata )
        }
    }

-       if (stratum.job.job_id &&
-           (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
+   if (stratum.job.job_id &&
+       (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
    {
        pthread_mutex_lock(&g_work_lock);
        algo_gate.stratum_gen_work( &stratum, &g_work );
        time(&g_work_time);
        pthread_mutex_unlock(&g_work_lock);

-       if (stratum.job.clean || jsonrpc_2)
+       if (stratum.job.clean || jsonrpc_2)
        {
            static uint32_t last_bloc_height;
            if (!opt_quiet && last_bloc_height != stratum.bloc_height)
@@ -2554,7 +2557,7 @@ void parse_arg(int key, char *arg )
        opt_extranonce = false;
        break;
     case 1013:
-       opt_showdiff = true;
+       opt_showdiff = false;
        break;
     case 1016:            /* --coinbase-addr */
        pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg);
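Returning the value from calc_network_diff, instead of having every implementation write the net_diff global, also makes the conversion testable on its own. The following is the same math as std_calc_network_diff, minus the globals and the byte-order handling, taking nbits in the conventional big-endian 0xEEMMMMMM compact form (standalone harness, mine, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    static double nbits_to_diff( uint32_t nbits )
    {
       uint32_t bits  = nbits & 0xffffff;    // mantissa
       int      shift = nbits >> 24;         // exponent, 0x1c = 28 on mainnet
       double   d     = (double)0x0000ffff / (double)bits;
       int m;
       for ( m = shift; m < 29; m++ ) d *= 256.0;
       for ( m = 29; m < shift; m++ ) d /= 256.0;
       return d;
    }

    int main()
    {
       // sample from the code comment: 0x1c05ea29 -> diff 43.281...
       printf( "%f\n", nbits_to_diff( 0x1c05ea29 ) );
       return 0;
    }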
diff --git a/m4/placeholder-to-prevent-git-from-deleting-this-directory b/m4/placeholder-to-prevent-git-from-deleting-this-directory
deleted file mode 100644
index e69de29..0000000
diff --git a/miner.h b/miner.h
index 17ed0bd..b619ef2 100644
--- a/miner.h
+++ b/miner.h
@@ -492,6 +492,7 @@ enum algos {
        ALGO_LUFFA,
        ALGO_LYRA2RE,
        ALGO_LYRA2REV2,
+       ALGO_LYRA2Z,
        ALGO_M7M,
        ALGO_MYR_GR,
        ALGO_NEOSCRYPT,
@@ -546,6 +547,7 @@ static const char *algo_names[] = {
        "luffa",
        "lyra2re",
        "lyra2rev2",
+       "lyra2z",
        "m7m",
        "myr-gr",
        "neoscrypt",
@@ -654,6 +656,7 @@ Options:\n\
                           luffa         Luffa\n\
                           lyra2re       lyra2\n\
                           lyra2rev2     lyrav2\n\
+                          lyra2z        Zcoin (XZC)\n\
                           m7m           Magi (XMG)\n\
                           myr-gr        Myriad-Groestl\n\
                           neoscrypt     NeoScrypt(128, 2, 1)\n\
@@ -700,6 +703,7 @@ Options:\n\
      --randomize       Randomize scan range start to reduce duplicates\n\
  -f, --diff-factor     Divide req. difficulty by this factor (std is 1.0)\n\
  -m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\
+     --hide-diff       Do not display changes in difficulty\n\
  -n, --nfactor         neoscrypt N-Factor\n\
      --coinbase-addr=ADDR  payout address for solo mining\n\
      --coinbase-sig=TEXT   data to insert in the coinbase when possible\n\
@@ -763,6 +767,7 @@ static struct option const options[] = {
        { "diff-factor", 1, NULL, 'f' },
        { "diff", 1, NULL, 'f' }, // deprecated (alias)
        { "diff-multiplier", 1, NULL, 'm' },
+       { "hide-diff", 0, NULL, 1013 },
        { "help", 0, NULL, 'h' },
        { "nfactor", 1, NULL, 'n' },
        { "no-gbt", 0, NULL, 1011 },
@@ -783,7 +788,6 @@ static struct option const options[] = {
        { "retry-pause", 1, NULL, 'R' },
        { "randomize", 0, NULL, 1024 },
        { "scantime", 1, NULL, 's' },
-       { "show-diff", 0, NULL, 1013 },
 #ifdef HAVE_SYSLOG_H
        { "syslog", 0, NULL, 'S' },
 #endif
diff --git a/util.c b/util.c
index a70abc6..07cab73 100644
--- a/util.c
+++ b/util.c
@@ -1630,7 +1630,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
            hashrate += thr_hashrates[i];
        pthread_mutex_unlock(&stats_lock);
        double diff = trunc( ( ((double)0xffffffff) / target ) );
-       if (!opt_quiet)
+       if ( opt_showdiff )   // xmr pool diff can change a lot...
            applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
        stratum_diff = diff;