This commit is contained in:
Jay D Dee
2017-02-12 12:43:08 -05:00
parent 1ee41348f4
commit 8efab74183
20 changed files with 1891 additions and 1294 deletions

View File

@@ -21,6 +21,7 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <mm_malloc.h>
#include "compat.h"
#include "lyra2.h"
#include "sponge.h"
@@ -45,10 +46,9 @@
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
*/
// Lyra2RE & Lyra2REv2, nRows must be a power of 2
int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const void *salt, uint64_t saltlen, uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -71,26 +71,21 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
/*
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = malloc(i);
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
memset(wholeMatrix, 0, i);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
if (memMatrix == NULL)
return -1;
//Places the pointers in the correct positions
#endif
*/
uint64_t *ptrWord = wholeMatrix;
for (i = 0; i < nRows; i++)
{
memMatrix[i] = ptrWord;
ptrWord += ROW_LEN_INT64;
}
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
@@ -140,31 +135,36 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
initState(state);
initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
@@ -190,12 +190,14 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//Selects a pseudorandom index row*
//-----------------------------------------------
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
prev = row;
@@ -210,22 +212,17 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, memMatrix[rowa]);
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
//================== Freeing the memory =============================//
free(memMatrix);
free(wholeMatrix);
// free(wholeMatrix);
return 0;
}
// Zcoin, nRows may be any value
int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const void *salt, uint64_t saltlen, uint64_t timeCost,
uint64_t nRows, uint64_t nCols )
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols )
{
//========================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -244,33 +241,27 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
/*
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
uint64_t *wholeMatrix = malloc(i);
if (wholeMatrix == NULL)
if (wholeMatrix == NULL)
return -1;
memset(wholeMatrix, 0, i);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
if (memMatrix == NULL)
return -1;
//Places the pointers in the correct positions
uint64_t *ptrWord = wholeMatrix;
for (i = 0; i < nRows; i++)
{
memMatrix[i] = ptrWord;
ptrWord += ROW_LEN_INT64;
}
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
memset(wholeMatrix, 0, i);
#endif
*/
//==== Getting the password + salt + basil padded with 10*1 ============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
byte *ptrByte = (byte*) wholeMatrix;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
@@ -281,7 +272,6 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
@@ -304,11 +294,15 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//=================== Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
// if (state == NULL) {
// return -1;
// }
initState( state );
//============================== Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
uint64_t *ptrWord = wholeMatrix;
for ( i = 0; i < nBlocksInput; i++ )
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
@@ -316,31 +310,28 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
}
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, memMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1( state, memMatrix[0], memMatrix[1], nCols );
reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, memMatrix[prev], memMatrix[rowa],
memMatrix[row], nCols );
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window where visited.
if (rowa == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
//Checks if all rows in the window where visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
} while (row < nRows);
//======================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
@@ -351,20 +342,19 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
do {
//Selects a pseudorandom index row*
//----------------------------------------------------------------------
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-----------------------------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, memMatrix[prev], memMatrix[rowa],
memMatrix[row], nCols );
reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//---------------------------------------------------------------
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//--------------------------------------------------------------------
@@ -373,15 +363,190 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//========================= Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock( state, memMatrix[rowa] );
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
//Squeezes the key
squeeze( state, K, kLen );
//====================== Freeing the memory =============================//
free( memMatrix );
free( wholeMatrix );
// _mm_free(state);
// _mm_free( wholeMatrix );
return 0;
}
// Lyra2RE variant: nRows must be a power of 2 (called with nRows=8, nCols=8).
// Derives kLen bytes of key material into K from (pwd, salt), performing
// timeCost passes over an nRows x nCols memory matrix allocated per call.
// Returns 0 on success, -1 if the matrix cannot be allocated.
int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
    uint64_t pwdlen, const void *salt, uint64_t saltlen,
    uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
{
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
   int64_t row = 2;     //index of row to be processed
   int64_t prev = 1;    //index of prev (last row ever computed/modified)
   int64_t rowa = 0;    //index of row* (a previous row, deterministically
                        //picked during Setup, randomly while Wandering)
   int64_t tau;         //Time Loop iterator
   int64_t step = 1;    //Visitation step (Setup and Wandering phases)
   int64_t window = 2;  //Visitation window (rows revisitable during Setup)
   int64_t gap = 1;     //Modifier to the step, assuming the values 1 or -1
   int64_t i;           //auxiliary iteration counter
   int64_t v64;         // 64bit var for memcpy
   //====================================================================/

   //=== Initializing the Memory Matrix and pointers to it =============//
   //Tries to allocate enough space for the whole memory matrix
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   // for Lyra2REv2, nCols = 4, v1 was using 8
   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;

   i = (int64_t)ROW_LEN_BYTES * nRows;
   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
   if (wholeMatrix == NULL)
      return -1;

#if defined (__AVX2__)
   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
   memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
   memset(wholeMatrix, 0, i);
#endif

   uint64_t *ptrWord = wholeMatrix;

   //=== Getting the password + salt + basil padded with 10*1 ==========//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
   //but this ensures that the password copied locally will be overwritten as soon as possible

   //First, we clean enough blocks for the password, salt, basil and padding
   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;

   byte *ptrByte = (byte*) wholeMatrix;

   //Prepends the password
   memcpy(ptrByte, pwd, pwdlen);
   ptrByte += pwdlen;

   //Concatenates the salt
   memcpy(ptrByte, salt, saltlen);
   ptrByte += saltlen;

   //Zeroes the remainder of the input blocks past pwd||salt
   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
                       - (saltlen + pwdlen) );

   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
   memcpy(ptrByte, &kLen, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = pwdlen;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = saltlen;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = timeCost;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = nRows;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = nCols;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);

   //Now comes the padding
   *ptrByte = 0x80;               //first byte of padding: right after the password
   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
   *ptrByte ^= 0x01;              //last byte of padding: at the end of the last incomplete block

   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
   initState( state );

   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
   ptrWord = wholeMatrix;
   for (i = 0; i < nBlocksInput; i++)
   {
      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
      ptrWord += BLOCK_LEN;                    //goes to next block of pad(pwd || salt || basil)
   }

   //Initializes M[0] and M[1]
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                      nCols);

   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
                             &wholeMatrix[rowa*ROW_LEN_INT64],
                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
      //updates the value of row* (deterministically picked during Setup))
      rowa = (rowa + step) & (window - 1);
      //update prev: it now points to the last row ever computed
      prev = row;
      //updates row: goes to the next row to be computed
      row++;

      //Checks if all rows in the window where visited.
      if (rowa == 0)
      {
         step = window + gap; //changes the step: approximately doubles its value
         window *= 2;         //doubles the size of the re-visitation window
         gap = -gap;          //inverts the modifier to the step
      }
   } while (row < nRows);

   //===================== Wandering Phase =============================//
   row = 0; //Resets the visitation to the first row of the memory matrix
   for (tau = 1; tau <= timeCost; tau++)
   {
      //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
      step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
      do
      {
         //Selects a pseudorandom index row*
         //-----------------------------------------------
         rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
         //rowa = state[0] % nRows;                 //(USE THIS FOR THE "GENERIC" CASE)
         //-------------------------------------------

         //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
         reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
                           &wholeMatrix[rowa*ROW_LEN_INT64],
                           &wholeMatrix[row*ROW_LEN_INT64], nCols );

         //update prev: it now points to the last row ever computed
         prev = row;

         //updates row: goes to the next row to be computed
         //----------------------------------------------------
         row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
         //row = (row + step) % nRows;                 //(USE THIS FOR THE "GENERIC" CASE)
         //----------------------------------------------------

      } while (row != 0);
   }

   //===================== Wrap-up Phase ===============================//
   //Absorbs the last block of the memory matrix
   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);

   //Squeezes the key
   squeeze(state, K, (unsigned int) kLen);

   //================== Freeing the memory =============================//
   // Memory obtained from _mm_malloc must be released with _mm_free;
   // calling free() on it (as before) is a mismatched-allocator bug.
   _mm_free(wholeMatrix);

   return 0;
}

View File

@@ -37,10 +37,20 @@ typedef unsigned char byte;
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif
int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const void *salt, uint64_t saltlen, uint64_t timeCost,
uint64_t nRows, uint64_t nCols );
int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const void *salt, uint64_t saltlen, uint64_t timeCost,
uint64_t nRows, uint64_t nCols );
#define BLOCK_LEN_M256I (BLOCK_LEN_INT64 / 4 )
#define BLOCK_LEN_M128I (BLOCK_LEN_INT64 / 2 )
int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV2( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
#endif /* LYRA2_H_ */

View File

@@ -7,11 +7,14 @@
#include "algo/keccak/sph_keccak.h"
#include "lyra2.h"
#include "algo-gate-api.h"
#include "avxdefs.h"
#ifndef NO_AES_NI
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif
//__thread uint64_t* lyra2re_wholeMatrix;
typedef struct {
sph_blake256_context blake;
sph_keccak256_context keccak;
@@ -24,6 +27,7 @@ typedef struct {
} lyra2re_ctx_holder;
lyra2re_ctx_holder lyra2re_ctx;
static __thread sph_blake256_context lyra2_blake_mid;
void init_lyra2re_ctx()
{
@@ -37,6 +41,12 @@ void init_lyra2re_ctx()
#endif
}
// Precompute the blake256 midstate over the first 64 bytes of the 80-byte
// header, so each per-nonce hash only has to process the 16-byte tail.
void lyra2_blake256_midstate( const void* input )
{
    // Struct copy of the freshly initialized blake context.
    lyra2_blake_mid = lyra2re_ctx.blake;
    sph_blake256( &lyra2_blake_mid, input, 64 );
}
void lyra2re_hash(void *state, const void *input)
{
lyra2re_ctx_holder ctx;
@@ -47,13 +57,19 @@ void lyra2re_hash(void *state, const void *input)
#define hashA hash
#define hashB hash+16
sph_blake256(&ctx.blake, input, 80);
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
memcpy( &ctx.blake, &lyra2_blake_mid, sizeof lyra2_blake_mid );
sph_blake256( &ctx.blake, input + 64, 16 );
// sph_blake256(&ctx.blake, input, 80);
sph_blake256_close(&ctx.blake, hashA);
sph_keccak256(&ctx.keccak, hashA, 32);
sph_keccak256_close(&ctx.keccak, hashB);
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
LYRA2RE( hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_skein256(&ctx.skein, hashA, 32);
sph_skein256_close(&ctx.skein, hashB);
@@ -81,6 +97,8 @@ int scanhash_lyra2re(int thr_id, struct work *work,
swab32_array( endiandata, pdata, 20 );
lyra2_blake256_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2re_hash(hash, endiandata);
@@ -112,10 +130,34 @@ void lyra2re_set_target ( struct work* work, double job_diff )
work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
}
/*
bool lyra2re_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
lyra2re_wholeMatrix = _mm_malloc( i, 64 );
if ( lyra2re_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)lyra2re_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)lyra2re_wholeMatrix, i/16 );
#else
memset( lyra2re_wholeMatrix, 0, i );
#endif
return true;
}
*/
bool register_lyra2re_algo( algo_gate_t* gate )
{
init_lyra2re_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
// gate->miner_thread_init = (void*)&lyra2re_thread_init;
gate->scanhash = (void*)&scanhash_lyra2re;
gate->hash = (void*)&lyra2re_hash;
gate->hash_alt = (void*)&lyra2re_hash;

View File

@@ -8,10 +8,11 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "lyra2.h"
#include "avxdefs.h"
__thread uint64_t* l2v2_wholeMatrix;
typedef struct {
cubehashParam cube1;
@@ -23,7 +24,8 @@ typedef struct {
} lyra2v2_ctx_holder;
lyra2v2_ctx_holder lyra2v2_ctx;
static lyra2v2_ctx_holder lyra2v2_ctx;
static __thread sph_blake256_context l2v2_blake_mid;
void init_lyra2rev2_ctx()
{
@@ -35,14 +37,23 @@ void init_lyra2rev2_ctx()
sph_bmw256_init( &lyra2v2_ctx.bmw );
}
// Precompute the blake256 midstate over the first 64 bytes of the 80-byte
// header; only the nonce-bearing 16-byte tail changes between hashes.
void l2v2_blake256_midstate( const void* input )
{
    // Struct copy of the freshly initialized blake context.
    l2v2_blake_mid = lyra2v2_ctx.blake;
    sph_blake256( &l2v2_blake_mid, input, 64 );
}
void lyra2rev2_hash( void *state, const void *input )
{
lyra2v2_ctx_holder ctx;
memcpy( &ctx, &lyra2v2_ctx, sizeof(lyra2v2_ctx) );
uint32_t _ALIGN(128) hashA[8], hashB[8];
sph_blake256( &ctx.blake, input, 80 );
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
memcpy( &ctx.blake, &l2v2_blake_mid, sizeof l2v2_blake_mid );
sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
sph_blake256_close( &ctx.blake, hashA );
sph_keccak256( &ctx.keccak, hashA, 32 );
@@ -50,18 +61,14 @@ void lyra2rev2_hash( void *state, const void *input )
cubehashUpdateDigest( &ctx.cube1, (byte*) hashA,
(const byte*) hashB, 32 );
// cubehashUpdate( &ctx.cube1, (const byte*) hashB,32 );
// cubehashDigest( &ctx.cube1, (byte*)hashA );
LYRA2( hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 );
sph_skein256( &ctx.skein, hashA, 32 );
sph_skein256_close( &ctx.skein, hashB );
cubehashUpdateDigest( &ctx.cube2, (byte*) hashA,
(const byte*) hashB, 32 );
// cubehashUpdate( &ctx.cube2, (const byte*) hashB,32 );
// cubehashDigest( &ctx.cube2, (byte*)hashA );
sph_bmw256( &ctx.bmw, hashA, 32 );
sph_bmw256_close( &ctx.bmw, hashB );
@@ -85,6 +92,8 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
swab32_array( endiandata, pdata, 20 );
l2v2_blake256_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2rev2_hash(hash, endiandata);
@@ -112,10 +121,33 @@ void lyra2rev2_set_target( struct work* work, double job_diff )
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
// Per-thread one-time allocation of the Lyra2REv2 memory matrix
// (nRows = 4, nCols = 4). Returns false on allocation failure.
bool lyra2rev2_thread_init()
{
  const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
  const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

  // int64_t, not int: avoids narrowing the byte count.
  int64_t i = ROW_LEN_BYTES * 4; // nRows
  l2v2_wholeMatrix = _mm_malloc( i, 64 );

  if ( l2v2_wholeMatrix == NULL )
    return false;

#if defined (__AVX2__)
  memset_zero_m256i( (__m256i*)l2v2_wholeMatrix, i/32 );
#elif defined(__AVX__)
  memset_zero_m128i( (__m128i*)l2v2_wholeMatrix, i/16 );
#else
  memset( l2v2_wholeMatrix, 0, i );
#endif
  return true;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
init_lyra2rev2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
gate->hash_alt = (void*)&lyra2rev2_hash;

File diff suppressed because it is too large Load Diff

View File

@@ -51,24 +51,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#if defined __AVX2__
// only available with avx2
// init vectors from memory
// returns void, updates defines and inits implicit args a, b, c, d
#define LYRA_INIT_AVX2 \
__m256i a[4]; \
a[0] = _mm256_load_si256( (__m256i*)(&v[ 0]) ); \
a[1] = _mm256_load_si256( (__m256i*)(&v[ 4]) ); \
a[2] = _mm256_load_si256( (__m256i*)(&v[ 8]) ); \
a[3] = _mm256_load_si256( (__m256i*)(&v[12]) );
// save to memory
// returns void
#define LYRA_CLOSE_AVX2 \
_mm256_store_si256( (__m256i*)(&v[ 0]), a[0] ); \
_mm256_store_si256( (__m256i*)(&v[ 4]), a[1] ); \
_mm256_store_si256( (__m256i*)(&v[ 8]), a[2] ); \
_mm256_store_si256( (__m256i*)(&v[12]), a[3] );
// process 4 rows in parallel
// process 4 columns in parallel
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
@@ -107,28 +90,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#else
// only available with avx
#define LYRA_INIT_AVX \
__m128i a0[4], a1[4]; \
a0[0] = _mm_load_si128( (__m128i*)(&v[ 0]) ); \
a1[0] = _mm_load_si128( (__m128i*)(&v[ 2]) ); \
a0[1] = _mm_load_si128( (__m128i*)(&v[ 4]) ); \
a1[1] = _mm_load_si128( (__m128i*)(&v[ 6]) ); \
a0[2] = _mm_load_si128( (__m128i*)(&v[ 8]) ); \
a1[2] = _mm_load_si128( (__m128i*)(&v[10]) ); \
a0[3] = _mm_load_si128( (__m128i*)(&v[12]) ); \
a1[3] = _mm_load_si128( (__m128i*)(&v[14]) );
#define LYRA_CLOSE_AVX \
_mm_store_si128( (__m128i*)(&v[ 0]), a0[0] ); \
_mm_store_si128( (__m128i*)(&v[ 2]), a1[0] ); \
_mm_store_si128( (__m128i*)(&v[ 4]), a0[1] ); \
_mm_store_si128( (__m128i*)(&v[ 6]), a1[1] ); \
_mm_store_si128( (__m128i*)(&v[ 8]), a0[2] ); \
_mm_store_si128( (__m128i*)(&v[10]), a1[2] ); \
_mm_store_si128( (__m128i*)(&v[12]), a0[3] ); \
_mm_store_si128( (__m128i*)(&v[14]), a1[3] );
// process 2 rows in parallel
// process 2 columns in parallel
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
@@ -140,68 +102,35 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
c = _mm_add_epi64( c, d ); \
b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX \
G_2X64( a0[0], a0[1], a0[2], a0[3] ); \
G_2X64( a1[0], a1[1], a1[2], a1[3] ); \
mm128_rotl256_1x64( a0[1], a1[1] ); \
mm128_swap128( a0[2], a1[2] ); \
mm128_rotr256_1x64( a0[3], a1[3] ); \
G_2X64( a0[0], a0[1], a0[2], a0[3] ); \
G_2X64( a1[0], a1[1], a1[2], a1[3] ); \
mm128_rotr256_1x64( a0[1], a1[1] ); \
mm128_swap128( a0[2], a1[2] ); \
mm128_rotl256_1x64( a0[3], a1[3] );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rotl256_1x64( s2, s3 ); \
mm128_swap128( s4, s5 ); \
mm128_rotr256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rotr256_1x64( s2, s3 ); \
mm128_swap128( s4, s5 ); \
mm128_rotl256_1x64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
#endif // AVX2
/*
#if defined __AVX__
// can coexist with AVX2
// rotate each uint64 c bits
// _m128i
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
_mm_slli_epi64(w, 64 - c))
// swap 128 bit source vectors, equivalent of rotating 256 bits by 128 bits
// void
#define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
s1 = _mm_xor_si128(s0, s1); \
s0 = _mm_xor_si128(s0, s1);
// swap uint64 in 128 bit source vector, equivalent of rotating 128 bits by
// 64 bits (8 bytes)
// __m128i
#define mm128_swap64(s) _mm_or_si128( _mm_slli_si128( s, 8 ), \
_mm_srli_si128( s, 8 ) )
// rotate 2 128 bit vectors as one 256 vector by 1 uint64, very inefficient
// returns void, args updated
#define mm128_rotl256_1x64(s0, s1) do { \
__m128i t; \
s0 = mm128_swap64( s0); \
s1 = mm128_swap64( s1); \
t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s0 = t; \
} while(0)
#define mm128_rotr256_1x64(s0, s1) do { \
__m128i t; \
s0 = mm128_swap64( s0); \
s1 = mm128_swap64( s1); \
t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s0 = t; \
} while(0)
#endif // AVX
*/
// Scalar
//Blake2b's G function
#define G(r,i,a,b,c,d) \

View File

@@ -1,20 +1,40 @@
#include <memory.h>
#include <mm_malloc.h>
#include "miner.h"
#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"
void zcoin_hash(void *state, const void *input, uint32_t height)
__thread uint64_t* zcoin_wholeMatrix;
static __thread sph_blake256_context zcoin_blake_mid;
// Precompute the blake256 midstate over the first 64 bytes of the 80-byte
// header; only the final 16 (nonce-bearing) bytes change per hash.
// NOTE: the previous text here still contained leftover lines from the old
// zcoin_hash body (referencing undeclared `state`/`hash` and an obsolete
// LYRA2Z call) which could not compile; they are removed.
void zcoin_midstate( const void* input )
{
   sph_blake256_init( &zcoin_blake_mid );
   sph_blake256( &zcoin_blake_mid, input, 64 );
}
// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
// Zcoin (Lyra2Z) hash: blake256 over the 80-byte header (resumed from the
// precomputed midstate), then LYRA2Z with fixed params t=8, nRows=8,
// nCols=8 using the per-thread matrix. Writes 32 bytes to state.
void zcoin_hash(void *state, const void *input )
{
   uint32_t _ALIGN(256) hash[16];
   sph_blake256_context ctx_blake;

   memcpy( &ctx_blake, &zcoin_blake_mid, sizeof zcoin_blake_mid );
   // Cast before the offset: arithmetic on void* is a GCC extension, not
   // standard C (matches the (uint8_t*)input + midlen style used elsewhere).
   sph_blake256( &ctx_blake, (const uint8_t*)input + 64, 16 );
   sph_blake256_close( &ctx_blake, hash );

   LYRA2Z( zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);

   memcpy(state, hash, 32);
}
//int scanhash_zcoin(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t height)
int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
@@ -25,6 +45,7 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if (opt_benchmark)
ptarget[7] = 0x0000ff;
@@ -32,9 +53,11 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]);
}
zcoin_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
zcoin_hash( hash, endiandata, work->height );
zcoin_hash( hash, endiandata );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
@@ -57,22 +80,45 @@ void zcoin_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
/*
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
*/
// Per-thread one-time allocation of the Lyra2Z memory matrix
// (nRows = 8, nCols = 8). Returns false on allocation failure.
bool zcoin_thread_init()
{
  const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
  const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

  // int64_t, not int: avoids narrowing the byte count.
  int64_t i = ROW_LEN_BYTES * 8; // nRows
  zcoin_wholeMatrix = _mm_malloc( i, 64 );

  if ( zcoin_wholeMatrix == NULL )
    return false;

#if defined (__AVX2__)
  memset_zero_m256i( (__m256i*)zcoin_wholeMatrix, i/32 );
#elif defined(__AVX__)
  memset_zero_m128i( (__m128i*)zcoin_wholeMatrix, i/16 );
#else
  memset( zcoin_wholeMatrix, 0, i );
#endif
  return true;
}
// Wire up the algo gate for Zcoin (Lyra2Z).
bool register_zcoin_algo( algo_gate_t* gate )
{
  gate->scanhash          = (void*)&scanhash_zcoin;
  gate->hash              = (void*)&zcoin_hash;
  gate->hash_alt          = (void*)&zcoin_hash;
  gate->miner_thread_init = (void*)&zcoin_thread_init;
  gate->set_target        = (void*)&zcoin_set_target;
  gate->get_max64         = (void*)&get_max64_0xffffLL;
  gate->optimizations     = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  // prevent_dupes (zcoin_get_work_height) is intentionally disabled.
  return true;
};

View File

@@ -2,13 +2,15 @@
#include "miner.h"
#include "algo-gate-api.h"
#include "lyra2.h"
#include "avxdefs.h"
__thread uint64_t* zoin_wholeMatrix;
// Zoin (Lyra2Z) hash over the 80-byte header with fixed params t=2,
// nRows=330, nCols=256, using the per-thread matrix. Writes 32 bytes to
// state. `height` is unused (parameters are fixed); kept for the gate
// signature. NOTE: a leftover old-signature LYRA2Z call (diff residue,
// wrong argument count against the new prototype) is removed.
void zoin_hash(void *state, const void *input, uint32_t height)
{
   uint32_t _ALIGN(256) hash[16];

   LYRA2Z( zoin_wholeMatrix, hash, 32, input, 80, input, 80, 2, 330, 256);

   memcpy(state, hash, 32);
}
@@ -53,22 +55,45 @@ void zoin_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
/*
bool zoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
*/
// Per-thread one-time allocation of the Zoin Lyra2Z memory matrix
// (nRows = 330, nCols = 256). Returns false on allocation failure.
bool zoin_thread_init()
{
  const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 256; // nCols
  const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

  // int64_t, not int: the ~8 MB byte count should not be narrowed.
  int64_t i = ROW_LEN_BYTES * 330; // nRows
  zoin_wholeMatrix = _mm_malloc( i, 64 );

  if ( zoin_wholeMatrix == NULL )
    return false;

#if defined (__AVX2__)
  memset_zero_m256i( (__m256i*)zoin_wholeMatrix, i/32 );
#elif defined(__AVX__)
  memset_zero_m128i( (__m128i*)zoin_wholeMatrix, i/16 );
#else
  memset( zoin_wholeMatrix, 0, i );
#endif
  return true;
}
// Wire up the algo gate for Zoin (Lyra2Z, 330x256 matrix).
bool register_zoin_algo( algo_gate_t* gate )
{
  gate->scanhash          = (void*)&scanhash_zoin;
  gate->hash              = (void*)&zoin_hash;
  gate->hash_alt          = (void*)&zoin_hash;
  gate->miner_thread_init = (void*)&zoin_thread_init;
  gate->set_target        = (void*)&zoin_set_target;
  gate->get_max64         = (void*)&get_max64_0xffffLL;
  gate->optimizations     = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  // prevent_dupes (zoin_get_work_height) is intentionally disabled.
  return true;
};