This commit is contained in:
Jay D Dee
2019-12-25 01:26:26 -05:00
parent c65b0ff7a6
commit 241bc26767
35 changed files with 3036 additions and 643 deletions

View File

@@ -1,15 +1,206 @@
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#if defined (ALLIUM_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"
#if defined (ALLIUM_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
hashState_groestl256 groestl;
} allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_8way_init( &allium_8way_ctx.keccak );
cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &allium_8way_ctx.skein );
init_groestl256( &allium_8way_ctx.groestl, 32 );
return true;
}
void allium_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
/*
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
*/
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
/*
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
*/
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
}
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 8;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
allium_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (ALLIUM_4WAY)
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;

View File

@@ -129,7 +129,11 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_8WAY)
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
@@ -142,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};
@@ -170,7 +174,11 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
#if defined (ALLIUM_8WAY)
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
gate->scanhash = (void*)&scanhash_allium_8way;
gate->hash = (void*)&allium_8way_hash;
#elif defined (ALLIUM_4WAY)
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
@@ -179,7 +187,7 @@ bool register_allium_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};

View File

@@ -85,17 +85,25 @@ bool init_lyra2rev2_ctx();
/////////////////////////
#if defined(__SSE2__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
#define LYRA2Z_8WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1
#elif defined(__SSE2__)
#define LYRA2Z_4WAY 1
#endif
#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8
#if defined(LYRA2Z_8WAY)
#if defined(LYRA2Z_16WAY)
void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();
#elif defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
@@ -144,13 +152,22 @@ bool lyra2h_thread_init();
//////////////////////////////////
#if defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define ALLIUM_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY 1
#endif
bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_4WAY)
#if defined(ALLIUM_8WAY)
void allium_8way_hash( void *state, const void *input );
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_8way_ctx();
#elif defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,

View File

@@ -26,14 +26,17 @@
#include "lyra2.h"
#include "sponge.h"
// LYRA2RE 8 cols 8 rows used by lyea2re, allium, phi2, x22i, x25x.
// LYRA2RE 8 cols 8 rows used by lyra2re, allium, phi2, x22i, x25x,
// dynamic matrix allocation.
//
// LYRA2REV2 4 cols 4 rows used by lyra2rev2.
// LYRA2REV2 4 cols 4 rows used by lyra2rev2 and x21s, static matrix
// allocation.
//
// LYRA2REV3 4 cols 4 rows with an extra twist in calculating
// rowa in the wandering phase. Used by lyra2rev3.
// rowa in the wandering phase. Used by lyra2rev3. Static matrix
// allocation.
//
// LYRA2Z various cols & rows and supports 80 input. Used by lyra2z,
// LYRA2Z various cols & rows and supports 80 byte input. Used by lyra2z,
// lyra2z330, lyra2h,
@@ -60,7 +63,7 @@
*/
// For lyra2rev3.
// convert a simple offset to an index into interleaved data.
// convert a simple offset to an index into 2x4 u64 interleaved data.
// good for state and 4 row matrix.
// index = ( int( off / 4 ) * 2 ) + ( off mod 4 )
@@ -202,12 +205,8 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
// hard coded for 32 byte input as well as matrix size.
// Other required versions include 80 byte input and different block
// sizez
// sizes.
//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
// const void *pwd, const uint64_t pwdlen, const void *salt,
// const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
// const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[32];
@@ -335,159 +334,111 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
return 0;
}
#endif // AVX512
#if 0
//////////////////////////////////////////////////
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
int LYRA2Z_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//========================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
// int64_t i; //auxiliary iteration counter
uint64_t _ALIGN(256) state[32];
int64_t row = 2;
int64_t prev = 1;
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau;
int64_t step = 1;
int64_t window = 2;
int64_t gap = 1;
//=======================================================================/
//======= Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//==== Getting the password + salt + basil padded with 10*1 ============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
uint64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 *
sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
uint64_t *ptr = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &saltlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &timeCost, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nRows, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nCols, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// now build the rest interleaving on the fly.
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
//=================== Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
// if (state == NULL) {
// return -1;
// }
// initState( state );
uint64_t *ptrWord = wholeMatrix;
//============================== Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
uint64_t *ptrWord = wholeMatrix;
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput,
BLOCK_LEN_BLAKE2_SAFE_INT64 );
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
BLOCK_LEN_BLAKE2_SAFE_INT64 );
/*
for ( i = 0; i < nBlocksInput; i++ )
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
//Initializes M[0] and M[1]
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
//Checks if all rows in the window where visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
&wholeMatrix[ 2* row*ROW_LEN_INT64],
nCols );
} while (row < nRows);
rowa0 = (rowa0 + step) & (window - 1);
prev = row;
row++;
//======================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for ( tau = 1; tau <= timeCost; tau++ )
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
if ( rowa0 == 0 )
{
step = window + gap;
window *= 2;
gap = -gap;
}
} while ( row < nRows );
row = 0;
for ( tau = 1; tau <= timeCost; tau++ )
{
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
do {
//Selects a pseudorandom index row*
//----------------------------------------------------------------------
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-----------------------------------------------------------------
do
{
rowa0 = state[ 0 ] % nRows;
rowa1 = state[ 4 ] % nRows;
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//---------------------------------------------------------------
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//--------------------------------------------------------------------
row = (row + step) % nRows;
} while (row != 0);
}
}
//========================= Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
//Squeezes the key
squeeze( state, K, kLen );
//Squeezes the key
squeeze_2way( state, K, (unsigned int) kLen );
return 0;
return 0;
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
////////////////////////////////////////////////////
// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
@@ -495,7 +446,7 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
uint64_t _ALIGN(256) state[32];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa0 = 0;
@@ -517,25 +468,15 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
if (wholeMatrix == NULL)
return -1;
#if defined(__AVX2__)
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__SSE2__)
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
memset( wholeMatrix, 0, i );
#endif
memset_zero_512( (__m512i*)wholeMatrix, i>>5 );
uint64_t *ptrWord = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
@@ -558,66 +499,8 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
/*
byte *ptrByte = (byte*) wholeMatrix;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
// - (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
*/
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

View File

@@ -62,6 +62,8 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
@@ -69,6 +71,9 @@ int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
#endif
#endif /* LYRA2_H_ */

View File

@@ -1,13 +1,240 @@
#include "lyra2-gate.h"
#ifdef LYRA2Z_4WAY
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
#if defined(LYRA2Z_16WAY)
__thread uint64_t* lyra2z_16way_matrix;
bool lyra2z_16way_thread_init()
{
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_16way_context l2z_16way_blake_mid;
void lyra2z_16way_midstate( const void* input )
{
blake256_16way_init( &l2z_16way_blake_mid );
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
}
void lyra2z_16way_hash( void *state, const void *input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
blake256_16way_close( &ctx_blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
memcpy( state+256, hash8, 32 );
memcpy( state+288, hash9, 32 );
memcpy( state+320, hash10, 32 );
memcpy( state+352, hash11, 32 );
memcpy( state+384, hash12, 32 );
memcpy( state+416, hash13, 32 );
memcpy( state+448, hash14, 32 );
memcpy( state+480, hash15, 32 );
}
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
lyra2z_16way_midstate( vdata );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n ) );
lyra2z_16way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 16; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(LYRA2Z_8WAY)
__thread uint64_t* lyra2z_8way_matrix;
bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(LYRA2Z_4WAY)
__thread uint64_t* lyra2z_4way_matrix;
bool lyra2z_4way_thread_init()
@@ -85,100 +312,3 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
#endif
#if defined(LYRA2Z_8WAY)
__thread uint64_t* lyra2z_8way_matrix;
bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif