Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

Commit: v3.5.6
@@ -85,10 +85,11 @@ performance.

2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
CentOS are known to work and have all dependencies in their repositories.
Others may work but may require more effort. 64 bit Windows OS is now supported
with mingw_w64 and msys.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

3. Stratum pool, cpuminer-opt only supports stratum mining.
3. Stratum pool, cpuminer-opt only supports stratum mining. Some algos
may work for wallet mining but there are no guarantees.

Errata
------

@@ -96,7 +97,6 @@ Errata

cpuminer-opt does not work mining Decred algo at Nicehash and produces
only "invalid extranonce2 size" rejects.

x11evo optimizations not available on Windows.
Benchmark testing does not work for x11evo.

Bugs

@@ -3,6 +3,18 @@ Compile instructions for Linux and Windows are at the bottom of this file.

Change Log
----------

v3.5.6

Updated Lyra2z for new zcoin algo post block 2050.
Cleaned up Lyra2 code and increased performance:
- Lyra2REv2 +11%
- Lyra2RE +6%
- Lyra2Z (zcoin) +12%
Fixed performance of x11evo on Windows to match Linux.
Timetravel 3% to 5% faster.
Whirlpool algo 15% faster.
Removed aclocal.m4 from .gitignore.
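
Most of the Lyra2 gains above come from one structural change, visible throughout the diffs below: the memory matrix is allocated once per miner thread and passed into every hash call, instead of being malloc'd and freed per nonce. A minimal sketch of the pattern, assuming BLOCK_LEN_INT64 = 12 (the value in lyra2.h) and the Lyra2REv2 geometry (nRows = nCols = 4):

/* Sketch only: the per-thread preallocation pattern introduced in v3.5.6.
   BLOCK_LEN_INT64 = 12 is an assumption taken from lyra2.h. */
#include <stdint.h>
#include <mm_malloc.h>

#define BLOCK_LEN_INT64 12

static __thread uint64_t *wholeMatrix;   /* one matrix per miner thread */

int lyra2_thread_init_sketch(void)
{
    const int64_t row_bytes = BLOCK_LEN_INT64 * 4 * 8;   /* nCols = 4 */
    wholeMatrix = _mm_malloc(row_bytes * 4, 64);         /* nRows = 4 */
    return wholeMatrix != NULL;                          /* nonzero on success */
}
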

v3.5.5

x11evo fixed on Windows but at reduced performance.
@@ -3,6 +3,7 @@
#include "cryptonight.h"
#include "miner.h"
#include "crypto/c_keccak.h"
#include "avxdefs.h"

void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);

@@ -147,6 +148,11 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
        _mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
    }

    // cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ),
    //                                      casti_m128i( ctx.state.k, 2 ) );
    // cast_m128i( ctx.b ) = _mm_xor_si128( casti_m128i( ctx.state.k, 1 ),
    //                                      casti_m128i( ctx.state.k, 3 ) );

    ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
    ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
    ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];

@@ -196,9 +202,12 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
        a[1] += lo;
    }
    uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];

    // cast_m128i( dst ) = cast_m128i( a );
    dst[0] = a[0];
    dst[1] = a[1];

    // cast_m128i( a ) = _mm_xor_si128( cast_m128i( a ), cast_m128i( b ) );
    a[0] ^= b[0];
    a[1] ^= b[1];
    b_x = c_x;
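
For context on the index math above: CryptoNight's long_state scratchpad is 2 MiB, and the mask 0x1FFFF0 keeps the low 21 bits of c[0] while clearing the low 4, so the result is always a 16-byte-aligned offset inside the scratchpad. A hedged sketch (the helper name is illustrative only):

/* Sketch: scratchpad addressing as used by the loop above. */
#include <stdint.h>

static inline uint64_t *scratchpad_slot(uint8_t *long_state, uint64_t c0)
{
    return (uint64_t *)&long_state[c0 & 0x1FFFF0];  /* 16-byte aligned */
}
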
@@ -275,7 +275,7 @@ HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
        CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
    for ( i=0; i<10; i++ )
        state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
    // memset(state->buffer, 0, sizeof state->buffer );
    memset(state->buffer, 0, sizeof state->buffer );
    return SUCCESS;
}

@@ -21,6 +21,7 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <mm_malloc.h>
#include "compat.h"
#include "lyra2.h"
#include "sponge.h"

@@ -45,10 +46,9 @@
 * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
 */

// Lyra2RE & Lyra2REv2, nRows must be a power of 2
int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
           const void *salt, uint64_t saltlen, uint64_t timeCost,
           const uint64_t nRows, const uint64_t nCols )
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
               uint64_t pwdlen, const void *salt, uint64_t saltlen,
               uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
{
    //====================== Basic variables ============================//
    uint64_t _ALIGN(256) state[16];
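
The signature change above is the heart of the refactor: LYRA2REV2 no longer allocates its own matrix, the caller owns it. A sketch of the corresponding call-site change, mirroring the lyra2rev2.c hunk later in this commit (1, 4, 4 are Lyra2REv2's timeCost, nRows, nCols):

/* Sketch of the call-site change; l2v2_wholeMatrix is the per-thread
   buffer set up in lyra2rev2_thread_init() further down this commit. */
#include <stdint.h>
#include "lyra2.h"

extern __thread uint64_t *l2v2_wholeMatrix;

void lyra2rev2_step_sketch(uint32_t hashA[8])
{
    /* before: LYRA2( hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 ); */
    LYRA2REV2(l2v2_wholeMatrix, hashA, 32, hashA, 32, hashA, 32, 1, 4, 4);
}
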
@@ -71,26 +71,21 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
    // for Lyra2REv2, nCols = 4, v1 was using 8
    const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                           : BLOCK_LEN_BLAKE2_SAFE_BYTES;

/*
    i = (int64_t)ROW_LEN_BYTES * nRows;
    uint64_t *wholeMatrix = malloc(i);
    uint64_t *wholeMatrix = _mm_malloc( i, 64 );
    if (wholeMatrix == NULL)
        return -1;

#if defined (__AVX2__)
    memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
    memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
    memset(wholeMatrix, 0, i);

    //Allocates pointers to each row of the matrix
    uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
    if (memMatrix == NULL)
        return -1;

    //Places the pointers in the correct positions
#endif
*/
    uint64_t *ptrWord = wholeMatrix;
    for (i = 0; i < nRows; i++)
    {
        memMatrix[i] = ptrWord;
        ptrWord += ROW_LEN_INT64;
    }

    //=== Getting the password + salt + basil padded with 10*1 ==========//
    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,

@@ -140,31 +135,36 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

    //================= Initializing the Sponge State ====================//
    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
    initState(state);

    initState( state );

    //========================= Setup Phase =============================//
    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits

    ptrWord = wholeMatrix;
    for (i = 0; i < nBlocksInput; i++)
    {
        absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
        absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
        ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
    }

    //Initializes M[0] and M[1]
    reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
    reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

    reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
    reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                       nCols);

    do
    {
        //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

        reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
        reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
                               &wholeMatrix[rowa*ROW_LEN_INT64],
                               &wholeMatrix[row*ROW_LEN_INT64], nCols );

        //updates the value of row* (deterministically picked during Setup))
        rowa = (rowa + step) & (window - 1);
        //update prev: it now points to the last row ever computed

        prev = row;
        //updates row: goes to the next row to be computed
        row++;

@@ -190,12 +190,14 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
        //Selects a pseudorandom index row*
        //-----------------------------------------------
        rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)

        //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
        //-------------------------------------------

        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
        reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);

        reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
                          &wholeMatrix[rowa*ROW_LEN_INT64],
                          &wholeMatrix[row*ROW_LEN_INT64], nCols );
        //update prev: it now points to the last row ever computed
        prev = row;
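
Throughout the function, the memMatrix pointer table is replaced by direct offset arithmetic into the flat allocation; the two forms address the same row:

/* Sketch: memMatrix[row] and the new indexing are equivalent.
   ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols, as defined in this file. */
#include <stdint.h>

static inline uint64_t *row_ptr(uint64_t *wholeMatrix, int64_t row,
                                int64_t row_len_int64)
{
    return &wholeMatrix[row * row_len_int64];   /* == memMatrix[row] */
}
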

@@ -210,22 +212,17 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

    //===================== Wrap-up Phase ===============================//
    //Absorbs the last block of the memory matrix
    absorbBlock(state, memMatrix[rowa]);

    absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
    //Squeezes the key
    squeeze(state, K, (unsigned int) kLen);

    //================== Freeing the memory =============================//
    free(memMatrix);
    free(wholeMatrix);
    // free(wholeMatrix);

    return 0;
}

// Zcoin, nRows may be any value
int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
            const void *salt, uint64_t saltlen, uint64_t timeCost,
            uint64_t nRows, uint64_t nCols )
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols )
{
    //========================== Basic variables ============================//
    uint64_t _ALIGN(256) state[16];
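
The comment "nRows may be any value" is why LYRA2Z keeps the modulo form of row selection while LYRA2REV2 can use the cheaper mask; both variants are quoted in the comments of the code below. Side by side:

/* Sketch: row* selection.  The mask form is valid only when nRows is a
   power of 2 (Lyra2RE/REv2); zoin's nRows = 330 forces the generic form. */
#include <stdint.h>

uint64_t rowa_generic(uint64_t state0, uint64_t nRows)  /* any nRows   */
{
    return state0 % nRows;
}

uint64_t rowa_pow2(uint64_t state0, uint64_t nRows)     /* nRows = 2^k */
{
    return state0 & (nRows - 1);
}
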

@@ -244,33 +241,27 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
/*
    i = (int64_t)ROW_LEN_BYTES * nRows;
    uint64_t *wholeMatrix = _mm_malloc( i, 64 );

    i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
    uint64_t *wholeMatrix = malloc(i);
    if (wholeMatrix == NULL)
    if (wholeMatrix == NULL)
        return -1;

    memset(wholeMatrix, 0, i);
    //Allocates pointers to each row of the matrix
    uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
    if (memMatrix == NULL)
        return -1;

    //Places the pointers in the correct positions
    uint64_t *ptrWord = wholeMatrix;
    for (i = 0; i < nRows; i++)
    {
        memMatrix[i] = ptrWord;
        ptrWord += ROW_LEN_INT64;
    }

#if defined (__AVX2__)
    memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
    memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
    memset(wholeMatrix, 0, i);
#endif
*/
    //==== Getting the password + salt + basil padded with 10*1 ============//
    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
    //but this ensures that the password copied locally will be overwritten as soon as possible

    //First, we clean enough blocks for the password, salt, basil and padding
    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) )
                              / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
    byte *ptrByte = (byte*) wholeMatrix;
    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );

@@ -281,7 +272,6 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
    //Concatenates the salt
    memcpy(ptrByte, salt, saltlen);
    ptrByte += saltlen;

    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
    memcpy(ptrByte, &kLen, sizeof (uint64_t));
    ptrByte += sizeof (uint64_t);

@@ -304,11 +294,15 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

    //=================== Initializing the Sponge State ====================//
    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
    // uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
    // if (state == NULL) {
    //     return -1;
    // }
    initState( state );

    //============================== Setup Phase =============================//
    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
    ptrWord = wholeMatrix;
    uint64_t *ptrWord = wholeMatrix;
    for ( i = 0; i < nBlocksInput; i++ )
    {
        absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)

@@ -316,31 +310,28 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
    }

    //Initializes M[0] and M[1]
    reducedSqueezeRow0( state, memMatrix[0], nCols ); //The locally copied password is most likely overwritten here
    reducedDuplexRow1( state, memMatrix[0], memMatrix[1], nCols );
    reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
    reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);

    do
    {
        //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
        reducedDuplexRowSetup( state, memMatrix[prev], memMatrix[rowa],
                               memMatrix[row], nCols );
    do {
        //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
        reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);

        //updates the value of row* (deterministically picked during Setup))
        rowa = (rowa + step) & (window - 1);
        //update prev: it now points to the last row ever computed
        prev = row;
        //updates row: goes to the next row to be computed
        row++;
        //updates the value of row* (deterministically picked during Setup))
        rowa = (rowa + step) & (window - 1);
        //update prev: it now points to the last row ever computed
        prev = row;
        //updates row: goes to the next row to be computed
        row++;

        //Checks if all rows in the window were visited.
        if (rowa == 0)
        {
            step = window + gap; //changes the step: approximately doubles its value
            window *= 2; //doubles the size of the re-visitation window
            gap = -gap; //inverts the modifier to the step
        }
        //Checks if all rows in the window were visited.
        if (rowa == 0) {
            step = window + gap; //changes the step: approximately doubles its value
            window *= 2; //doubles the size of the re-visitation window
            gap = -gap; //inverts the modifier to the step
        }

    } while (row < nRows);
    } while (row < nRows);

    //======================== Wandering Phase =============================//
    row = 0; //Resets the visitation to the first row of the memory matrix

@@ -351,20 +342,19 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
    do {
        //Selects a pseudorandom index row*
        //----------------------------------------------------------------------
        //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
        //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
        rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
        //-----------------------------------------------------------------

        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
        reducedDuplexRow( state, memMatrix[prev], memMatrix[rowa],
                          memMatrix[row], nCols );
        reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);

        //update prev: it now points to the last row ever computed
        prev = row;

        //updates row: goes to the next row to be computed
        //---------------------------------------------------------------
        //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
        //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
        row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
        //--------------------------------------------------------------------

@@ -373,15 +363,190 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,

    //========================= Wrap-up Phase ===============================//
    //Absorbs the last block of the memory matrix
    absorbBlock( state, memMatrix[rowa] );
    absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);

    //Squeezes the key
    squeeze( state, K, kLen );

    //====================== Freeing the memory =============================//
    free( memMatrix );
    free( wholeMatrix );

    // _mm_free(state);
    // _mm_free( wholeMatrix );
    return 0;
}

int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
             uint64_t pwdlen, const void *salt, uint64_t saltlen,
             uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
{
    //====================== Basic variables ============================//
    uint64_t _ALIGN(256) state[16];
    int64_t row = 2; //index of row to be processed
    int64_t prev = 1; //index of prev (last row ever computed/modified)
    int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
    int64_t tau; //Time Loop iterator
    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
    int64_t i; //auxiliary iteration counter
    int64_t v64; // 64bit var for memcpy
    //====================================================================/

    //=== Initializing the Memory Matrix and pointers to it =============//
    //Tries to allocate enough space for the whole memory matrix

    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
    // for Lyra2REv2, nCols = 4, v1 was using 8
    const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                           : BLOCK_LEN_BLAKE2_SAFE_BYTES;

    i = (int64_t)ROW_LEN_BYTES * nRows;
    uint64_t *wholeMatrix = _mm_malloc( i, 64 );
    if (wholeMatrix == NULL)
        return -1;

#if defined (__AVX2__)
    memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
    memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
    memset(wholeMatrix, 0, i);
#endif

    uint64_t *ptrWord = wholeMatrix;

    //=== Getting the password + salt + basil padded with 10*1 ==========//
    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
    //but this ensures that the password copied locally will be overwritten as soon as possible

    //First, we clean enough blocks for the password, salt, basil and padding
    int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
                             / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;

    byte *ptrByte = (byte*) wholeMatrix;

    //Prepends the password
    memcpy(ptrByte, pwd, pwdlen);
    ptrByte += pwdlen;

    //Concatenates the salt
    memcpy(ptrByte, salt, saltlen);
    ptrByte += saltlen;

    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
                        - (saltlen + pwdlen) );

    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
    memcpy(ptrByte, &kLen, sizeof(int64_t));
    ptrByte += sizeof(uint64_t);
    v64 = pwdlen;
    memcpy(ptrByte, &v64, sizeof(int64_t));
    ptrByte += sizeof(uint64_t);
    v64 = saltlen;
    memcpy(ptrByte, &v64, sizeof(int64_t));
    ptrByte += sizeof(uint64_t);
    v64 = timeCost;
    memcpy(ptrByte, &v64, sizeof(int64_t));
    ptrByte += sizeof(uint64_t);
    v64 = nRows;
    memcpy(ptrByte, &v64, sizeof(int64_t));
    ptrByte += sizeof(uint64_t);
    v64 = nCols;
    memcpy(ptrByte, &v64, sizeof(int64_t));
    ptrByte += sizeof(uint64_t);

    //Now comes the padding
    *ptrByte = 0x80; //first byte of padding: right after the password
    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
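
The three statements above implement the 10*1 padding in place: 0x80 goes right after the basil, and the last byte of the final block is XORed with 0x01. A standalone sketch of the same layout, assuming BLOCK_LEN_BLAKE2_SAFE_BYTES = 64 (the value in the Lyra2 reference code) and a buffer already zeroed, as the memset above guarantees:

/* Sketch: pad(pwd || salt || basil) with 10*1 padding, as laid out above. */
#include <stddef.h>
#include <stdint.h>

void pad_10_1_sketch(uint8_t *buf, size_t used, size_t nBlocks)
{
    const size_t block_bytes = 64;            /* assumption: BLAKE2-safe block */
    buf[used] = 0x80;                         /* first padding byte            */
    buf[nBlocks * block_bytes - 1] ^= 0x01;   /* last byte of last block       */
}
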

    //================= Initializing the Sponge State ====================//
    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)

    initState( state );

    //========================= Setup Phase =============================//
    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits

    ptrWord = wholeMatrix;
    for (i = 0; i < nBlocksInput; i++)
    {
        absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
        ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
    }
    //Initializes M[0] and M[1]
    reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

    reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                       nCols);

    do
    {
        //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

        reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
                               &wholeMatrix[rowa*ROW_LEN_INT64],
                               &wholeMatrix[row*ROW_LEN_INT64], nCols );

        //updates the value of row* (deterministically picked during Setup))
        rowa = (rowa + step) & (window - 1);
        //update prev: it now points to the last row ever computed

        prev = row;
        //updates row: goes to the next row to be computed
        row++;

        //Checks if all rows in the window were visited.
        if (rowa == 0)
        {
            step = window + gap; //changes the step: approximately doubles its value
            window *= 2; //doubles the size of the re-visitation window
            gap = -gap; //inverts the modifier to the step
        }

    } while (row < nRows);

    //===================== Wandering Phase =============================//
    row = 0; //Resets the visitation to the first row of the memory matrix
    for (tau = 1; tau <= timeCost; tau++)
    {
        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
        do
        {
            //Selects a pseudorandom index row*
            //-----------------------------------------------
            rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)

            //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
            //-------------------------------------------

            //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
            reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
                              &wholeMatrix[rowa*ROW_LEN_INT64],
                              &wholeMatrix[row*ROW_LEN_INT64], nCols );
            //update prev: it now points to the last row ever computed
            prev = row;

            //updates row: goes to the next row to be computed
            //----------------------------------------------------
            row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
            //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
            //----------------------------------------------------

        } while (row != 0);
    }

    //===================== Wrap-up Phase ===============================//
    //Absorbs the last block of the memory matrix
    absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
    //Squeezes the key
    squeeze(state, K, (unsigned int) kLen);

    //================== Freeing the memory =============================//
    _mm_free(wholeMatrix); //allocated with _mm_malloc above

    return 0;
}

@@ -37,10 +37,20 @@ typedef unsigned char byte;
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif

int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
           const void *salt, uint64_t saltlen, uint64_t timeCost,
           uint64_t nRows, uint64_t nCols );
int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
            const void *salt, uint64_t saltlen, uint64_t timeCost,
            uint64_t nRows, uint64_t nCols );
#define BLOCK_LEN_M256I (BLOCK_LEN_INT64 / 4 )
#define BLOCK_LEN_M128I (BLOCK_LEN_INT64 / 2 )

int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
             uint64_t pwdlen, const void *salt, uint64_t saltlen,
             uint64_t timeCost, uint64_t nRows, uint64_t nCols );

int LYRA2REV2( uint64_t*, void *K, uint64_t kLen, const void *pwd,
               uint64_t pwdlen, const void *salt, uint64_t saltlen,
               uint64_t timeCost, uint64_t nRows, uint64_t nCols );

int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
            uint64_t pwdlen, const void *salt, uint64_t saltlen,
            uint64_t timeCost, uint64_t nRows, uint64_t nCols );

#endif /* LYRA2_H_ */

@@ -7,11 +7,14 @@
#include "algo/keccak/sph_keccak.h"
#include "lyra2.h"
#include "algo-gate-api.h"
#include "avxdefs.h"

#ifndef NO_AES_NI
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif

//__thread uint64_t* lyra2re_wholeMatrix;

typedef struct {
    sph_blake256_context blake;
    sph_keccak256_context keccak;

@@ -24,6 +27,7 @@ typedef struct {
} lyra2re_ctx_holder;

lyra2re_ctx_holder lyra2re_ctx;
static __thread sph_blake256_context lyra2_blake_mid;

void init_lyra2re_ctx()
{

@@ -37,6 +41,12 @@ void init_lyra2re_ctx()
#endif
}

void lyra2_blake256_midstate( const void* input )
{
    memcpy( &lyra2_blake_mid, &lyra2re_ctx.blake, sizeof lyra2_blake_mid );
    sph_blake256( &lyra2_blake_mid, input, 64 );
}
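
This midstate helper is the other recurring optimization in this commit: the first 64 bytes of the 80-byte block header are constant across nonces, so Blake-256 runs over them once per work unit and each scan iteration only hashes the 16-byte tail. A hedged sketch of the resume step (in the real file the midstate is a static thread-local, so the extern here is illustrative):

/* Sketch: resuming from the saved midstate per nonce; mirrors the usage
   in lyra2re_hash() below. */
#include <stdint.h>
#include <string.h>
#include "algo/blake/sph_blake.h"

extern __thread sph_blake256_context lyra2_blake_mid; /* static in the real file */

void blake_tail_sketch(const uint8_t header[80], void *out32)
{
    sph_blake256_context c;
    memcpy(&c, &lyra2_blake_mid, sizeof c);     /* restore 64-byte midstate */
    sph_blake256(&c, header + 64, 16);          /* hash only the tail       */
    sph_blake256_close(&c, out32);
}
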

void lyra2re_hash(void *state, const void *input)
{
    lyra2re_ctx_holder ctx;

@@ -47,13 +57,19 @@ void lyra2re_hash(void *state, const void *input)
#define hashA hash
#define hashB hash+16

    sph_blake256(&ctx.blake, input, 80);
    const int midlen = 64; // bytes
    const int tail = 80 - midlen; // 16

    memcpy( &ctx.blake, &lyra2_blake_mid, sizeof lyra2_blake_mid );
    sph_blake256( &ctx.blake, input + 64, 16 );

    // sph_blake256(&ctx.blake, input, 80);
    sph_blake256_close(&ctx.blake, hashA);

    sph_keccak256(&ctx.keccak, hashA, 32);
    sph_keccak256_close(&ctx.keccak, hashB);

    LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
    LYRA2RE( hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);

    sph_skein256(&ctx.skein, hashA, 32);
    sph_skein256_close(&ctx.skein, hashB);

@@ -81,6 +97,8 @@ int scanhash_lyra2re(int thr_id, struct work *work,

    swab32_array( endiandata, pdata, 20 );

    lyra2_blake256_midstate( endiandata );

    do {
        be32enc(&endiandata[19], nonce);
        lyra2re_hash(hash, endiandata);

@@ -112,10 +130,34 @@ void lyra2re_set_target ( struct work* work, double job_diff )
    work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
}

/*
bool lyra2re_thread_init()
{
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

    int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
    lyra2re_wholeMatrix = _mm_malloc( i, 64 );

    if ( lyra2re_wholeMatrix == NULL )
        return false;

#if defined (__AVX2__)
    memset_zero_m256i( (__m256i*)lyra2re_wholeMatrix, i/32 );
#elif defined(__AVX__)
    memset_zero_m128i( (__m128i*)lyra2re_wholeMatrix, i/16 );
#else
    memset( lyra2re_wholeMatrix, 0, i );
#endif
    return true;
}
*/

bool register_lyra2re_algo( algo_gate_t* gate )
{
    init_lyra2re_ctx();
    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
    // gate->miner_thread_init = (void*)&lyra2re_thread_init;
    gate->scanhash = (void*)&scanhash_lyra2re;
    gate->hash = (void*)&lyra2re_hash;
    gate->hash_alt = (void*)&lyra2re_hash;

@@ -8,10 +8,11 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"

#include "algo/cubehash/sse2/cubehash_sse2.h"

#include "lyra2.h"
#include "avxdefs.h"

__thread uint64_t* l2v2_wholeMatrix;

typedef struct {
    cubehashParam cube1;

@@ -23,7 +24,8 @@ typedef struct {

} lyra2v2_ctx_holder;

lyra2v2_ctx_holder lyra2v2_ctx;
static lyra2v2_ctx_holder lyra2v2_ctx;
static __thread sph_blake256_context l2v2_blake_mid;

void init_lyra2rev2_ctx()
{

@@ -35,14 +37,23 @@ void init_lyra2rev2_ctx()
    sph_bmw256_init( &lyra2v2_ctx.bmw );
}

void l2v2_blake256_midstate( const void* input )
{
    memcpy( &l2v2_blake_mid, &lyra2v2_ctx.blake, sizeof l2v2_blake_mid );
    sph_blake256( &l2v2_blake_mid, input, 64 );
}

void lyra2rev2_hash( void *state, const void *input )
{
    lyra2v2_ctx_holder ctx;
    memcpy( &ctx, &lyra2v2_ctx, sizeof(lyra2v2_ctx) );

    uint32_t _ALIGN(128) hashA[8], hashB[8];

    sph_blake256( &ctx.blake, input, 80 );
    const int midlen = 64; // bytes
    const int tail = 80 - midlen; // 16

    memcpy( &ctx.blake, &l2v2_blake_mid, sizeof l2v2_blake_mid );
    sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
    sph_blake256_close( &ctx.blake, hashA );

    sph_keccak256( &ctx.keccak, hashA, 32 );

@@ -50,18 +61,14 @@ void lyra2rev2_hash( void *state, const void *input )

    cubehashUpdateDigest( &ctx.cube1, (byte*) hashA,
                          (const byte*) hashB, 32 );
    // cubehashUpdate( &ctx.cube1, (const byte*) hashB,32 );
    // cubehashDigest( &ctx.cube1, (byte*)hashA );

    LYRA2( hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 );
    LYRA2REV2( l2v2_wholeMatrix, hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 );

    sph_skein256( &ctx.skein, hashA, 32 );
    sph_skein256_close( &ctx.skein, hashB );

    cubehashUpdateDigest( &ctx.cube2, (byte*) hashA,
                          (const byte*) hashB, 32 );
    // cubehashUpdate( &ctx.cube2, (const byte*) hashB,32 );
    // cubehashDigest( &ctx.cube2, (byte*)hashA );

    sph_bmw256( &ctx.bmw, hashA, 32 );
    sph_bmw256_close( &ctx.bmw, hashB );

@@ -85,6 +92,8 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,

    swab32_array( endiandata, pdata, 20 );

    l2v2_blake256_midstate( endiandata );

    do {
        be32enc(&endiandata[19], nonce);
        lyra2rev2_hash(hash, endiandata);

@@ -112,10 +121,33 @@ void lyra2rev2_set_target( struct work* work, double job_diff )
    work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}

bool lyra2rev2_thread_init()
{
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

    int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
    l2v2_wholeMatrix = _mm_malloc( i, 64 );

    if ( l2v2_wholeMatrix == NULL )
        return false;

#if defined (__AVX2__)
    memset_zero_m256i( (__m256i*)l2v2_wholeMatrix, i/32 );
#elif defined(__AVX__)
    memset_zero_m128i( (__m128i*)l2v2_wholeMatrix, i/16 );
#else
    memset( l2v2_wholeMatrix, 0, i );
#endif
    return true;
}
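
The memset_zero_m256i / memset_zero_m128i ladder above comes from avxdefs.h; its implementation is not shown in this commit, but a plausible equivalent is a simple wide-store loop:

/* Sketch only: assumed shape of the avxdefs.h helper, not the actual code. */
#include <immintrin.h>

static inline void memset_zero_m256i_sketch(__m256i *dst, int n)
{
    const __m256i zero = _mm256_setzero_si256();
    for (int i = 0; i < n; i++)
        _mm256_store_si256(dst + i, zero);   /* needs 32-byte alignment,
                                                satisfied by _mm_malloc(..,64) */
}
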

bool register_lyra2rev2_algo( algo_gate_t* gate )
{
    init_lyra2rev2_ctx();
    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
    gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
    gate->scanhash = (void*)&scanhash_lyra2rev2;
    gate->hash = (void*)&lyra2rev2_hash;
    gate->hash_alt = (void*)&lyra2rev2_hash;

algo/lyra2/sponge.c: 1570 lines changed (diff suppressed because it is too large).
@@ -51,24 +51,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#if defined __AVX2__
// only available with avx2

// init vectors from memory
// returns void, updates defines and inits implicit args a, b, c, d
#define LYRA_INIT_AVX2 \
   __m256i a[4]; \
   a[0] = _mm256_load_si256( (__m256i*)(&v[ 0]) ); \
   a[1] = _mm256_load_si256( (__m256i*)(&v[ 4]) ); \
   a[2] = _mm256_load_si256( (__m256i*)(&v[ 8]) ); \
   a[3] = _mm256_load_si256( (__m256i*)(&v[12]) );

// save to memory
// returns void
#define LYRA_CLOSE_AVX2 \
   _mm256_store_si256( (__m256i*)(&v[ 0]), a[0] ); \
   _mm256_store_si256( (__m256i*)(&v[ 4]), a[1] ); \
   _mm256_store_si256( (__m256i*)(&v[ 8]), a[2] ); \
   _mm256_store_si256( (__m256i*)(&v[12]), a[3] );

// process 4 rows in parallel
// process 4 columns in parallel
// returns void, updates all args
#define G_4X64(a,b,c,d) \
   a = _mm256_add_epi64( a, b ); \

@@ -107,28 +90,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#else
// only available with avx

#define LYRA_INIT_AVX \
   __m128i a0[4], a1[4]; \
   a0[0] = _mm_load_si128( (__m128i*)(&v[ 0]) ); \
   a1[0] = _mm_load_si128( (__m128i*)(&v[ 2]) ); \
   a0[1] = _mm_load_si128( (__m128i*)(&v[ 4]) ); \
   a1[1] = _mm_load_si128( (__m128i*)(&v[ 6]) ); \
   a0[2] = _mm_load_si128( (__m128i*)(&v[ 8]) ); \
   a1[2] = _mm_load_si128( (__m128i*)(&v[10]) ); \
   a0[3] = _mm_load_si128( (__m128i*)(&v[12]) ); \
   a1[3] = _mm_load_si128( (__m128i*)(&v[14]) );

#define LYRA_CLOSE_AVX \
   _mm_store_si128( (__m128i*)(&v[ 0]), a0[0] ); \
   _mm_store_si128( (__m128i*)(&v[ 2]), a1[0] ); \
   _mm_store_si128( (__m128i*)(&v[ 4]), a0[1] ); \
   _mm_store_si128( (__m128i*)(&v[ 6]), a1[1] ); \
   _mm_store_si128( (__m128i*)(&v[ 8]), a0[2] ); \
   _mm_store_si128( (__m128i*)(&v[10]), a1[2] ); \
   _mm_store_si128( (__m128i*)(&v[12]), a0[3] ); \
   _mm_store_si128( (__m128i*)(&v[14]), a1[3] );

// process 2 rows in parallel
// process 2 columns in parallel
// returns void, all args updated
#define G_2X64(a,b,c,d) \
   a = _mm_add_epi64( a, b ); \

@@ -140,68 +102,35 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   c = _mm_add_epi64( c, d ); \
   b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );

#define LYRA_ROUND_AVX \
   G_2X64( a0[0], a0[1], a0[2], a0[3] ); \
   G_2X64( a1[0], a1[1], a1[2], a1[3] ); \
   mm128_rotl256_1x64( a0[1], a1[1] ); \
   mm128_swap128( a0[2], a1[2] ); \
   mm128_rotr256_1x64( a0[3], a1[3] ); \
   G_2X64( a0[0], a0[1], a0[2], a0[3] ); \
   G_2X64( a1[0], a1[1], a1[2], a1[3] ); \
   mm128_rotr256_1x64( a0[1], a1[1] ); \
   mm128_swap128( a0[2], a1[2] ); \
   mm128_rotl256_1x64( a0[3], a1[3] );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
   mm128_rotl256_1x64( s2, s3 ); \
   mm128_swap128( s4, s5 ); \
   mm128_rotr256_1x64( s6, s7 ); \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
   mm128_rotr256_1x64( s2, s3 ); \
   mm128_swap128( s4, s5 ); \
   mm128_rotl256_1x64( s6, s7 );

#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

#endif // AVX2
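
The rework above turns LYRA_ROUND_AVX from a macro over fixed a0[]/a1[] arrays into one parameterized by its eight __m128i state halves, so callers can keep the whole 16-word sponge state in registers across all twelve rounds. A hedged usage sketch (the exact lane-to-word mapping used in sponge.c is not shown here, since that diff was suppressed):

/* Sketch: loading the 16-word sponge state into eight __m128i halves and
   running the rounds.  Unaligned loads are used for safety in this sketch;
   the macro call is shown commented because it lives in sponge.h. */
#include <stdint.h>
#include <emmintrin.h>

void lyra_rounds_usage_sketch(uint64_t v[16])
{
    __m128i s[8];
    for (int i = 0; i < 8; i++)
        s[i] = _mm_loadu_si128((__m128i *)&v[2 * i]);

    /* LYRA_12_ROUNDS_AVX(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]); */

    for (int i = 0; i < 8; i++)
        _mm_storeu_si128((__m128i *)&v[2 * i], s[i]);
}
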

/*
#if defined __AVX__
// can coexist with AVX2

// rotate each uint64 c bits
// _m128i
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
                                     _mm_slli_epi64(w, 64 - c))

// swap 128 bit source vectors, equivalent of rotating 256 bits by 128 bits
// void
#define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
                              s1 = _mm_xor_si128(s0, s1); \
                              s0 = _mm_xor_si128(s0, s1);

// swap uint64 in 128 bit source vector, equivalent of rotating 128 bits by
// 64 bits (8 bytes)
// __m128i
#define mm128_swap64(s) _mm_or_si128( _mm_slli_si128( s, 8 ), \
                                      _mm_srli_si128( s, 8 ) )

// rotate 2 128 bit vectors as one 256 vector by 1 uint64, very inefficient
// returns void, args updated
#define mm128_rotl256_1x64(s0, s1) do { \
   __m128i t; \
   s0 = mm128_swap64( s0); \
   s1 = mm128_swap64( s1); \
   t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
                     _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
   s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
                      _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
   s0 = t; \
} while(0)

#define mm128_rotr256_1x64(s0, s1) do { \
   __m128i t; \
   s0 = mm128_swap64( s0); \
   s1 = mm128_swap64( s1); \
   t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
                     _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
   s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
                      _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
   s0 = t; \
} while(0)

#endif // AVX
*/

// Scalar
//Blake2b's G function
#define G(r,i,a,b,c,d) \

@@ -1,20 +1,40 @@
#include <memory.h>
#include <mm_malloc.h>
#include "miner.h"
#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"

void zcoin_hash(void *state, const void *input, uint32_t height)
__thread uint64_t* zcoin_wholeMatrix;

static __thread sph_blake256_context zcoin_blake_mid;

void zcoin_midstate( const void* input )
{
    uint32_t _ALIGN(256) hash[16];

    // LYRA2Z(hash, 32, input, 80, input, 80, 2, height, 256);
    LYRA2Z(hash, 32, input, 80, input, 80, 2, 8192, 256);

    memcpy(state, hash, 32);
    sph_blake256_init( &zcoin_blake_mid );
    sph_blake256( &zcoin_blake_mid, input, 64 );
}

// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void zcoin_hash(void *state, const void *input )
{
    uint32_t _ALIGN(256) hash[16];

    sph_blake256_context ctx_blake;

    memcpy( &ctx_blake, &zcoin_blake_mid, sizeof zcoin_blake_mid );
    sph_blake256( &ctx_blake, input + 64, 16 );
    sph_blake256_close( &ctx_blake, hash );

    LYRA2Z( zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);

    memcpy(state, hash, 32);
}
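
Both generations of the zcoin hash are visible above: pre-fork blocks ran Lyra2Z directly over the 80-byte header with (timeCost 2, nRows 8192, nCols 256), while post-block-2050 work first collapses the header with Blake-256 (via the midstate) and then runs Lyra2Z with the small power-of-2 geometry (8, 8, 8). A compact sketch of the new path; the midstate helper name here is hypothetical, standing in for the zcoin_midstate/ctx_blake code above:

/* Sketch: the post-2050 call shape, for comparison with the old one. */
#include <stdint.h>
#include <string.h>
#include "lyra2.h"

extern __thread uint64_t *zcoin_wholeMatrix;
extern void blake256_from_midstate(const void *in80, void *out32); /* hypothetical */

void zcoin_hash_post2050_sketch(void *state, const void *input80)
{
    uint32_t hash[16];
    blake256_from_midstate(input80, hash);      /* Blake-256 of the header  */
    /* old form: LYRA2Z(hash, 32, input80, 80, input80, 80, 2, 8192, 256);  */
    LYRA2Z(zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
    memcpy(state, hash, 32);
}
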

//int scanhash_zcoin(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t height)
int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done )
{

@@ -25,6 +45,7 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t nonce = first_nonce;

    if (opt_benchmark)
        ptarget[7] = 0x0000ff;

@@ -32,9 +53,11 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
        be32enc(&endiandata[i], pdata[i]);
    }

    zcoin_midstate( endiandata );

    do {
        be32enc(&endiandata[19], nonce);
        zcoin_hash( hash, endiandata, work->height );
        zcoin_hash( hash, endiandata );

        if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
            work_set_target_ratio(work, hash);

@@ -57,22 +80,45 @@ void zcoin_set_target( struct work* work, double job_diff )
{
    work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}

/*
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
    work->height = sctx->bloc_height;
    return false;
}
*/

bool zcoin_thread_init()
{
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

    int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
    zcoin_wholeMatrix = _mm_malloc( i, 64 );

    if ( zcoin_wholeMatrix == NULL )
        return false;

#if defined (__AVX2__)
    memset_zero_m256i( (__m256i*)zcoin_wholeMatrix, i/32 );
#elif defined(__AVX__)
    memset_zero_m128i( (__m128i*)zcoin_wholeMatrix, i/16 );
#else
    memset( zcoin_wholeMatrix, 0, i );
#endif
    return true;
}

bool register_zcoin_algo( algo_gate_t* gate )
{
    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
    gate->miner_thread_init = (void*)&zcoin_thread_init;
    gate->scanhash = (void*)&scanhash_zcoin;
    gate->hash = (void*)&zcoin_hash;
    gate->hash_alt = (void*)&zcoin_hash;
    gate->get_max64 = (void*)&get_max64_0xffffLL;
    gate->set_target = (void*)&zcoin_set_target;
    gate->prevent_dupes = (void*)&zcoin_get_work_height;
    // gate->prevent_dupes = (void*)&zcoin_get_work_height;
    return true;
};

@@ -2,13 +2,15 @@
#include "miner.h"
#include "algo-gate-api.h"
#include "lyra2.h"
#include "avxdefs.h"

__thread uint64_t* zoin_wholeMatrix;

void zoin_hash(void *state, const void *input, uint32_t height)
{
    uint32_t _ALIGN(256) hash[16];

    LYRA2Z(hash, 32, input, 80, input, 80, 2, 330, 256);
    LYRA2Z( zoin_wholeMatrix, hash, 32, input, 80, input, 80, 2, 330, 256);

    memcpy(state, hash, 32);
}

@@ -53,22 +55,45 @@ void zoin_set_target( struct work* work, double job_diff )
{
    work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}

/*
bool zoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
    work->height = sctx->bloc_height;
    return false;
}
*/

bool zoin_thread_init()
{
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 256; // nCols
    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

    int i = (int64_t)ROW_LEN_BYTES * 330; // nRows;
    zoin_wholeMatrix = _mm_malloc( i, 64 );

    if ( zoin_wholeMatrix == NULL )
        return false;

#if defined (__AVX2__)
    memset_zero_m256i( (__m256i*)zoin_wholeMatrix, i/32 );
#elif defined(__AVX__)
    memset_zero_m128i( (__m128i*)zoin_wholeMatrix, i/16 );
#else
    memset( zoin_wholeMatrix, 0, i );
#endif
    return true;
}
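
For scale: assuming BLOCK_LEN_INT64 = 12 from lyra2.h, zoin's per-thread matrix is 12 x 256 x 8 bytes per row x 330 rows, about 7.7 MiB, which still fits comfortably in the `int i` used above; the Lyra2REv2 and zcoin matrices are only a few KiB. A quick arithmetic check:

/* Sketch: size check of the allocation above (BLOCK_LEN_INT64 = 12 assumed). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const int64_t row_bytes = 12 * 256 * 8;     /* BLOCK_LEN_INT64 * nCols * 8 */
    const int64_t total = row_bytes * 330;      /* nRows = 330 -> 8110080 B    */
    printf("zoin matrix: %lld bytes (~%.1f MiB)\n",
           (long long)total, total / (1024.0 * 1024.0));
    return 0;
}
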

bool register_zoin_algo( algo_gate_t* gate )
{
    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
    gate->miner_thread_init = (void*)&zoin_thread_init;
    gate->scanhash = (void*)&scanhash_zoin;
    gate->hash = (void*)&zoin_hash;
    gate->hash_alt = (void*)&zoin_hash;
    gate->get_max64 = (void*)&get_max64_0xffffLL;
    gate->set_target = (void*)&zoin_set_target;
    gate->prevent_dupes = (void*)&zoin_get_work_height;
    // gate->prevent_dupes = (void*)&zoin_get_work_height;
    return true;
};

algo/m7m.c: 95 lines changed.
@@ -175,13 +175,13 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

    memcpy(data, pdata, 80);

    sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
    sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
    sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
    sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
    sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
    sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
    sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
    sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN );
    sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN );
    sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN );
    sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN );
    sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN );
    sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN );

    mpz_t magipi, magisw, product, bns0, bns1;
    mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;

@@ -228,40 +228,11 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
    sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
    sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );

/*
    ctx2_sha256 = ctx_sha256;
    sph_sha256 (&ctx2_sha256, data_p64, 80 - M7_MIDSTATE_LEN);
    sph_sha256_close(&ctx2_sha256, (void*)(bhash[0]));

    ctx2_sha512 = ctx_sha512;
    sph_sha512 (&ctx2_sha512, data_p64, 80 - M7_MIDSTATE_LEN);
    sph_sha512_close(&ctx2_sha512, (void*)(bhash[1]));

    ctx2_keccak = ctx_keccak;
    sph_keccak512 (&ctx2_keccak, data_p64, 80 - M7_MIDSTATE_LEN);
    sph_keccak512_close(&ctx2_keccak, (void*)(bhash[2]));

    ctx2_whirlpool = ctx_whirlpool;
    sph_whirlpool (&ctx2_whirlpool, data_p64, 80 - M7_MIDSTATE_LEN);
    sph_whirlpool_close(&ctx2_whirlpool, (void*)(bhash[3]));

    ctx2_haval = ctx_haval;
    sph_haval256_5 (&ctx2_haval, data_p64, 80 - M7_MIDSTATE_LEN);
    sph_haval256_5_close(&ctx2_haval, (void*)(bhash[4]));

    ctx2_tiger = ctx_tiger;
    sph_tiger (&ctx2_tiger, data_p64, 80 - M7_MIDSTATE_LEN);
    sph_tiger_close(&ctx2_tiger, (void*)(bhash[5]));

    ctx2_ripemd = ctx_ripemd;
    sph_ripemd160 (&ctx2_ripemd, data_p64, 80 - M7_MIDSTATE_LEN);
    sph_ripemd160_close(&ctx2_ripemd, (void*)(bhash[6]));
*/

    mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
    mpz_set(bns1, bns0);
    mpz_set(product, bns0);
    for(int i=1; i < 7; i++){
    for ( i=1; i < 7; i++ )
    {
        mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
        mpz_add(bns1, bns1, bns0);
        mpz_mul(product, product, bns0);
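
The loop above is the core of M7: the seven digests are imported as little-endian multiprecision integers, summed into bns1, and multiplied into product with GMP. A self-contained sketch of that step (the limbs/word_bytes parameters mirror the a/p arguments of the mpz_import calls above):

/* Sketch: M7-style combine of seven digests with GMP. */
#include <stdint.h>
#include <gmp.h>

void m7_combine_sketch(uint32_t bhash[7][16], size_t limbs, size_t word_bytes,
                       mpz_t product, mpz_t bns1)
{
    mpz_t bns0;
    mpz_init(bns0);
    mpz_import(bns0, limbs, -1, word_bytes, -1, 0, bhash[0]);
    mpz_set(bns1, bns0);
    mpz_set(product, bns0);
    for (int i = 1; i < 7; i++)
    {
        mpz_import(bns0, limbs, -1, word_bytes, -1, 0, bhash[i]);
        mpz_add(bns1, bns1, bns0);        /* running sum of all digests     */
        mpz_mul(product, product, bns0);  /* running product of all digests */
    }
    mpz_clear(bns0);
}
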

@@ -275,11 +246,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
    sph_sha256( &ctxf_sha256, bdata, bytes );
    sph_sha256_close( &ctxf_sha256, (void*)(hash) );

/*
    sph_sha256 (&ctx_final_sha256, bdata, bytes);
    sph_sha256_close(&ctx_final_sha256, (void*)(hash));
*/

    digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
    mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
    mpf_set_prec_raw(magifpi, prec);

@@ -291,7 +257,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
    mpzscale = 1;
    mpz_set_ui(magisw, usw_);

    for(i = 0; i < 5; i++)
    for ( i = 0; i < 5; i++ )
    {
        mpf_set_d(mpt1, 0.25*mpzscale);
        mpf_sub(mpt1, mpt1, mpt2);

@@ -314,23 +280,22 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

    sph_sha256( &ctxf_sha256, bdata, bytes );
    sph_sha256_close( &ctxf_sha256, (void*)(hash) );

/*
    sph_sha256 (&ctx_final_sha256, bdata, bytes);
    sph_sha256_close(&ctx_final_sha256, (void*)(hash));
*/
    }

    const unsigned char *hash_ = (const unsigned char *)hash;
    const unsigned char *target_ = (const unsigned char *)ptarget;
    for (i = 31; i >= 0; i--) {
        if (hash_[i] != target_[i]) {
    for ( i = 31; i >= 0; i-- )
    {
        if ( hash_[i] != target_[i] )
        {
            rc = hash_[i] < target_[i];
            break;
        }
    }
    if (unlikely(rc)) {
        if (opt_debug) {
    if ( unlikely(rc) )
    {
        if ( opt_debug )
        {
            bin2hex(hash_str, (unsigned char *)hash, 32);
            bin2hex(target_str, (unsigned char *)ptarget, 32);
            bin2hex(data_str, (unsigned char *)data, 80);

@@ -343,20 +308,22 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
            goto out;
        }
    } while (n < max_nonce && !work_restart[thr_id].restart);

    pdata[19] = n;

out:
    mpf_set_prec_raw(magifpi, prec0);
    mpf_set_prec_raw(magifpi0, prec0);
    mpf_set_prec_raw(mptmp, prec0);
    mpf_set_prec_raw(mpt1, prec0);
    mpf_set_prec_raw(mpt2, prec0);
    mpf_clear(magifpi);
    mpf_clear(magifpi0);
    mpf_clear(mpten);
    mpf_clear(mptmp);
    mpf_clear(mpt1);
    mpf_clear(mpt2);
    mpz_clears(magipi, magisw, product, bns0, bns1, NULL);
    mpf_set_prec_raw(magifpi, prec0);
    mpf_set_prec_raw(magifpi0, prec0);
    mpf_set_prec_raw(mptmp, prec0);
    mpf_set_prec_raw(mpt1, prec0);
    mpf_set_prec_raw(mpt2, prec0);
    mpf_clear(magifpi);
    mpf_clear(magifpi0);
    mpf_clear(mpten);
    mpf_clear(mptmp);
    mpf_clear(mpt1);
    mpf_clear(mpt2);
    mpz_clears(magipi, magisw, product, bns0, bns1, NULL);

    *hashes_done = n - first_nonce + 1;
    return rc;

@@ -5,6 +5,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "avxdefs.h"
|
||||
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
@@ -99,6 +100,7 @@ typedef struct {
|
||||
} tt_ctx_holder;
|
||||
|
||||
tt_ctx_holder tt_ctx;
|
||||
__thread tt_ctx_holder tt_mid;
|
||||
|
||||
void init_tt_ctx()
|
||||
{
|
||||
@@ -125,6 +127,8 @@ void timetravel_hash(void *output, const void *input)
|
||||
tt_ctx_holder ctx;
|
||||
memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
|
||||
int i;
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
|
||||
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
|
||||
{
|
||||
@@ -140,50 +144,129 @@ void timetravel_hash(void *output, const void *input)
|
||||
}
|
||||
hashB = &hash[16 * i];
|
||||
|
||||
switch ( permutation[i] )
|
||||
switch ( permutation[i] )
|
||||
{
|
||||
case 0:
|
||||
// if ( i == 0 )
|
||||
// {
|
||||
// memcpy( &ctx.blake, &tt_mid.blake, sizeof tt_mid.blake );
|
||||
// sph_blake256( &ctx.blake, input + midlen, tail );
|
||||
// sph_blake256_close( &ctx.blake, hashB );
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
sph_blake512( &ctx.blake, hashA, dataLen );
|
||||
sph_blake512_close( &ctx.blake, hashB );
|
||||
// }
|
||||
break;
|
||||
case 1:
|
||||
if ( i == 0 )
|
         {
         case 0:
            sph_blake512( &ctx.blake, hashA, dataLen );
            sph_blake512_close( &ctx.blake, hashB );
            break;
         case 1:
            sph_bmw512( &ctx.bmw, hashA, dataLen );
            sph_bmw512_close( &ctx.bmw, hashB );
            break;
         case 2:
            memcpy( &ctx.bmw, &tt_mid.bmw, sizeof tt_mid.bmw );
            sph_bmw512( &ctx.bmw, input + midlen, tail );
            sph_bmw512_close( &ctx.bmw, hashB );
         }
         else
         {
            sph_bmw512( &ctx.bmw, hashA, dataLen );
            sph_bmw512_close( &ctx.bmw, hashB );
         }
         break;
      case 2:
#ifdef NO_AES_NI
         sph_groestl512( &ctx.groestl, hashA, dataLen );
         sph_groestl512_close( &ctx.groestl, hashB );
         if ( i == 0 )
         {
            memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
            sph_groestl512( &ctx.groestl, input + midlen, tail );
            sph_groestl512_close( &ctx.groestl, hashB );
         }
         else
         {
            sph_groestl512( &ctx.groestl, hashA, dataLen );
            sph_groestl512_close( &ctx.groestl, hashB );
         }
#else
         update_and_final_groestl( &ctx.groestl, (char*)hashB,
                                   (char*)hashA, dataLen*8 );

         if ( i == 0 )
         {
            memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
            update_and_final_groestl( &ctx.groestl, (char*)hashB,
                                      (char*)input + midlen, tail*8 );
         }
         else
         {
            update_and_final_groestl( &ctx.groestl, (char*)hashB,
                                      (char*)hashA, dataLen*8 );
         }
#endif
         break;
      case 3:
         sph_skein512( &ctx.skein, hashA, dataLen );
         sph_skein512_close( &ctx.skein, hashB );
         break;
      case 4:
         sph_jh512( &ctx.jh, hashA, dataLen );
         sph_jh512_close( &ctx.jh, hashB );
         break;
      case 5:
         sph_keccak512( &ctx.keccak, hashA, dataLen );
         sph_keccak512_close( &ctx.keccak, hashB );
         break;
      case 6:
         update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
                                 (const BitSequence*)hashA, dataLen );
         break;
      case 7:
         cubehashUpdateDigest( &ctx.cube, (byte*)hashB,
                               (const byte*) hashA, dataLen );
         break;
      default:
         break;
      }
   }
   break;
      case 3:
         if ( i == 0 )
         {
            memcpy( &ctx.skein, &tt_mid.skein, sizeof tt_mid.skein );
            sph_skein512( &ctx.skein, input + midlen, tail );
            sph_skein512_close( &ctx.skein, hashB );
         }
         else
         {
            sph_skein512( &ctx.skein, hashA, dataLen );
            sph_skein512_close( &ctx.skein, hashB );
         }
         break;
      case 4:
         if ( i == 0 )
         {
            memcpy( &ctx.jh, &tt_mid.jh, sizeof tt_mid.jh );
            sph_jh512( &ctx.jh, input + midlen, tail );
            sph_jh512_close( &ctx.jh, hashB );
         }
         else
         {
            sph_jh512( &ctx.jh, hashA, dataLen );
            sph_jh512_close( &ctx.jh, hashB );
         }
         break;
      case 5:
         if ( i == 0 )
         {
            memcpy( &ctx.keccak, &tt_mid.keccak, sizeof tt_mid.keccak );
            sph_keccak512( &ctx.keccak, input + midlen, tail );
            sph_keccak512_close( &ctx.keccak, hashB );
         }
         else
         {
            sph_keccak512( &ctx.keccak, hashA, dataLen );
            sph_keccak512_close( &ctx.keccak, hashB );
         }
         break;
      case 6:
//       if ( i == 0 )
//       {
//          memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa );
//          update_and_final_luffa( &ctx.luffa, hashB,
//                                  input + 64, 16 );
//       }
//       else
//       {
            update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
                                    hashA, dataLen );
//       }
         break;
      case 7:
         if ( i == 0 )
         {
            memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube );
            cubehashUpdateDigest( &ctx.cube, hashB,
                                  input + midlen, tail );
         }
         else
         {
            cubehashUpdateDigest( &ctx.cube, hashB, hashA, dataLen );
         }
         break;
      default:
         break;
      }
   }

   memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
}
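The i == 0 paths above all apply the same midstate trick: the nonce-invariant first 64 bytes of the 80-byte header are absorbed once per job into tt_mid, and each nonce re-hashes only the 16-byte tail. A minimal self-contained sketch of the idea using the sph BMW-512 API (the names below are illustrative, not taken from the tree; BMW-512's block size is 64 bytes, so the 80-byte header splits cleanly 64+16):

   #include <string.h>
   #include "sph_bmw.h"

   static __thread sph_bmw512_context bmw_mid;   // midstate cached per job

   void bmw_mid_init( const void *header )
   {
      sph_bmw512_init( &bmw_mid );
      sph_bmw512( &bmw_mid, header, 64 );   // absorb nonce-invariant prefix once
   }

   void bmw_hash_80( void *hash, const void *header )
   {
      sph_bmw512_context ctx = bmw_mid;     // resume from the cached midstate
      sph_bmw512( &ctx, (const char*)header + 64, 16 );   // only the 16-byte tail
      sph_bmw512_close( &ctx, hash );
   }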
@@ -191,52 +274,98 @@ void timetravel_hash(void *output, const void *input)

int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done )
{
    uint32_t _ALIGN(64) hash[8];
    uint32_t _ALIGN(64) endiandata[20];
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
   uint32_t _ALIGN(64) hash[8];
   uint32_t _ALIGN(64) endiandata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;

    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t nonce = first_nonce;
    volatile uint8_t *restart = &(work_restart[thr_id].restart);
    int i;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   int i;

    if (opt_benchmark)
        ptarget[7] = 0x0cff;
   if (opt_benchmark)
      ptarget[7] = 0x0cff;

    for (int k=0; k < 19; k++)
        be32enc(&endiandata[k], pdata[k]);
   for (int k=0; k < 19; k++)
      be32enc(&endiandata[k], pdata[k]);

    const uint32_t timestamp = endiandata[17];
    if ( timestamp != s_ntime )
   const uint32_t timestamp = endiandata[17];
   if ( timestamp != s_ntime )
   {
      const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
                      % HASH_FUNC_COUNT_PERMUTATIONS;
      for ( i = 0; i < HASH_FUNC_COUNT; i++ )
         permutation[i] = i;
      for ( i = 0; i < steps; i++ )
         next_permutation( permutation, permutation + HASH_FUNC_COUNT );
      s_ntime = timestamp;

      // do midstate precalc for first function
      switch ( permutation[0] )
      {
        case 0:
//         memcpy( &tt_mid.blake, &tt_ctx.blake, sizeof(tt_mid.blake) );
//         sph_blake256( &tt_mid.blake, endiandata, 64 );
           break;
        case 1:
           memcpy( &tt_mid.bmw, &tt_ctx.bmw, sizeof(tt_mid.bmw) );
           sph_bmw512( &tt_mid.bmw, endiandata, 64 );
           break;
        case 2:
#ifdef NO_AES_NI
           memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
           sph_groestl512( &tt_mid.groestl, endiandata, 64 );
#else
           memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
           update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 );
#endif
           break;
        case 3:
           memcpy( &tt_mid.skein, &tt_ctx.skein, sizeof(tt_mid.skein ) );
           sph_skein512( &tt_mid.skein, endiandata, 64 );
           break;
        case 4:
           memcpy( &tt_mid.jh, &tt_ctx.jh, sizeof(tt_mid.jh ) );
           sph_jh512( &tt_mid.jh, endiandata, 64 );
           break;
        case 5:
           memcpy( &tt_mid.keccak, &tt_ctx.keccak, sizeof(tt_mid.keccak ) );
           sph_keccak512( &tt_mid.keccak, endiandata, 64 );
           break;
        case 6:
//         init_luffa( &tt_mid.luffa, 512 );
//         memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) );
//         update_luffa( &tt_mid.luffa, endiandata, 64 );
           break;
        case 7:
           memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) );
           cubehashUpdate( &tt_mid.cube, endiandata, 64 );
           break;
        default:
           break;
      }
   }

   do {
      be32enc( &endiandata[19], nonce );
      timetravel_hash( hash, endiandata );

      if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
      {
         const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
                         % HASH_FUNC_COUNT_PERMUTATIONS;
         for ( i = 0; i < HASH_FUNC_COUNT; i++ )
            permutation[i] = i;
         for ( i = 0; i < steps; i++ )
            next_permutation( permutation, permutation + HASH_FUNC_COUNT );
         s_ntime = timestamp;
      }
         work_set_target_ratio( work, hash );
         pdata[19] = nonce;
         *hashes_done = pdata[19] - first_nonce;
         return 1;
      }
      nonce++;

    do {
        be32enc(&endiandata[19], nonce);
        timetravel_hash(hash, endiandata);
    } while (nonce < max_nonce && !(*restart));

        if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
            work_set_target_ratio(work, hash);
            pdata[19] = nonce;
            *hashes_done = pdata[19] - first_nonce;
            return 1;
        }
        nonce++;

   } while (nonce < max_nonce && !(*restart));

    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce + 1;
    return 0;
   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}

void timetravel_set_target( struct work* work, double job_diff )

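scanhash_timetravel above recomputes the algo ordering whenever the job's ntime changes: it starts from the identity permutation and advances it steps times with next_permutation. A standalone sketch of that derivation, with next_permutation written out longhand for illustration (the miner uses its own implementation; the steps value here is made up):

   #include <stdint.h>
   #include <stdio.h>

   #define FUNC_COUNT 8

   static void u8_swap( uint8_t *a, uint8_t *b )
   { uint8_t t = *a; *a = *b; *b = t; }

   // Advance p[0..n-1] to the next lexicographic permutation,
   // wrapping back to the identity at the end.
   static void perm_next( uint8_t *p, int n )
   {
      int i = n - 2;
      while ( i >= 0 && p[i] >= p[i+1] ) i--;
      if ( i < 0 )
      {  // last permutation: reverse back to identity
         for ( int k = 0; k < n/2; k++ ) u8_swap( &p[k], &p[n-1-k] );
         return;
      }
      int j = n - 1;
      while ( p[j] <= p[i] ) j--;
      u8_swap( &p[i], &p[j] );
      for ( int k = i+1, m = n-1; k < m; k++, m-- ) u8_swap( &p[k], &p[m] );
   }

   int main()
   {
      uint8_t perm[FUNC_COUNT];
      uint32_t steps = 12345;   // ( timestamp - base ) % count_permutations
      for ( int i = 0; i < FUNC_COUNT; i++ ) perm[i] = i;
      for ( uint32_t s = 0; s < steps; s++ ) perm_next( perm, FUNC_COUNT );
      for ( int i = 0; i < FUNC_COUNT; i++ ) printf( "%d ", perm[i] );
      printf( "\n" );
      return 0;
   }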
@@ -95,5 +95,6 @@ bool register_veltor_algo( algo_gate_t* gate )
  gate->hash = (void*)&veltorhash;
  gate->hash_alt = (void*)&veltorhash;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
}

@@ -7,44 +7,58 @@
#include <stdio.h>
#include "sph_whirlpool.h"

typedef struct {
   sph_whirlpool_context whirl1;
   sph_whirlpool_context whirl2;
   sph_whirlpool_context whirl3;
   sph_whirlpool_context whirl4;
} whirlpool_ctx_holder;

static whirlpool_ctx_holder whirl_ctx;
static __thread sph_whirlpool_context whirl1_mid_ctx;

void init_whirlpool_ctx()
{
   sph_whirlpool1_init( &whirl_ctx.whirl1 );
   sph_whirlpool1_init( &whirl_ctx.whirl2 );
   sph_whirlpool1_init( &whirl_ctx.whirl3 );
   sph_whirlpool1_init( &whirl_ctx.whirl4 );
}

void whirlpool_hash(void *state, const void *input)
{
    sph_whirlpool_context ctx_whirlpool;
    whirlpool_ctx_holder ctx;
    memcpy( &ctx, &whirl_ctx, sizeof(whirl_ctx) );

    const int midlen = 64;
    const int tail = 80 - midlen;
    unsigned char hash[128]; // uint32_t hashA[16], hashB[16];
    #define hashB hash+64

    memset(hash, 0, sizeof hash);
    // copy cached midstate
    memcpy( &ctx.whirl1, &whirl1_mid_ctx, sizeof whirl1_mid_ctx );
    sph_whirlpool1( &ctx.whirl1, input + midlen, tail );
    sph_whirlpool1_close(&ctx.whirl1, hash);

    sph_whirlpool1_init(&ctx_whirlpool);
    sph_whirlpool1(&ctx_whirlpool, input, 80);
    sph_whirlpool1_close(&ctx_whirlpool, hash);
    sph_whirlpool1(&ctx.whirl2, hash, 64);
    sph_whirlpool1_close(&ctx.whirl2, hashB);

    sph_whirlpool1_init(&ctx_whirlpool);
    sph_whirlpool1(&ctx_whirlpool, hash, 64);
    sph_whirlpool1_close(&ctx_whirlpool, hashB);
    sph_whirlpool1(&ctx.whirl3, hashB, 64);
    sph_whirlpool1_close(&ctx.whirl3, hash);

    sph_whirlpool1_init(&ctx_whirlpool);
    sph_whirlpool1(&ctx_whirlpool, hashB, 64);
    sph_whirlpool1_close(&ctx_whirlpool, hash);

    sph_whirlpool1_init(&ctx_whirlpool);
    sph_whirlpool1(&ctx_whirlpool, hash, 64);
    sph_whirlpool1_close(&ctx_whirlpool, hash);
    sph_whirlpool1(&ctx.whirl4, hash, 64);
    sph_whirlpool1_close(&ctx.whirl4, hash);

    memcpy(state, hash, 32);
}

void whirlpool_midstate(void *state, const void *input)
void whirlpool_midstate( const void* input )
{
    sph_whirlpool_context ctx;

    sph_whirlpool1_init(&ctx);
    sph_whirlpool1(&ctx, input, 64);

    memcpy(state, ctx.state, 64);
    memcpy( &whirl1_mid_ctx, &whirl_ctx.whirl1, sizeof whirl1_mid_ctx );
    sph_whirlpool1( &whirl1_mid_ctx, input, 64 );
}

int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
    uint32_t _ALIGN(128) endiandata[20];
@@ -59,6 +73,8 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign
    for (int i=0; i < 19; i++)
        be32enc(&endiandata[i], pdata[i]);

    whirlpool_midstate( endiandata );

    do {
        const uint32_t Htarg = ptarget[7];
        uint32_t vhash[8];
@@ -82,9 +98,9 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign

bool register_whirlpool_algo( algo_gate_t* gate )
{
    algo_not_tested();
    gate->scanhash = (void*)&scanhash_whirlpool;
    gate->hash = (void*)&whirlpool_hash;
    init_whirlpool_ctx();
    return true;
};

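The whirlpool rewrite above replaces the four per-call sph_whirlpool1_init calls with a single memcpy of a holder of pre-initialized contexts, plus a cached midstate for the first of the four passes. A minimal sketch of that pattern, reduced to two chained passes and using hypothetical names:

   #include <string.h>
   #include "sph_whirlpool.h"

   typedef struct {
      sph_whirlpool_context w1, w2;   // pre-initialized once at startup
   } wp_holder;

   static wp_holder wp_initial;

   void wp_setup()
   {
      sph_whirlpool1_init( &wp_initial.w1 );
      sph_whirlpool1_init( &wp_initial.w2 );
   }

   void wp_hash2( void *out32, const void *in, int len )
   {
      unsigned char h[64];
      wp_holder ctx;
      memcpy( &ctx, &wp_initial, sizeof ctx );   // one copy beats two init calls
      sph_whirlpool1( &ctx.w1, in, len );
      sph_whirlpool1_close( &ctx.w1, h );
      sph_whirlpool1( &ctx.w2, h, 64 );          // chain the 64-byte digest
      sph_whirlpool1_close( &ctx.w2, h );
      memcpy( out32, h, 32 );
   }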
@@ -71,10 +71,18 @@ void init_x11evo_ctx()
   sph_shavite512_init( &x11evo_ctx.shavite );
}

/*
uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
{
   return (current_time - base_time) / (60 * 60 * 24);
}
*/

static inline int getCurrentAlgoSeq( uint32_t current_time )
{
   // change once per day
   return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
}

// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
@@ -136,41 +144,37 @@ void getAlgoString( char *str, uint32_t count )
   //applog(LOG_DEBUG, "nextPerm %s", str);
}

// Broken on Windows
#if !((defined(__WINDOWS__)) || (defined(__WIN64)))
static __thread uint32_t saved_ntime = UINT32_MAX;
#endif
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
static int s_seq = -1;

void evocoin_twisted_code( char *result, char *code )
static void evo_twisted_code(uint32_t ntime, char *permstr)
{
   uint32_t h32, *be32 = get_stratum_job_ntime();
#if !((defined(__WINDOWS__)) || (defined(__WIN64)))
   if ( *be32 != saved_ntime )
   {
#endif
      h32 = be32toh(*be32);
      uint32_t count = getCurrentAlgoSeq(h32, INITIAL_DATE);
      getAlgoString(code, count);
      sprintf(result, "_%d_%s_", count, code);
#if !((defined(__WINDOWS__)) || (defined(__WIN64)))
      saved_ntime = *be32;
   }
#endif
   int seq = getCurrentAlgoSeq(ntime);
   if (s_seq != seq)
   {
      getAlgoString(permstr, seq);
      s_seq = seq;
   }
}

static inline void x11evo_hash( void *state, const void *input )
{
   uint32_t hash[16];
   char completeCode[64];
   char resultCode[HASH_FUNC_COUNT + 1];
   x11evo_ctx_holder ctx;
   memcpy( &ctx, &x11evo_ctx, sizeof(x11evo_ctx) );
   evocoin_twisted_code( completeCode, resultCode );

   if ( s_seq == -1 )
   {
      uint32_t *data = (uint32_t*) input;
      const uint32_t ntime = data[17];
      evo_twisted_code(ntime, hashOrder);
   }

   int i;
   for ( i = 0; i < strlen(resultCode); i++ )
   for ( i = 0; i < strlen(hashOrder); i++ )
   {
      char elem = resultCode[i];
      char elem = hashOrder[i];
      uint8_t idx;
      if (elem >= 'A')
         idx = elem - 'A' + 10;
@@ -196,8 +200,6 @@ static inline void x11evo_hash( void *state, const void *input )
#else
         update_and_final_groestl( &ctx.groestl, (char*)hash,
                                   (const char*)hash, 512 );
//       update_groestl( &ctx.groestl, (char*)hash, 512 );
//       final_groestl( &ctx.groestl, (char*)hash );
#endif
         break;
      case 3:
@@ -215,14 +217,10 @@ static inline void x11evo_hash( void *state, const void *input )
      case 6:
         update_and_final_luffa( &ctx.luffa, (char*)hash,
                                 (const char*)hash, 64 );
//       update_luffa( &ctx.luffa, (char*)hash, 64 );
//       final_luffa( &ctx.luffa, (char*)hash );
         break;
      case 7:
         cubehashUpdateDigest( &ctx.cube, (char*)hash,
                               (const char*)hash, 64 );
//       cubehashUpdate( &ctx.cube, (char*)hash, 64 );
//       cubehashDigest( &ctx.cube, (char*)hash );
         break;
      case 8:
         sph_shavite512( &ctx.shavite, (char*)hash, size );
@@ -239,8 +237,6 @@ static inline void x11evo_hash( void *state, const void *input )
#else
         update_final_echo( &ctx.echo, (char*)hash,
                            (const char*)hash, 512 );
//       update_echo( &ctx.echo, (char*)hash, 512 );
//       final_echo( &ctx.echo, (char*)hash );
#endif
         break;
   }
@@ -263,6 +259,13 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,

   swab32_array( endiandata, pdata, 20 );

   int ntime = endiandata[17];
   if ( ntime != s_ntime || s_seq == -1 )
   {
      evo_twisted_code( ntime, hashOrder );
      s_ntime = ntime;
   }

   uint32_t hmask = 0xFFFFFFFF;
   if ( Htarg > 0 )
   {
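getAlgoString emits the day's permutation as a hex-style string, and the hash loop above maps each character back to a function index with the elem >= 'A' test. A tiny standalone sketch of that decode (the order string here is made up):

   #include <stdint.h>
   #include <stdio.h>
   #include <string.h>

   int main()
   {
      const char hashOrder[] = "02468ACE1";   // hypothetical permutation string
      for ( size_t i = 0; i < strlen( hashOrder ); i++ )
      {
         char elem = hashOrder[i];
         // '0'..'9' map to 0..9, 'A'.. map to 10..
         uint8_t idx = ( elem >= 'A' ) ? elem - 'A' + 10 : elem - '0';
         printf( "round %zu -> algo %u\n", i, (unsigned)idx );
      }
      return 0;
   }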
249 avxdefs.h
@@ -34,6 +34,140 @@ uint16_t v16[ 8];
  uint8_t v8 [16];
} area128;

#if defined (__AVX2__)

// Replacements for vectorized data
// n = number of __m256i (32 bytes)
inline void memset_zero_m256i( __m256i *dst, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = _mm256_setzero_si256();
}

inline void memset_m256i( __m256i *dst, const __m256i a, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = a;
}

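Note that n counts __m256i elements, not bytes: replacing a memset( buf, 0, 128 ) means passing n = 128/32 = 4. A usage sketch, assuming avxdefs.h is on the include path and the target supports AVX2:

   #include "avxdefs.h"

   // hypothetical 128-byte scratch area; a __m256i array is 32-byte aligned
   static __m256i scratch[4];

   void clear_scratch()
   {
      // was: memset( scratch, 0, 128 );  now n = 128 / 32 = 4 vectors
      memset_zero_m256i( scratch, 4 );
   }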

// Optimized copying; the first fit is usually best. If none of these works
// there are __m128i versions or plain memcpy.

// Fixed size

// multi-buffered copy for 64 bytes, the size of a cache line.
// minimum alignment is 32 bytes, optimum for cache is 64.
// src & dst are __m256i*
inline void mcpy64_m256i( __m256i* dst, const __m256i* src )
{
   __m256i* dest = dst;
   const __m256i* srce = src;
   __m256i a = _mm256_load_si256( srce );
   __m256i b = _mm256_load_si256( srce + 1 );
   _mm256_store_si256( dest, a );
   _mm256_store_si256( dest + 1, b );
}

inline void mcpy96_m256i( __m256i* dst, const __m256i* src )
{
   __m256i* dest = dst;
   const __m256i* srce = src;
   __m256i a = _mm256_load_si256( srce );
   __m256i b = _mm256_load_si256( srce + 1 );
   _mm256_store_si256( dest, a );
   __m256i c = _mm256_load_si256( srce + 2 );
   _mm256_store_si256( dest + 1, b );
   _mm256_store_si256( dest + 2, c );
}

inline void mcpy128_m256i( __m256i* dst, const __m256i* src )
{
   __m256i* dest = dst;
   const __m256i* srce = src;
   __m256i a = _mm256_load_si256( srce );
   __m256i b = _mm256_load_si256( srce + 1 );
   __m256i c = _mm256_load_si256( srce + 2 );
   _mm256_store_si256( dest    , a );
   __m256i d = _mm256_load_si256( srce + 3 );
   _mm256_store_si256( dest + 1, b );
   a = _mm256_load_si256( srce + 4 );
   _mm256_store_si256( dest + 2, c );
   b = _mm256_load_si256( srce + 5 );
   _mm256_store_si256( dest + 3, d );
   c = _mm256_load_si256( srce + 6 );
   _mm256_store_si256( dest + 4, a );
   d = _mm256_load_si256( srce + 7 );
   _mm256_store_si256( dest + 5, b );
   _mm256_store_si256( dest + 6, c );
   _mm256_store_si256( dest + 7, d );
}

// Variable size

// copy multiples of 64 bytes using quad buffering with interleave
// of first read of next line with last write of current line.
// n is the length in __m256i units (32 bytes each), a multiple of 4
// minimum alignment: 32 bytes
// optimum alignment: 64 bytes (cache line size)
// minimum size.....: 128 bytes (n = 4)
// recommended size.: 256+ bytes
// minimum increment: 128 bytes
// Only the first load or store in a cache line triggers a memory access;
// the subsequent actions are trivial because they benefit from data
// cached by the first.
// Priming the second cache line is done before dumping the first to
// give read priority to ensure there are no gaps in data available to
// the cpu caused by waiting for data to be written back.

inline void mcpy_m256i_x4( __m256i *dst, const __m256i *src, const int n )
{
   __m256i* dest = dst;
   const __m256i* srce = src;

   // preload 1 cache line to absorb startup latency
   __m256i a = _mm256_load_si256( srce );
   __m256i b = _mm256_load_si256( srce + 1 );
   // start loading second line, queue while waiting
   __m256i c = _mm256_load_si256( srce + 2 );
   // start writing first line, as soon as data available,
   // second line read will have priority on the bus
   _mm256_store_si256( dest, a );
   __m256i d;

   int i;
   const int loops = n/4 - 1;

   for ( i = 0; i < loops; i++ )
   {
      const int i4 = i*4;
      const __m256i* si4 = (__m256i*)(srce + i4);
      __m256i* di4 = (__m256i*)(dest + i4);

      d = _mm256_load_si256( si4 + 3 );
      _mm256_store_si256( di4 + 1, b );
      // start loading next line
      a = _mm256_load_si256( si4 + 4 );
      _mm256_store_si256( di4 + 2, c );
      b = _mm256_load_si256( si4 + 5 );
      _mm256_store_si256( di4 + 3, d );
      c = _mm256_load_si256( si4 + 6 );
      // start writing next line
      _mm256_store_si256( di4 + 4, a );
   }
   // finish last line; the final vector is the only one not yet read
   d = _mm256_load_si256( srce + n - 1 );
   _mm256_store_si256( dest + n - 3, b );
   _mm256_store_si256( dest + n - 2, c );
   _mm256_store_si256( dest + n - 1, d );
}

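A usage sketch of the quad-buffered copy, again assuming avxdefs.h is available and the build uses -O2 -mavx2 (n is in __m256i units, so a 256-byte copy passes n = 8):

   #include <stdio.h>
   #include <string.h>
   #include "avxdefs.h"

   int main()
   {
      // 256-byte buffers; 64-byte alignment puts them on cache line boundaries
      static __m256i src[8] __attribute__((aligned(64)));
      static __m256i dst[8] __attribute__((aligned(64)));

      memset( src, 0xA5, sizeof src );
      mcpy_m256i_x4( dst, src, 8 );   // n = 8 vectors: preload, 1 loop, tail
      printf( "copy %s\n", memcmp( dst, src, sizeof src ) ? "failed" : "ok" );
      return 0;
   }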
// basic __m256i memcpy

inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n )
{
   for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
}

// For cheating with pointer types

// p = any aligned pointer
@@ -63,8 +197,6 @@ uint8_t v8 [16];
// dst = a( b, a[127:0] ) ; mask == 1
//__m256i _mm256_inserti128_si256(__m256i a, __m128i b, const int mask);

#if defined __AVX2__

// Rotate bits in 4 uint64 (3 instructions)
// __m256i mm256_rotr_64( __m256i, int )
#define mm256_rotr_64( w, c ) \
@@ -141,6 +273,119 @@ uint8_t v8 [16];

#endif  // AVX2

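The mm256_rotr_64 body is cut off at the hunk boundary above. As an assumption rather than a quote of the file, the conventional 3-instruction completion pairs the right shift with the complementary left shift:

   // assumed completion -- not quoted from the diff
   #define mm256_rotr_64( w, c ) \
      _mm256_or_si256( _mm256_srli_epi64( w, c ), \
                       _mm256_slli_epi64( w, 64-(c) ) )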
// Replacements for vectorized data

inline void memset_zero_m128i( __m128i *dst, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = _mm_setzero_si128();
}

inline void memset_m128i( __m128i *dst, const __m128i a, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = a;
}

// __m128i versions of optimized copying

inline void mcpy32_m128i( __m128i* dst, const __m128i* src )
{
   __m128i* dest = dst;
   const __m128i* srce = src;
   // 2 loads fill half a cache line
   __m128i a = _mm_load_si128( srce );
   __m128i b = _mm_load_si128( srce + 1 );
   _mm_store_si128( dest, a );
   _mm_store_si128( dest + 1, b );
}

inline void mcpy64_m128i( __m128i* dst, const __m128i* src )
{
   __m128i* dest = dst;
   const __m128i* srce = src;
   // 4 loads fill a cache line
   __m128i a = _mm_load_si128( srce );
   __m128i b = _mm_load_si128( srce + 1 );
   __m128i c = _mm_load_si128( srce + 2 );
   __m128i d = _mm_load_si128( srce + 3 );
   // need to store a before overwriting it
   _mm_store_si128( dest, a );
   a = _mm_load_si128( srce + 4 );
   _mm_store_si128( dest + 1, b );
   b = _mm_load_si128( srce + 5 );
   _mm_store_si128( dest + 2, c );
   c = _mm_load_si128( srce + 6 );
   _mm_store_si128( dest + 3, d );
   d = _mm_load_si128( srce + 7 );
   _mm_store_si128( dest + 4, a );
   _mm_store_si128( dest + 5, b );
   _mm_store_si128( dest + 6, c );
   _mm_store_si128( dest + 7, d );
}

// Variable length

// copy multiples of 64 bytes using quad buffering.
// n is the length in __m128i units (16 bytes each), a multiple of 4
// minimum alignment: 16 bytes
// optimum alignment: 64 bytes (cache line size)
// minimum size.....: 64 bytes (n = 4)
// recommended size.: 96+ bytes
// minimum increment: 64 bytes
inline void memcpy_m128i_x4( __m128i *dst, const __m128i *src, const int n )
{
   // preload 1 cache line to absorb startup latency
   __m128i a = _mm_load_si128( src );
   __m128i b = _mm_load_si128( src + 1 );
   __m128i c = _mm_load_si128( src + 2 );
   __m128i d = _mm_load_si128( src + 3 );

   int i;
   const int loops = n/4 - 1;
   __m128i* dst_n = (__m128i*)(dst + n);

   for ( i = 0; i < loops; i++ )
   {
      const int i4 = i*4;
      const __m128i* si4 = (__m128i*)(src + i4);
      __m128i* di4 = (__m128i*)(dst + i4);

      // need to store a before overwriting it
      _mm_store_si128( di4, a );
      a = _mm_load_si128( si4 + 4 );
      _mm_store_si128( di4 + 1, b );
      b = _mm_load_si128( si4 + 5 );
      _mm_store_si128( di4 + 2, c );
      c = _mm_load_si128( si4 + 6 );
      _mm_store_si128( di4 + 3, d );
      d = _mm_load_si128( si4 + 7 );
   }
   _mm_store_si128( dst_n - 4, a );
   _mm_store_si128( dst_n - 3, b );
   _mm_store_si128( dst_n - 2, c );
   _mm_store_si128( dst_n - 1, d );
}

// basic __m128i copy
inline void memcpy_m128i( __m128i *dst, const __m128i *src, int n )
{
   for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
}

// For cheating with pointer types

// p = any aligned pointer
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))

// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))

// p = any aligned pointer, i = scaled array index
// returns p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])

// rotate bits in 2 uint64
// __m128i mm_rotr_64( __m128i, int )
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \

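A short usage sketch of the pointer-cast macros (hypothetical state array; the explicit alignment matters because the __m128i accesses through the cast must be 16-byte aligned):

   #include <stdint.h>
   #include <immintrin.h>
   #include "avxdefs.h"

   // 64 bytes of hash state, 16-byte aligned so __m128i accesses are legal
   static uint64_t state[8] __attribute__((aligned(16)));

   void xor_first_two_lanes()
   {
      // state[0..1] ^= state[2..3], done as one 128-bit operation;
      // casti_m128i indexes in 16-byte units, so index 1 is state[2..3]
      casti_m128i( state, 0 ) = _mm_xor_si128( casti_m128i( state, 0 ),
                                               casti_m128i( state, 1 ) );
   }

The mm_rotr_64 continuation is truncated by the diff view the same way as the AVX2 version above; its assumed completion mirrors that sketch using _mm_slli_epi64.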
@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.5.5])
AC_INIT([cpuminer-opt], [3.5.6])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

140 cpu-miner.c
@@ -2156,79 +2156,85 @@ static void *stratum_thread(void *userdata )
         }
      }

      while (!stratum.curl)
      {
         pthread_mutex_lock(&g_work_lock);
         g_work_time = 0;
         pthread_mutex_unlock(&g_work_lock);
         restart_threads();
         if (!stratum_connect(&stratum, stratum.url)
               || !stratum_subscribe(&stratum)
               || !stratum_authorize(&stratum, rpc_user, rpc_pass))
         {
            stratum_disconnect(&stratum);
            if (opt_retries >= 0 && ++failures > opt_retries)
            {
               applog(LOG_ERR, "...terminating workio thread");
               tq_push(thr_info[work_thr_id].q, NULL);
               goto out;
            }
            if (!opt_benchmark)
               applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);

            sleep(opt_fail_pause);
         }
      while ( !stratum.curl )
      {
         pthread_mutex_lock(&g_work_lock);
         g_work_time = 0;
         pthread_mutex_unlock(&g_work_lock);
         restart_threads();
         if (!stratum_connect(&stratum, stratum.url)
               || !stratum_subscribe(&stratum)
               || !stratum_authorize(&stratum, rpc_user, rpc_pass))
         {
            stratum_disconnect(&stratum);
            if (opt_retries >= 0 && ++failures > opt_retries)
            {
               applog(LOG_ERR, "...terminating workio thread");
               tq_push(thr_info[work_thr_id].q, NULL);
               goto out;
            }
            if (!opt_benchmark)
               applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
            sleep(opt_fail_pause);
         }

         if (jsonrpc_2)
         {
            work_free(&g_work);
            work_copy(&g_work, &stratum.work);
         }
      }
         if (jsonrpc_2)
         {
            work_free(&g_work);
            work_copy(&g_work, &stratum.work);
         }
      }

      if (stratum.job.job_id &&
          (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
      {
         pthread_mutex_lock(&g_work_lock);
         algo_gate.stratum_gen_work( &stratum, &g_work );
         time(&g_work_time);
         pthread_mutex_unlock(&g_work_lock);
      if (stratum.job.job_id &&
          (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
      {
         pthread_mutex_lock(&g_work_lock);
         algo_gate.stratum_gen_work( &stratum, &g_work );
         time(&g_work_time);
         pthread_mutex_unlock(&g_work_lock);
         restart_threads();

         if (stratum.job.clean || jsonrpc_2)
         {
            static uint32_t last_bloc_height;
            if (!opt_quiet && last_bloc_height != stratum.bloc_height)
            {
               last_bloc_height = stratum.bloc_height;
               if (net_diff > 0.)
                  applog(LOG_BLUE, "%s block %d, diff %.3f",
                         algo_names[opt_algo], stratum.bloc_height, net_diff);
               else
                  applog(LOG_BLUE, "%s %s block %d", short_url,
                         algo_names[opt_algo], stratum.bloc_height);
            }
            restart_threads();
         }
         else if (opt_debug && !opt_quiet)
         {
         if (stratum.job.clean || jsonrpc_2)
         {
            static uint32_t last_bloc_height;
            if ( last_bloc_height != stratum.bloc_height )
            {
               last_bloc_height = stratum.bloc_height;
               if ( !opt_quiet )
               {
                  if (net_diff > 0.)
                     applog(LOG_BLUE, "%s block %d, diff %.3f",
                            algo_names[opt_algo], stratum.bloc_height, net_diff);
                  else
                     applog(LOG_BLUE, "%s %s block %d", short_url,
                            algo_names[opt_algo], stratum.bloc_height);
               }
            }
            restart_threads();
         }
         else if (opt_debug && !opt_quiet)
         {
            applog(LOG_BLUE, "%s asks job %d for block %d", short_url,
                   strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
         }
      }
   }
}

      if (!stratum_socket_full(&stratum, opt_timeout)) {
         applog(LOG_ERR, "Stratum connection timeout");
         s = NULL;
      } else
         s = stratum_recv_line(&stratum);
      if (!s) {
         stratum_disconnect(&stratum);
         applog(LOG_ERR, "Stratum connection interrupted");
         continue;
      }
      if (!stratum_handle_method(&stratum, s))
         stratum_handle_response(s);
      free(s);
      if ( !stratum_socket_full( &stratum, opt_timeout ) )
      {
         applog(LOG_ERR, "Stratum connection timeout");
         s = NULL;
      }
      else
         s = stratum_recv_line(&stratum);
      if ( !s )
      {
         stratum_disconnect(&stratum);
         applog(LOG_ERR, "Stratum connection interrupted");
         continue;
      }
      if (!stratum_handle_method(&stratum, s))
         stratum_handle_response(s);
      free(s);
   }
out:
   return NULL;