Jay D Dee
2017-02-12 12:43:08 -05:00
parent 1ee41348f4
commit 8efab74183
20 changed files with 1891 additions and 1294 deletions

View File

@@ -85,10 +85,11 @@ performance.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos are known to work and have all dependencies in their repositories.
Others may work but may require more effort. 64 bit Windows OS is now supported
with mingw_w64 and msys.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
3. Stratum pool, cpuminer-opt only supports stratum mining.
3. Stratum pool, cpuminer-opt only supports stratum mining. Some algos
may work with wallet mining but there are no guarantees.
Errata
------
@@ -96,7 +97,6 @@ Errata
cpuminer-opt does not work mining Decred algo at Nicehash and produces
only "invalid extranonce2 size" rejects.
x11evo optimizations not available on Windows.
Benchmark testing does not work for x11evo.
Bugs

View File

@@ -3,6 +3,18 @@ Compile instructions for Linux and Windows are at the bottom of this file.
Change Log
----------
v3.5.6
Updated Lyra2z for new zcoin algo post block 2050.
Cleaned up Lyra2 code and increased performance:
- Lyra2REv2 +11%
- Lyra2RE +6%
- Lyra2Z (zcoin) +12%
Fixed performance of x11evo on Windows to match Linux.
Timetravel 3% to 5% faster.
Whirlpool algo 15% faster.
Removed aclocal.m4 from .gitignore.
v3.5.5
x11evo fixed on Windows but at reduced performance.

View File

@@ -3,6 +3,7 @@
#include "cryptonight.h"
#include "miner.h"
#include "crypto/c_keccak.h"
#include "avxdefs.h"
void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
@@ -147,6 +148,11 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
_mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
}
// cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
// casti_m128i( ctx.state.k, 2 ) );
// cast_m128i( ctx.b ) = _mm_xor_si128( casti_m128i( ctx.state.k, 1 ),
// casti_m128i( ctx.state.k, 3 ) );
ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
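The scalar 64-bit XORs above produce exactly the same 32 bytes as the commented-out _mm_xor_si128 form, because each 128-bit lane of ctx.state.k is simply two adjacent uint64_t words. A minimal sketch of that equivalence, as an illustration only (the helper name is hypothetical):

#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <emmintrin.h>

// XORing two 128-bit lanes gives the same bytes as XORing their 64-bit halves.
static void xor_lane_check( const uint64_t k[8] )
{
    uint64_t a[2], b[2];
    a[0] = k[0] ^ k[4];      // scalar form used above for ctx.a
    a[1] = k[1] ^ k[5];
    __m128i v = _mm_xor_si128( _mm_loadu_si128( (const __m128i*)&k[0] ),
                               _mm_loadu_si128( (const __m128i*)&k[4] ) );
    memcpy( b, &v, sizeof b );
    assert( a[0] == b[0] && a[1] == b[1] );
}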
@@ -196,9 +202,12 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
a[1] += lo;
}
uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
// cast_m128i( dst ) = cast_m128i( a );
dst[0] = a[0];
dst[1] = a[1];
// cast_m128i( a ) = _mm_xor_si128( cast_m128i( a ), cast_m128i( b ) );
a[0] ^= b[0];
a[1] ^= b[1];
b_x = c_x;

View File

@@ -275,7 +275,7 @@ HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
for ( i=0; i<10; i++ )
state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
// memset(state->buffer, 0, sizeof state->buffer );
memset(state->buffer, 0, sizeof state->buffer );
return SUCCESS;
}

View File

@@ -21,6 +21,7 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <mm_malloc.h>
#include "compat.h"
#include "lyra2.h"
#include "sponge.h"
@@ -45,10 +46,9 @@
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
*/
// Lyra2RE & Lyra2REv2, nRows must be a power of 2
int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const void *salt, uint64_t saltlen, uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -71,26 +71,21 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
/*
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = malloc(i);
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
memset(wholeMatrix, 0, i);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
if (memMatrix == NULL)
return -1;
//Places the pointers in the correct positions
#endif
*/
uint64_t *ptrWord = wholeMatrix;
for (i = 0; i < nRows; i++)
{
memMatrix[i] = ptrWord;
ptrWord += ROW_LEN_INT64;
}
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
@@ -140,31 +135,36 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
initState(state);
initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
@@ -190,12 +190,14 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//Selects a pseudorandom index row*
//-----------------------------------------------
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
prev = row;
@@ -210,22 +212,17 @@ int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, memMatrix[rowa]);
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
//================== Freeing the memory =============================//
free(memMatrix);
free(wholeMatrix);
// free(wholeMatrix);
return 0;
}
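The key change in this function is that the per-row pointer table (memMatrix) is gone: rows are addressed directly inside the caller-supplied flat wholeMatrix buffer, each row starting ROW_LEN_INT64 words after the previous one. A hypothetical helper, shown only to illustrate the indexing used above:

// Row r of the flat matrix replaces the old memMatrix[r] pointer.
static inline uint64_t* row_ptr( uint64_t *wholeMatrix, int64_t r,
                                 int64_t row_len_int64 )
{
    return &wholeMatrix[ r * row_len_int64 ];
}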
// Zcoin, nRows may be any value
int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const void *salt, uint64_t saltlen, uint64_t timeCost,
uint64_t nRows, uint64_t nCols )
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols )
{
//========================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -244,33 +241,27 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
/*
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
uint64_t *wholeMatrix = malloc(i);
if (wholeMatrix == NULL)
if (wholeMatrix == NULL)
return -1;
memset(wholeMatrix, 0, i);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
if (memMatrix == NULL)
return -1;
//Places the pointers in the correct positions
uint64_t *ptrWord = wholeMatrix;
for (i = 0; i < nRows; i++)
{
memMatrix[i] = ptrWord;
ptrWord += ROW_LEN_INT64;
}
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
memset(wholeMatrix, 0, i);
#endif
*/
//==== Getting the password + salt + basil padded with 10*1 ============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
byte *ptrByte = (byte*) wholeMatrix;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
@@ -281,7 +272,6 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
@@ -304,11 +294,15 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//=================== Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
// if (state == NULL) {
// return -1;
// }
initState( state );
//============================== Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
uint64_t *ptrWord = wholeMatrix;
for ( i = 0; i < nBlocksInput; i++ )
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
@@ -316,31 +310,28 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
}
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, memMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1( state, memMatrix[0], memMatrix[1], nCols );
reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, memMatrix[prev], memMatrix[rowa],
memMatrix[row], nCols );
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window were visited.
if (rowa == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
//Checks if all rows in the window were visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
} while (row < nRows);
//======================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
@@ -351,20 +342,19 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
do {
//Selects a pseudorandom index row*
//----------------------------------------------------------------------
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-----------------------------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, memMatrix[prev], memMatrix[rowa],
memMatrix[row], nCols );
reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//---------------------------------------------------------------
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//--------------------------------------------------------------------
@@ -373,15 +363,190 @@ int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
//========================= Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock( state, memMatrix[rowa] );
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
//Squeezes the key
squeeze( state, K, kLen );
//====================== Freeing the memory =============================//
free( memMatrix );
free( wholeMatrix );
// _mm_free(state);
// _mm_free( wholeMatrix );
return 0;
}
int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
//====================================================================/
//=== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
memset(wholeMatrix, 0, i);
#endif
uint64_t *ptrWord = wholeMatrix;
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window were visited.
if (rowa == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
//===================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++)
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
do
{
//Selects a pseudorandom index row*
//-----------------------------------------------
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//----------------------------------------------------
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//----------------------------------------------------
} while (row != 0);
}
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
//================== Freeing the memory =============================//
free(wholeMatrix);
return 0;
}

View File

@@ -37,10 +37,20 @@ typedef unsigned char byte;
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif
int LYRA2( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const void *salt, uint64_t saltlen, uint64_t timeCost,
uint64_t nRows, uint64_t nCols );
int LYRA2Z( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
const void *salt, uint64_t saltlen, uint64_t timeCost,
uint64_t nRows, uint64_t nCols );
#define BLOCK_LEN_M256I (BLOCK_LEN_INT64 / 4 )
#define BLOCK_LEN_M128I (BLOCK_LEN_INT64 / 2 )
int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV2( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
#endif /* LYRA2_H_ */
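The two variants that take a matrix argument (LYRA2REV2 and LYRA2Z) expect the caller to own a 64-byte-aligned buffer of nRows * BLOCK_LEN_INT64 * nCols 64-bit words, allocated once per thread, while LYRA2RE still allocates and frees its own matrix on every call. A sizing sketch, assuming the usual BLOCK_LEN_INT64 value of 12 (illustration only):

// Lyra2REv2 parameters: nRows = nCols = 4.
size_t nRows = 4, nCols = 4;
size_t row_bytes    = (size_t)BLOCK_LEN_INT64 * nCols * 8;   // 12 * 4 * 8 = 384 bytes
size_t matrix_bytes = row_bytes * nRows;                     // 1536 bytes total
uint64_t *matrix    = (uint64_t*)_mm_malloc( matrix_bytes, 64 );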

View File

@@ -7,11 +7,14 @@
#include "algo/keccak/sph_keccak.h"
#include "lyra2.h"
#include "algo-gate-api.h"
#include "avxdefs.h"
#ifndef NO_AES_NI
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif
//__thread uint64_t* lyra2re_wholeMatrix;
typedef struct {
sph_blake256_context blake;
sph_keccak256_context keccak;
@@ -24,6 +27,7 @@ typedef struct {
} lyra2re_ctx_holder;
lyra2re_ctx_holder lyra2re_ctx;
static __thread sph_blake256_context lyra2_blake_mid;
void init_lyra2re_ctx()
{
@@ -37,6 +41,12 @@ void init_lyra2re_ctx()
#endif
}
void lyra2_blake256_midstate( const void* input )
{
memcpy( &lyra2_blake_mid, &lyra2re_ctx.blake, sizeof lyra2_blake_mid );
sph_blake256( &lyra2_blake_mid, input, 64 );
}
void lyra2re_hash(void *state, const void *input)
{
lyra2re_ctx_holder ctx;
@@ -47,13 +57,19 @@ void lyra2re_hash(void *state, const void *input)
#define hashA hash
#define hashB hash+16
sph_blake256(&ctx.blake, input, 80);
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
memcpy( &ctx.blake, &lyra2_blake_mid, sizeof lyra2_blake_mid );
sph_blake256( &ctx.blake, input + 64, 16 );
// sph_blake256(&ctx.blake, input, 80);
sph_blake256_close(&ctx.blake, hashA);
sph_keccak256(&ctx.keccak, hashA, 32);
sph_keccak256_close(&ctx.keccak, hashB);
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
LYRA2RE( hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_skein256(&ctx.skein, hashA, 32);
sph_skein256_close(&ctx.skein, hashB);
@@ -81,6 +97,8 @@ int scanhash_lyra2re(int thr_id, struct work *work,
swab32_array( endiandata, pdata, 20 );
lyra2_blake256_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2re_hash(hash, endiandata);
@@ -112,10 +130,34 @@ void lyra2re_set_target ( struct work* work, double job_diff )
work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
}
/*
bool lyra2re_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
lyra2re_wholeMatrix = _mm_malloc( i, 64 );
if ( lyra2re_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)lyra2re_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)lyra2re_wholeMatrix, i/16 );
#else
memset( lyra2re_wholeMatrix, 0, i );
#endif
return true;
}
*/
bool register_lyra2re_algo( algo_gate_t* gate )
{
init_lyra2re_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
// gate->miner_thread_init = (void*)&lyra2re_thread_init;
gate->scanhash = (void*)&scanhash_lyra2re;
gate->hash = (void*)&lyra2re_hash;
gate->hash_alt = (void*)&lyra2re_hash;
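The speedup in this file comes from caching a blake256 midstate over the constant first 64 bytes of the 80-byte block header, so each nonce attempt only absorbs the 16-byte tail that carries the nonce. A minimal sketch of the pattern using the sph_blake256 calls seen above (illustration only; the helper names are hypothetical):

static sph_blake256_context blake_mid;                 // midstate over bytes 0..63

void precompute_blake_mid( const void *header80 )
{
    sph_blake256_init( &blake_mid );
    sph_blake256( &blake_mid, header80, 64 );          // constant part of the header
}

void blake_from_mid( void *out32, const void *header80 )
{
    sph_blake256_context c = blake_mid;                // resume from the midstate
    sph_blake256( &c, (const uint8_t*)header80 + 64, 16 );  // nonce-bearing tail
    sph_blake256_close( &c, out32 );                   // 32-byte digest
}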

View File

@@ -8,10 +8,11 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "lyra2.h"
#include "avxdefs.h"
__thread uint64_t* l2v2_wholeMatrix;
typedef struct {
cubehashParam cube1;
@@ -23,7 +24,8 @@ typedef struct {
} lyra2v2_ctx_holder;
lyra2v2_ctx_holder lyra2v2_ctx;
static lyra2v2_ctx_holder lyra2v2_ctx;
static __thread sph_blake256_context l2v2_blake_mid;
void init_lyra2rev2_ctx()
{
@@ -35,14 +37,23 @@ void init_lyra2rev2_ctx()
sph_bmw256_init( &lyra2v2_ctx.bmw );
}
void l2v2_blake256_midstate( const void* input )
{
memcpy( &l2v2_blake_mid, &lyra2v2_ctx.blake, sizeof l2v2_blake_mid );
sph_blake256( &l2v2_blake_mid, input, 64 );
}
void lyra2rev2_hash( void *state, const void *input )
{
lyra2v2_ctx_holder ctx;
memcpy( &ctx, &lyra2v2_ctx, sizeof(lyra2v2_ctx) );
uint32_t _ALIGN(128) hashA[8], hashB[8];
sph_blake256( &ctx.blake, input, 80 );
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
memcpy( &ctx.blake, &l2v2_blake_mid, sizeof l2v2_blake_mid );
sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
sph_blake256_close( &ctx.blake, hashA );
sph_keccak256( &ctx.keccak, hashA, 32 );
@@ -50,18 +61,14 @@ void lyra2rev2_hash( void *state, const void *input )
cubehashUpdateDigest( &ctx.cube1, (byte*) hashA,
(const byte*) hashB, 32 );
// cubehashUpdate( &ctx.cube1, (const byte*) hashB,32 );
// cubehashDigest( &ctx.cube1, (byte*)hashA );
LYRA2( hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 );
sph_skein256( &ctx.skein, hashA, 32 );
sph_skein256_close( &ctx.skein, hashB );
cubehashUpdateDigest( &ctx.cube2, (byte*) hashA,
(const byte*) hashB, 32 );
// cubehashUpdate( &ctx.cube2, (const byte*) hashB,32 );
// cubehashDigest( &ctx.cube2, (byte*)hashA );
sph_bmw256( &ctx.bmw, hashA, 32 );
sph_bmw256_close( &ctx.bmw, hashB );
@@ -85,6 +92,8 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
swab32_array( endiandata, pdata, 20 );
l2v2_blake256_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2rev2_hash(hash, endiandata);
@@ -112,10 +121,33 @@ void lyra2rev2_set_target( struct work* work, double job_diff )
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
if ( l2v2_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)l2v2_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)l2v2_wholeMatrix, i/16 );
#else
memset( l2v2_wholeMatrix, 0, i );
#endif
return true;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
init_lyra2rev2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
gate->hash_alt = (void*)&lyra2rev2_hash;

File diff suppressed because it is too large.

View File

@@ -51,24 +51,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#if defined __AVX2__
// only available with avx2
// init vectors from memory
// returns void, updates defines and inits implicit args a, b, c, d
#define LYRA_INIT_AVX2 \
__m256i a[4]; \
a[0] = _mm256_load_si256( (__m256i*)(&v[ 0]) ); \
a[1] = _mm256_load_si256( (__m256i*)(&v[ 4]) ); \
a[2] = _mm256_load_si256( (__m256i*)(&v[ 8]) ); \
a[3] = _mm256_load_si256( (__m256i*)(&v[12]) );
// save to memory
// returns void
#define LYRA_CLOSE_AVX2 \
_mm256_store_si256( (__m256i*)(&v[ 0]), a[0] ); \
_mm256_store_si256( (__m256i*)(&v[ 4]), a[1] ); \
_mm256_store_si256( (__m256i*)(&v[ 8]), a[2] ); \
_mm256_store_si256( (__m256i*)(&v[12]), a[3] );
// process 4 rows in parallel
// process 4 columns in parallel
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
@@ -107,28 +90,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#else
// only available with avx
#define LYRA_INIT_AVX \
__m128i a0[4], a1[4]; \
a0[0] = _mm_load_si128( (__m128i*)(&v[ 0]) ); \
a1[0] = _mm_load_si128( (__m128i*)(&v[ 2]) ); \
a0[1] = _mm_load_si128( (__m128i*)(&v[ 4]) ); \
a1[1] = _mm_load_si128( (__m128i*)(&v[ 6]) ); \
a0[2] = _mm_load_si128( (__m128i*)(&v[ 8]) ); \
a1[2] = _mm_load_si128( (__m128i*)(&v[10]) ); \
a0[3] = _mm_load_si128( (__m128i*)(&v[12]) ); \
a1[3] = _mm_load_si128( (__m128i*)(&v[14]) );
#define LYRA_CLOSE_AVX \
_mm_store_si128( (__m128i*)(&v[ 0]), a0[0] ); \
_mm_store_si128( (__m128i*)(&v[ 2]), a1[0] ); \
_mm_store_si128( (__m128i*)(&v[ 4]), a0[1] ); \
_mm_store_si128( (__m128i*)(&v[ 6]), a1[1] ); \
_mm_store_si128( (__m128i*)(&v[ 8]), a0[2] ); \
_mm_store_si128( (__m128i*)(&v[10]), a1[2] ); \
_mm_store_si128( (__m128i*)(&v[12]), a0[3] ); \
_mm_store_si128( (__m128i*)(&v[14]), a1[3] );
// process 2 rows in parallel
// process 2 columns in parallel
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
@@ -140,68 +102,35 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
c = _mm_add_epi64( c, d ); \
b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX \
G_2X64( a0[0], a0[1], a0[2], a0[3] ); \
G_2X64( a1[0], a1[1], a1[2], a1[3] ); \
mm128_rotl256_1x64( a0[1], a1[1] ); \
mm128_swap128( a0[2], a1[2] ); \
mm128_rotr256_1x64( a0[3], a1[3] ); \
G_2X64( a0[0], a0[1], a0[2], a0[3] ); \
G_2X64( a1[0], a1[1], a1[2], a1[3] ); \
mm128_rotr256_1x64( a0[1], a1[1] ); \
mm128_swap128( a0[2], a1[2] ); \
mm128_rotl256_1x64( a0[3], a1[3] );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rotl256_1x64( s2, s3 ); \
mm128_swap128( s4, s5 ); \
mm128_rotr256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rotr256_1x64( s2, s3 ); \
mm128_swap128( s4, s5 ); \
mm128_rotl256_1x64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
#endif // AVX2
/*
#if defined __AVX__
// can coexist with AVX2
// rotate each uint64 c bits
// _m128i
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
_mm_slli_epi64(w, 64 - c))
// swap 128 bit source vectors, equivalent of rotating 256 bits by 128 bits
// void
#define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
s1 = _mm_xor_si128(s0, s1); \
s0 = _mm_xor_si128(s0, s1);
// swap uint64 in 128 bit source vector, equivalent of rotating 128 bits by
// 64 bits (8 bytes)
// __m128i
#define mm128_swap64(s) _mm_or_si128( _mm_slli_si128( s, 8 ), \
_mm_srli_si128( s, 8 ) )
// rotate 2 128 bit vectors as one 256 vector by 1 uint64, very inefficient
// returns void, args updated
#define mm128_rotl256_1x64(s0, s1) do { \
__m128i t; \
s0 = mm128_swap64( s0); \
s1 = mm128_swap64( s1); \
t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s0 = t; \
} while(0)
#define mm128_rotr256_1x64(s0, s1) do { \
__m128i t; \
s0 = mm128_swap64( s0); \
s1 = mm128_swap64( s1); \
t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s0 = t; \
} while(0)
#endif // AVX
*/
// Scalar
//Blake2b's G function
#define G(r,i,a,b,c,d) \

View File

@@ -1,20 +1,40 @@
#include <memory.h>
#include <mm_malloc.h>
#include "miner.h"
#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"
void zcoin_hash(void *state, const void *input, uint32_t height)
__thread uint64_t* zcoin_wholeMatrix;
static __thread sph_blake256_context zcoin_blake_mid;
void zcoin_midstate( const void* input )
{
uint32_t _ALIGN(256) hash[16];
// LYRA2Z(hash, 32, input, 80, input, 80, 2, height, 256);
LYRA2Z(hash, 32, input, 80, input, 80, 2, 8192, 256);
memcpy(state, hash, 32);
sph_blake256_init( &zcoin_blake_mid );
sph_blake256( &zcoin_blake_mid, input, 64 );
}
// Block 2050 new algo: blake plus new lyra params. The new input
// is a power of 2, so normal lyra can be used.
//void zcoin_hash(void *state, const void *input, uint32_t height)
void zcoin_hash(void *state, const void *input )
{
uint32_t _ALIGN(256) hash[16];
sph_blake256_context ctx_blake;
memcpy( &ctx_blake, &zcoin_blake_mid, sizeof zcoin_blake_mid );
sph_blake256( &ctx_blake, input + 64, 16 );
sph_blake256_close( &ctx_blake, hash );
LYRA2Z( zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
memcpy(state, hash, 32);
}
//int scanhash_zcoin(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t height)
int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
@@ -25,6 +45,7 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if (opt_benchmark)
ptarget[7] = 0x0000ff;
@@ -32,9 +53,11 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]);
}
zcoin_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
zcoin_hash( hash, endiandata, work->height );
zcoin_hash( hash, endiandata );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
@@ -57,22 +80,45 @@ void zcoin_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
/*
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
*/
bool zcoin_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
zcoin_wholeMatrix = _mm_malloc( i, 64 );
if ( zcoin_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)zcoin_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)zcoin_wholeMatrix, i/16 );
#else
memset( zcoin_wholeMatrix, 0, i );
#endif
return true;
}
bool register_zcoin_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&zcoin_thread_init;
gate->scanhash = (void*)&scanhash_zcoin;
gate->hash = (void*)&zcoin_hash;
gate->hash_alt = (void*)&zcoin_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&zcoin_set_target;
gate->prevent_dupes = (void*)&zcoin_get_work_height;
// gate->prevent_dupes = (void*)&zcoin_get_work_height;
return true;
};

View File

@@ -2,13 +2,15 @@
#include "miner.h"
#include "algo-gate-api.h"
#include "lyra2.h"
#include "avxdefs.h"
__thread uint64_t* zoin_wholeMatrix;
void zoin_hash(void *state, const void *input, uint32_t height)
{
uint32_t _ALIGN(256) hash[16];
LYRA2Z(hash, 32, input, 80, input, 80, 2, 330, 256);
LYRA2Z( zoin_wholeMatrix, hash, 32, input, 80, input, 80, 2, 330, 256);
memcpy(state, hash, 32);
}
@@ -53,22 +55,45 @@ void zoin_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
/*
bool zoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
*/
bool zoin_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 256; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 330; // nRows;
zoin_wholeMatrix = _mm_malloc( i, 64 );
if ( zoin_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)zoin_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)zoin_wholeMatrix, i/16 );
#else
memset( zoin_wholeMatrix, 0, i );
#endif
return true;
}
bool register_zoin_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&zoin_thread_init;
gate->scanhash = (void*)&scanhash_zoin;
gate->hash = (void*)&zoin_hash;
gate->hash_alt = (void*)&zoin_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&zoin_set_target;
gate->prevent_dupes = (void*)&zoin_get_work_height;
// gate->prevent_dupes = (void*)&zoin_get_work_height;
return true;
};

View File

@@ -175,13 +175,13 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
memcpy(data, pdata, 80);
sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN );
sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN );
sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN );
sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN );
sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN );
sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN );
mpz_t magipi, magisw, product, bns0, bns1;
mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;
@@ -228,40 +228,11 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );
/*
ctx2_sha256 = ctx_sha256;
sph_sha256 (&ctx2_sha256, data_p64, 80 - M7_MIDSTATE_LEN);
sph_sha256_close(&ctx2_sha256, (void*)(bhash[0]));
ctx2_sha512 = ctx_sha512;
sph_sha512 (&ctx2_sha512, data_p64, 80 - M7_MIDSTATE_LEN);
sph_sha512_close(&ctx2_sha512, (void*)(bhash[1]));
ctx2_keccak = ctx_keccak;
sph_keccak512 (&ctx2_keccak, data_p64, 80 - M7_MIDSTATE_LEN);
sph_keccak512_close(&ctx2_keccak, (void*)(bhash[2]));
ctx2_whirlpool = ctx_whirlpool;
sph_whirlpool (&ctx2_whirlpool, data_p64, 80 - M7_MIDSTATE_LEN);
sph_whirlpool_close(&ctx2_whirlpool, (void*)(bhash[3]));
ctx2_haval = ctx_haval;
sph_haval256_5 (&ctx2_haval, data_p64, 80 - M7_MIDSTATE_LEN);
sph_haval256_5_close(&ctx2_haval, (void*)(bhash[4]));
ctx2_tiger = ctx_tiger;
sph_tiger (&ctx2_tiger, data_p64, 80 - M7_MIDSTATE_LEN);
sph_tiger_close(&ctx2_tiger, (void*)(bhash[5]));
ctx2_ripemd = ctx_ripemd;
sph_ripemd160 (&ctx2_ripemd, data_p64, 80 - M7_MIDSTATE_LEN);
sph_ripemd160_close(&ctx2_ripemd, (void*)(bhash[6]));
*/
mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
mpz_set(bns1, bns0);
mpz_set(product, bns0);
for(int i=1; i < 7; i++){
for ( i=1; i < 7; i++ )
{
mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
mpz_add(bns1, bns1, bns0);
mpz_mul(product, product, bns0);
@@ -275,11 +246,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, (void*)(hash) );
/*
sph_sha256 (&ctx_final_sha256, bdata, bytes);
sph_sha256_close(&ctx_final_sha256, (void*)(hash));
*/
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
mpf_set_prec_raw(magifpi, prec);
@@ -291,7 +257,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
mpzscale = 1;
mpz_set_ui(magisw, usw_);
for(i = 0; i < 5; i++)
for ( i = 0; i < 5; i++ )
{
mpf_set_d(mpt1, 0.25*mpzscale);
mpf_sub(mpt1, mpt1, mpt2);
@@ -314,23 +280,22 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, (void*)(hash) );
/*
sph_sha256 (&ctx_final_sha256, bdata, bytes);
sph_sha256_close(&ctx_final_sha256, (void*)(hash));
*/
}
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for (i = 31; i >= 0; i--) {
if (hash_[i] != target_[i]) {
for ( i = 31; i >= 0; i-- )
{
if ( hash_[i] != target_[i] )
{
rc = hash_[i] < target_[i];
break;
}
}
if (unlikely(rc)) {
if (opt_debug) {
if ( unlikely(rc) )
{
if ( opt_debug )
{
bin2hex(hash_str, (unsigned char *)hash, 32);
bin2hex(target_str, (unsigned char *)ptarget, 32);
bin2hex(data_str, (unsigned char *)data, 80);
@@ -343,20 +308,22 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
goto out;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
pdata[19] = n;
out:
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);
mpf_set_prec_raw(mptmp, prec0);
mpf_set_prec_raw(mpt1, prec0);
mpf_set_prec_raw(mpt2, prec0);
mpf_clear(magifpi);
mpf_clear(magifpi0);
mpf_clear(mpten);
mpf_clear(mptmp);
mpf_clear(mpt1);
mpf_clear(mpt2);
mpz_clears(magipi, magisw, product, bns0, bns1, NULL);
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);
mpf_set_prec_raw(mptmp, prec0);
mpf_set_prec_raw(mpt1, prec0);
mpf_set_prec_raw(mpt2, prec0);
mpf_clear(magifpi);
mpf_clear(magifpi0);
mpf_clear(mpten);
mpf_clear(mptmp);
mpf_clear(mpt1);
mpf_clear(mpt2);
mpz_clears(magipi, magisw, product, bns0, bns1, NULL);
*hashes_done = n - first_nonce + 1;
return rc;
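The loop above folds the seven digests into a running big-integer sum and product by importing each digest with mpz_import and multiplying with mpz_mul. A standalone GMP sketch of that import-and-multiply step for two 32-byte digests (illustration only; the byte-wise import parameters here are one valid choice and may differ from the miner's own):

#include <stdint.h>
#include <gmp.h>

// Multiply two 32-byte digests interpreted as little-endian big integers.
void digest_product( const uint8_t d0[32], const uint8_t d1[32] )
{
    mpz_t a, b, product;
    mpz_inits( a, b, product, NULL );
    mpz_import( a, 32, -1, 1, -1, 0, d0 );   // 32 one-byte words, least significant first
    mpz_import( b, 32, -1, 1, -1, 0, d1 );
    mpz_mul( product, a, b );
    gmp_printf( "%Zx\n", product );          // hex dump of the product
    mpz_clears( a, b, product, NULL );
}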

View File

@@ -5,6 +5,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
@@ -99,6 +100,7 @@ typedef struct {
} tt_ctx_holder;
tt_ctx_holder tt_ctx;
__thread tt_ctx_holder tt_mid;
void init_tt_ctx()
{
@@ -125,6 +127,8 @@ void timetravel_hash(void *output, const void *input)
tt_ctx_holder ctx;
memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
int i;
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
{
@@ -140,50 +144,129 @@ void timetravel_hash(void *output, const void *input)
}
hashB = &hash[16 * i];
switch ( permutation[i] )
switch ( permutation[i] )
{
case 0:
// if ( i == 0 )
// {
// memcpy( &ctx.blake, &tt_mid.blake, sizeof tt_mid.blake );
// sph_blake256( &ctx.blake, input + midlen, tail );
// sph_blake256_close( &ctx.blake, hashB );
// }
// else
// {
sph_blake512( &ctx.blake, hashA, dataLen );
sph_blake512_close( &ctx.blake, hashB );
// }
break;
case 1:
if ( i == 0 )
{
case 0:
sph_blake512( &ctx.blake, hashA, dataLen );
sph_blake512_close( &ctx.blake, hashB );
break;
case 1:
sph_bmw512( &ctx.bmw, hashA, dataLen );
sph_bmw512_close( &ctx.bmw, hashB );
break;
case 2:
memcpy( &ctx.bmw, &tt_mid.bmw, sizeof tt_mid.bmw );
sph_bmw512( &ctx.bmw, input + midlen, tail );
sph_bmw512_close( &ctx.bmw, hashB );
}
else
{
sph_bmw512( &ctx.bmw, hashA, dataLen );
sph_bmw512_close( &ctx.bmw, hashB );
}
break;
case 2:
#ifdef NO_AES_NI
sph_groestl512( &ctx.groestl, hashA, dataLen );
sph_groestl512_close( &ctx.groestl, hashB );
if ( i == 0 )
{
memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
sph_groestl512( &ctx.groestl, input + midlen, tail );
sph_groestl512_close( &ctx.groestl, hashB );
}
else
{
sph_groestl512( &ctx.groestl, hashA, dataLen );
sph_groestl512_close( &ctx.groestl, hashB );
}
#else
update_and_final_groestl( &ctx.groestl, (char*)hashB,
(char*)hashA, dataLen*8 );
if ( i == 0 )
{
memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hashB,
(char*)input + midlen, tail*8 );
}
else
{
update_and_final_groestl( &ctx.groestl, (char*)hashB,
(char*)hashA, dataLen*8 );
}
#endif
break;
case 3:
sph_skein512( &ctx.skein, hashA, dataLen );
sph_skein512_close( &ctx.skein, hashB );
break;
case 4:
sph_jh512( &ctx.jh, hashA, dataLen );
sph_jh512_close( &ctx.jh, hashB);
break;
case 5:
sph_keccak512( &ctx.keccak, hashA, dataLen );
sph_keccak512_close( &ctx.keccak, hashB );
break;
case 6:
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence*)hashA, dataLen );
break;
case 7:
cubehashUpdateDigest( &ctx.cube, (byte*)hashB,
(const byte*) hashA, dataLen );
break;
default:
break;
}
}
break;
case 3:
if ( i == 0 )
{
memcpy( &ctx.skein, &tt_mid.skein, sizeof tt_mid.skein );
sph_skein512( &ctx.skein, input + midlen, tail );
sph_skein512_close( &ctx.skein, hashB );
}
else
{
sph_skein512( &ctx.skein, hashA, dataLen );
sph_skein512_close( &ctx.skein, hashB );
}
break;
case 4:
if ( i == 0 )
{
memcpy( &ctx.jh, &tt_mid.jh, sizeof tt_mid.jh );
sph_jh512( &ctx.jh, input + midlen, tail );
sph_jh512_close( &ctx.jh, hashB );
}
else
{
sph_jh512( &ctx.jh, hashA, dataLen );
sph_jh512_close( &ctx.jh, hashB);
}
break;
case 5:
if ( i == 0 )
{
memcpy( &ctx.keccak, &tt_mid.keccak, sizeof tt_mid.keccak );
sph_keccak512( &ctx.keccak, input + midlen, tail );
sph_keccak512_close( &ctx.keccak, hashB );
}
else
{
sph_keccak512( &ctx.keccak, hashA, dataLen );
sph_keccak512_close( &ctx.keccak, hashB );
}
break;
case 6:
// if ( i == 0 )
// {
// memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa );
// update_and_final_luffa( &ctx.luffa, hashB,
// input + 64, 16 );
// }
// else
// {
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
hashA, dataLen );
// }
break;
case 7:
if ( i == 0 )
{
memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube );
cubehashUpdateDigest( &ctx.cube, hashB,
input + midlen, tail );
}
else
{
cubehashUpdateDigest( &ctx.cube, hashB, hashA, dataLen );
}
break;
default:
break;
}
}
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
}
@@ -191,52 +274,98 @@ void timetravel_hash(void *output, const void *input)
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
int i;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
int i;
if (opt_benchmark)
ptarget[7] = 0x0cff;
if (opt_benchmark)
ptarget[7] = 0x0cff;
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
s_ntime = timestamp;
// do midstate precalc for first function
switch ( permutation[0] )
{
case 0:
// memcpy( &tt_mid.blake, &tt_ctx.blake, sizeof(tt_mid.blake) );
// sph_blake256( &tt_mid.blake, endiandata, 64 );
break;
case 1:
memcpy( &tt_mid.bmw, &tt_ctx.bmw, sizeof(tt_mid.bmw) );
sph_bmw512( &tt_mid.bmw, endiandata, 64 );
break;
case 2:
#ifdef NO_AES_NI
memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
sph_groestl512( &tt_mid.groestl, endiandata, 64 );
#else
memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 );
#endif
break;
case 3:
memcpy( &tt_mid.skein, &tt_ctx.skein, sizeof(tt_mid.skein ) );
sph_skein512( &tt_mid.skein, endiandata, 64 );
break;
case 4:
memcpy( &tt_mid.jh, &tt_ctx.jh, sizeof(tt_mid.jh ) );
sph_jh512( &tt_mid.jh, endiandata, 64 );
break;
case 5:
memcpy( &tt_mid.keccak, &tt_ctx.keccak, sizeof(tt_mid.keccak ) );
sph_keccak512( &tt_mid.keccak, endiandata, 64 );
break;
case 6:
// init_luffa( &tt_mid.luffa, 512 );
// memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) );
// update_luffa( &tt_mid.luffa, endiandata, 64 );
break;
case 7:
memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) );
cubehashUpdate( &tt_mid.cube, endiandata, 64 );
break;
default:
break;
}
}
do {
be32enc( &endiandata[19], nonce );
timetravel_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
{
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
s_ntime = timestamp;
}
work_set_target_ratio( work, hash );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
do {
be32enc(&endiandata[19], nonce);
timetravel_hash(hash, endiandata);
} while (nonce < max_nonce && !(*restart));
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void timetravel_set_target( struct work* work, double job_diff )

View File

@@ -95,5 +95,6 @@ bool register_veltor_algo( algo_gate_t* gate )
gate->hash = (void*)&veltorhash;
gate->hash_alt = (void*)&veltorhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

View File

@@ -7,44 +7,58 @@
#include <stdio.h>
#include "sph_whirlpool.h"
typedef struct {
sph_whirlpool_context whirl1;
sph_whirlpool_context whirl2;
sph_whirlpool_context whirl3;
sph_whirlpool_context whirl4;
} whirlpool_ctx_holder;
static whirlpool_ctx_holder whirl_ctx;
static __thread sph_whirlpool_context whirl1_mid_ctx;
void init_whirlpool_ctx()
{
sph_whirlpool1_init( &whirl_ctx.whirl1 );
sph_whirlpool1_init( &whirl_ctx.whirl2 );
sph_whirlpool1_init( &whirl_ctx.whirl3 );
sph_whirlpool1_init( &whirl_ctx.whirl4 );
}
void whirlpool_hash(void *state, const void *input)
{
sph_whirlpool_context ctx_whirlpool;
whirlpool_ctx_holder ctx;
memcpy( &ctx, &whirl_ctx, sizeof(whirl_ctx) );
const int midlen = 64;
const int tail = 80 - midlen;
unsigned char hash[128]; // uint32_t hashA[16], hashB[16];
#define hashB hash+64
memset(hash, 0, sizeof hash);
// copy cached midstate
memcpy( &ctx.whirl1, &whirl1_mid_ctx, sizeof whirl1_mid_ctx );
sph_whirlpool1( &ctx.whirl1, input + midlen, tail );
sph_whirlpool1_close(&ctx.whirl1, hash);
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, input, 80);
sph_whirlpool1_close(&ctx_whirlpool, hash);
sph_whirlpool1(&ctx.whirl2, hash, 64);
sph_whirlpool1_close(&ctx.whirl2, hashB);
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, hash, 64);
sph_whirlpool1_close(&ctx_whirlpool, hashB);
sph_whirlpool1(&ctx.whirl3, hashB, 64);
sph_whirlpool1_close(&ctx.whirl3, hash);
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, hashB, 64);
sph_whirlpool1_close(&ctx_whirlpool, hash);
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, hash, 64);
sph_whirlpool1_close(&ctx_whirlpool, hash);
sph_whirlpool1(&ctx.whirl4, hash, 64);
sph_whirlpool1_close(&ctx.whirl4, hash);
memcpy(state, hash, 32);
}
void whirlpool_midstate(void *state, const void *input)
void whirlpool_midstate( const void* input )
{
sph_whirlpool_context ctx;
sph_whirlpool1_init(&ctx);
sph_whirlpool1(&ctx, input, 64);
memcpy(state, ctx.state, 64);
memcpy( &whirl1_mid_ctx, &whirl_ctx.whirl1, sizeof whirl1_mid_ctx );
sph_whirlpool1( &whirl1_mid_ctx, input, 64 );
}
int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(128) endiandata[20];
@@ -59,6 +73,8 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
whirlpool_midstate( endiandata );
do {
const uint32_t Htarg = ptarget[7];
uint32_t vhash[8];
@@ -82,9 +98,9 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign
bool register_whirlpool_algo( algo_gate_t* gate )
{
algo_not_tested();
gate->scanhash = (void*)&scanhash_whirlpool;
gate->hash = (void*)&whirlpool_hash;
init_whirlpool_ctx();
return true;
};

View File

@@ -71,10 +71,18 @@ void init_x11evo_ctx()
sph_shavite512_init( &x11evo_ctx.shavite );
}
/*
uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
{
return (current_time - base_time) / (60 * 60 * 24);
}
*/
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
}
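getCurrentAlgoSeq maps the block time to a day index relative to INITIAL_DATE (defined elsewhere in this file), so the hash ordering changes once every 86400 seconds rather than on every call. A worked example under that assumption (illustration only):

int s0 = getCurrentAlgoSeq( INITIAL_DATE );           // day 0 -> sequence 0
int s1 = getCurrentAlgoSeq( INITIAL_DATE + 86400 );   // one day later -> sequence 1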
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
@@ -136,41 +144,37 @@ void getAlgoString( char *str, uint32_t count )
//applog(LOG_DEBUG, "nextPerm %s", str);
}
// Broken on Windows
#if !((defined(__WINDOWS__)) || (defined(__WIN64)))
static __thread uint32_t saved_ntime = UINT32_MAX;
#endif
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
static int s_seq = -1;
void evocoin_twisted_code( char *result, char *code )
static void evo_twisted_code(uint32_t ntime, char *permstr)
{
uint32_t h32, *be32 = get_stratum_job_ntime();
#if !((defined(__WINDOWS__)) || (defined(__WIN64)))
if ( *be32 != saved_ntime )
{
#endif
h32 = be32toh(*be32);
uint32_t count = getCurrentAlgoSeq(h32, INITIAL_DATE);
getAlgoString(code, count);
sprintf(result, "_%d_%s_", count, code);
#if !((defined(__WINDOWS__)) || (defined(__WIN64)))
saved_ntime = *be32;
}
#endif
int seq = getCurrentAlgoSeq(ntime);
if (s_seq != seq)
{
getAlgoString(permstr, seq);
s_seq = seq;
}
}
static inline void x11evo_hash( void *state, const void *input )
{
uint32_t hash[16];
char completeCode[64];
char resultCode[HASH_FUNC_COUNT + 1];
x11evo_ctx_holder ctx;
memcpy( &ctx, &x11evo_ctx, sizeof(x11evo_ctx) );
evocoin_twisted_code( completeCode, resultCode );
if ( s_seq == -1 )
{
uint32_t *data = (uint32_t*) input;
const uint32_t ntime = data[17];
evo_twisted_code(ntime, hashOrder);
}
int i;
for ( i = 0; i < strlen(resultCode); i++ )
for ( i = 0; i < strlen(hashOrder); i++ )
{
char elem = resultCode[i];
char elem = hashOrder[i];
uint8_t idx;
if (elem >= 'A')
idx = elem - 'A' + 10;
@@ -196,8 +200,6 @@ static inline void x11evo_hash( void *state, const void *input )
#else
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
// update_groestl( &ctx.groestl, (char*)hash, 512 );
// final_groestl( &ctx.groestl, (char*)hash );
#endif
break;
case 3:
@@ -215,14 +217,10 @@ static inline void x11evo_hash( void *state, const void *input )
case 6:
update_and_final_luffa( &ctx.luffa, (char*)hash,
(const char*)hash, 64 );
// update_luffa( &ctx.luffa, (char*)hash, 64 );
// final_luffa( &ctx.luffa, (char*)hash );
break;
case 7:
cubehashUpdateDigest( &ctx.cube, (char*)hash,
(const char*)hash, 64 );
// cubehashUpdate( &ctx.cube, (char*)hash, 64 );
// cubehashDigest( &ctx.cube, (char*)hash );
break;
case 8:
sph_shavite512( &ctx.shavite, (char*)hash, size );
@@ -239,8 +237,6 @@ static inline void x11evo_hash( void *state, const void *input )
#else
update_final_echo( &ctx.echo, (char*)hash,
(const char*)hash, 512 );
// update_echo( &ctx.echo, (char*)hash, 512 );
// final_echo( &ctx.echo, (char*)hash );
#endif
break;
}
@@ -263,6 +259,13 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
swab32_array( endiandata, pdata, 20 );
int ntime = endiandata[17];
if ( ntime != s_ntime || s_seq == -1 )
{
evo_twisted_code( ntime, hashOrder );
s_ntime = ntime;
}
uint32_t hmask = 0xFFFFFFFF;
if ( Htarg > 0 )
{

avxdefs.h
View File

@@ -34,6 +34,140 @@ uint16_t v16[ 8];
uint8_t v8 [16];
} area128;
#if defined (__AVX2__)
// memset & memcpy replacements for vectorized data
// n = number of __m256i (32 bytes each)
inline void memset_zero_m256i( __m256i *dst, int n )
{
for ( int i = 0; i < n; i++ ) dst[i] = _mm256_setzero_si256();
}
inline void memset_m256i( __m256i *dst, const __m256i a, int n )
{
for ( int i = 0; i < n; i++ ) dst[i] = a;
}
// Optimized copying: the first variant that fits is usually best. If none
// of these fits there are __m128i versions below, or plain memcpy.
// Fixed size
// multi buffered copy for 64 bytes, the size of a cache line.
// minimum alignment is 32 bytes, optimum for cache is 64.
// src & dst are __m256i*
inline void mcpy64_m256i( __m256i* dst, const __m256i* src )
{
__m256i* dest = dst;
const __m256i* srce = src;
__m256i a = _mm256_load_si256( srce );
__m256i b = _mm256_load_si256( srce + 1 );
_mm256_store_si256( dest, a );
_mm256_store_si256( dest + 1, b );
}
inline void mcpy96_m256i( __m256i* dst, const __m256i* src )
{
__m256i* dest = dst;
const __m256i* srce = src;
__m256i a = _mm256_load_si256( srce );
__m256i b = _mm256_load_si256( srce + 1 );
_mm256_store_si256( dest, a );
__m256i c = _mm256_load_si256( srce + 2 );
_mm256_store_si256( dest + 1, b );
_mm256_store_si256( dest + 2, c );
}
inline void mcpy128_m256i( __m256i* dst, const __m256i* src )
{
__m256i* dest = dst;
const __m256i* srce = src;
__m256i a = _mm256_load_si256( srce );
__m256i b = _mm256_load_si256( srce + 1 );
__m256i c = _mm256_load_si256( srce + 2 );
_mm256_store_si256( dest , a );
__m256i d = _mm256_load_si256( srce + 3 );
_mm256_store_si256( dest + 1, b );
a = _mm256_load_si256( srce + 4 );
_mm256_store_si256( dest + 2, c );
b = _mm256_load_si256( srce + 5 );
_mm256_store_si256( dest + 3, d );
c = _mm256_load_si256( srce + 6 );
_mm256_store_si256( dest + 4, a );
d = _mm256_load_si256( srce + 7 );
_mm256_store_si256( dest + 5, b );
_mm256_store_si256( dest + 6, c );
_mm256_store_si256( dest + 7, d );
}
// Variable size
// Copy multiples of 64 bytes using quad buffering, interleaving the first
// read of the next line with the last write of the current line.
// n = number of __m256i (32 bytes each) to copy.
// minimum alignment: 32 bytes
// optimum alignment: 64 bytes (cache line size)
// minimum size.....: 128 bytes (n = 4)
// recommended size.: 256+ bytes
// minimum increment: 128 bytes (n must be a multiple of 4)
// Only the first load or store in a cache line triggers a memory access;
// the rest are cheap because they hit data already cached by the first.
// Priming the second cache line before dumping the first gives reads
// priority on the bus, so the cpu never stalls waiting for data while
// writes drain back to memory.
inline void mcpy_m256i_x4( __m256i *dst, const __m256i *src, const int n )
{
__m256i* dest = dst;
const __m256i* srce = src;
// preload 1 cache line to absorb startup latency
__m256i a = _mm256_load_si256( srce );
__m256i b = _mm256_load_si256( srce + 1 );
// start loading second line, queue while waiting
__m256i c = _mm256_load_si256( srce + 2 );
// start writing the first line as soon as data is available;
// the second line's read then has priority on the bus
_mm256_store_si256( dest, a );
__m256i d;
int i;
const int loops = n/4 - 1;
for ( i = 0; i < loops; i++ )
{
const int i4 = i*4;
const __m256i* si4 = (__m256i*)(srce + i4);
__m256i* di4 = (__m256i*)(dest + i4);
d = _mm256_load_si256( si4 + 3 );
_mm256_store_si256( di4 + 1, b );
// start loading next line
a = _mm256_load_si256( si4 + 4 );
_mm256_store_si256( di4 + 2, c );
b = _mm256_load_si256( si4 + 5 );
_mm256_store_si256( di4 + 3, d );
c = _mm256_load_si256( si4 + 6 );
// start writing next line
_mm256_store_si256( di4 + 4, a );
}
// finish last line
d = _mm256_load_si256( srce + n - 1 );
_mm256_store_si256( dest + n - 3, b );
_mm256_store_si256( dest + n - 2, c );
_mm256_store_si256( dest + n - 1, d );
}
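Illustrative use only (hypothetical buffers, not code from this commit): the routine expects at least 32 byte alignment and a length that is a multiple of four __m256i, so a 256 byte scratch area allocated with _mm_malloc is a natural fit.

#include <mm_malloc.h>

void example_scratch_copy()
{
   __m256i *src = (__m256i*)_mm_malloc( 256, 64 );   // 64 byte aligned
   __m256i *dst = (__m256i*)_mm_malloc( 256, 64 );
   memset_m256i( src, _mm256_set1_epi8( 0x5a ), 8 ); // fill 8 vectors
   mcpy_m256i_x4( dst, src, 8 );                     // n = 8, 256 bytes
   _mm_free( src );
   _mm_free( dst );
}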
// basic __m256i memcpy
inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n )
{
for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
}
// For cheating with pointer types
// p = any aligned pointer
@@ -63,8 +197,6 @@ uint8_t v8 [16];
// dst = a( b, a[127:0] ) ; mask == 1
//__m256i _mm256_inserti128_si256(__m256i a, __m128i b, const int mask);
#if defined __AVX2__
// Rotate bits in 4 uint64 (3 instructions)
// __m256i mm256_rotr_64( __m256i, int )
#define mm256_rotr_64( w, c ) \
@@ -141,6 +273,119 @@ uint8_t v8 [16];
#endif // AVX2
// memset & memcpy replacements for vectorized data
inline void memset_zero_m128i( __m128i *dst, int n )
{
for ( int i = 0; i < n; i++ ) dst[i] = _mm_setzero_si128();
}
inline void memset_m128i( __m128i *dst, const __m128i a, int n )
{
for ( int i = 0; i < n; i++ ) dst[i] = a;
}
// __m128i versions of optimized copying
inline void mcpy32_m128i( __m128i* dst, const __m128i* src )
{
__m128i* dest = dst;
const __m128i* srce = src;
// 2 loads, half a cache line
__m128i a = _mm_load_si128( srce );
__m128i b = _mm_load_si128( srce + 1 );
_mm_store_si128( dest, a );
_mm_store_si128( dest + 1, b );
}
inline void mcpy64_m128i( __m128i* dst, const __m128i* src )
{
__m128i* dest = dst;
const __m128i* srce = src;
// 4 loads fill a cache line
__m128i a = _mm_load_si128( srce );
__m128i b = _mm_load_si128( srce + 1 );
__m128i c = _mm_load_si128( srce + 2 );
__m128i d = _mm_load_si128( srce + 3 );
// need to store a before overwriting it
_mm_store_si128( dest, a );
a = _mm_load_si128( srce + 4 );
_mm_store_si128( dest + 1, b );
b = _mm_load_si128( srce + 5 );
_mm_store_si128( dest + 2, c );
c = _mm_load_si128( srce + 6 );
_mm_store_si128( dest + 3, d );
d = _mm_load_si128( srce + 7 );
_mm_store_si128( dest + 4, a );
_mm_store_si128( dest + 5, b );
_mm_store_si128( dest + 6, c );
_mm_store_si128( dest + 7, d );
}
// Variable length
// Copy multiples of 64 bytes using quad buffering.
// n = number of __m128i (16 bytes each) to copy.
// minimum alignment: 16 bytes
// optimum alignment: 64 bytes (cache line size)
// minimum size.....: 64 bytes (n = 4)
// recommended size.: 128+ bytes
// minimum increment: 64 bytes (n must be a multiple of 4)
inline void memcpy_m128i_x4( __m128i *dst, const __m128i *src, const int n )
{
// preload 1 cache line to absorb startup latency
__m128i a = _mm_load_si128( src );
__m128i b = _mm_load_si128( src + 1 );
__m128i c = _mm_load_si128( src + 2 );
__m128i d = _mm_load_si128( src + 3 );
int i;
const int loops = n/4 - 1;
__m128i* dst_n = (__m128i*)(dst + n);
for ( i = 0; i < loops; i++ )
{
const int i4 = i*4;
const __m128i* si4 = (__m128i*)(src + i4);
__m128i* di4 = (__m128i*)(dst + i4);
// need to store a before overwriting it
_mm_store_si128( di4, a );
a = _mm_load_si128( si4 + 4 );
_mm_store_si128( di4 + 1, b );
b = _mm_load_si128( si4 + 5 );
_mm_store_si128( di4 + 2, c );
c = _mm_load_si128( si4 + 6 );
_mm_store_si128( di4 + 3, d );
d = _mm_load_si128( si4 + 7 );
}
_mm_store_si128( dst_n - 4, a );
_mm_store_si128( dst_n - 3, b );
_mm_store_si128( dst_n - 2, c );
_mm_store_si128( dst_n - 1, d );
}
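A hypothetical caller-side helper (not part of avxdefs.h) showing how the size and increment rules above might drive the choice of copy routine, falling back to plain memcpy when the fast paths don't apply:

#include <string.h>

static inline void copy_aligned16( void *dst, const void *src, size_t bytes )
{
   if ( bytes >= 64 && ( bytes & 63 ) == 0 )
      memcpy_m128i_x4( (__m128i*)dst, (const __m128i*)src, bytes >> 4 );
   else if ( bytes == 32 )
      mcpy32_m128i( (__m128i*)dst, (const __m128i*)src );
   else
      memcpy( dst, src, bytes );
}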
// basic __m128i copy
inline void memcpy_m128i( __m128i *dst, const __m128i *src, int n )
{
for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
}
// For cheating with pointer types
// p = any aligned pointer
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))
// p = any aligned pointer, i = scaled array index
// returns p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
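A small illustration (hypothetical helper, not from this commit) of what the cast macros buy: a plain byte or uint64_t pointer can be treated as __m128i lanes without unions or intermediate copies, provided the buffer is 16 byte aligned.

// XOR the two 16 byte halves of a 32 byte buffer in place
static inline void fold32( void *buf )
{
   casti_m128i( buf, 0 ) = _mm_xor_si128( casti_m128i( buf, 0 ),
                                          casti_m128i( buf, 1 ) );
}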
// Rotate bits in 2 uint64
// __m128i mm_rotr_64( __m128i, int )
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.5.5])
AC_INIT([cpuminer-opt], [3.5.6])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -2156,79 +2156,85 @@ static void *stratum_thread(void *userdata )
}
}
while (!stratum.curl)
{
pthread_mutex_lock(&g_work_lock);
g_work_time = 0;
pthread_mutex_unlock(&g_work_lock);
restart_threads();
if (!stratum_connect(&stratum, stratum.url)
|| !stratum_subscribe(&stratum)
|| !stratum_authorize(&stratum, rpc_user, rpc_pass))
{
stratum_disconnect(&stratum);
if (opt_retries >= 0 && ++failures > opt_retries)
{
applog(LOG_ERR, "...terminating workio thread");
tq_push(thr_info[work_thr_id].q, NULL);
goto out;
}
if (!opt_benchmark)
applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
sleep(opt_fail_pause);
}
while ( !stratum.curl )
{
pthread_mutex_lock(&g_work_lock);
g_work_time = 0;
pthread_mutex_unlock(&g_work_lock);
restart_threads();
if (!stratum_connect(&stratum, stratum.url)
|| !stratum_subscribe(&stratum)
|| !stratum_authorize(&stratum, rpc_user, rpc_pass))
{
stratum_disconnect(&stratum);
if (opt_retries >= 0 && ++failures > opt_retries)
{
applog(LOG_ERR, "...terminating workio thread");
tq_push(thr_info[work_thr_id].q, NULL);
goto out;
}
if (!opt_benchmark)
applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
sleep(opt_fail_pause);
}
if (jsonrpc_2)
{
work_free(&g_work);
work_copy(&g_work, &stratum.work);
}
}
if (jsonrpc_2)
{
work_free(&g_work);
work_copy(&g_work, &stratum.work);
}
}
if (stratum.job.job_id &&
(!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
{
pthread_mutex_lock(&g_work_lock);
algo_gate.stratum_gen_work( &stratum, &g_work );
time(&g_work_time);
pthread_mutex_unlock(&g_work_lock);
if (stratum.job.job_id &&
(!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) )
{
pthread_mutex_lock(&g_work_lock);
algo_gate.stratum_gen_work( &stratum, &g_work );
time(&g_work_time);
pthread_mutex_unlock(&g_work_lock);
restart_threads();
if (stratum.job.clean || jsonrpc_2)
{
static uint32_t last_bloc_height;
if (!opt_quiet && last_bloc_height != stratum.bloc_height)
{
last_bloc_height = stratum.bloc_height;
if (net_diff > 0.)
applog(LOG_BLUE, "%s block %d, diff %.3f",
algo_names[opt_algo], stratum.bloc_height, net_diff);
else
applog(LOG_BLUE, "%s %s block %d", short_url,
algo_names[opt_algo], stratum.bloc_height);
}
restart_threads();
}
else if (opt_debug && !opt_quiet)
{
if (stratum.job.clean || jsonrpc_2)
{
static uint32_t last_bloc_height;
if ( last_bloc_height != stratum.bloc_height )
{
last_bloc_height = stratum.bloc_height;
if ( !opt_quiet )
{
if (net_diff > 0.)
applog(LOG_BLUE, "%s block %d, diff %.3f",
algo_names[opt_algo], stratum.bloc_height, net_diff);
else
applog(LOG_BLUE, "%s %s block %d", short_url,
algo_names[opt_algo], stratum.bloc_height);
}
}
restart_threads();
}
else if (opt_debug && !opt_quiet)
{
applog(LOG_BLUE, "%s asks job %d for block %d", short_url,
strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
}
}
}
}
if (!stratum_socket_full(&stratum, opt_timeout)) {
applog(LOG_ERR, "Stratum connection timeout");
s = NULL;
} else
s = stratum_recv_line(&stratum);
if (!s) {
stratum_disconnect(&stratum);
applog(LOG_ERR, "Stratum connection interrupted");
continue;
}
if (!stratum_handle_method(&stratum, s))
stratum_handle_response(s);
free(s);
if ( !stratum_socket_full( &stratum, opt_timeout ) )
{
applog(LOG_ERR, "Stratum connection timeout");
s = NULL;
}
else
s = stratum_recv_line(&stratum);
if ( !s )
{
stratum_disconnect(&stratum);
applog(LOG_ERR, "Stratum connection interrupted");
continue;
}
if (!stratum_handle_method(&stratum, s))
stratum_handle_response(s);
free(s);
}
out:
return NULL;