Jay D Dee
2019-12-25 01:26:26 -05:00
parent c65b0ff7a6
commit 241bc26767
35 changed files with 3036 additions and 643 deletions

View File

@@ -84,6 +84,7 @@ cpuminer_SOURCES = \
algo/cubehash/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/echo/sph_echo.c \
algo/echo/echo-hash-4way.c \
algo/echo/aes_ni/hash.c\
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \

View File

@@ -7,9 +7,11 @@ Security warning
----------------
Miner programs are often flagged as malware by antivirus programs. This is
a false positive, they are flagged simply because they are cryptocurrency
miners. The source code is open for anyone to inspect. If you don't trust
the software, don't use it.
usually a false positive; they are flagged simply because they are
cryptocurrency miners. However, some malware has been spread under the
cover that miners are known to trigger false positives, so always be on
the alert. The source code of cpuminer-opt is open for anyone to inspect.
If you don't trust the software, don't download it.
The cryptographic hashing code has been taken from trusted sources but has been
modified for speed at the expense of accepted security practices. This
@@ -33,6 +35,16 @@ not supported. FreeBSD YMMV.
Change Log
----------
v3.10.6
Added support for SSL stratum: stratum+tcps://
Added job id reporting again, but leaner, suppressed with --quiet.
AVX512 for x21s, x22i, lyra2z, allium.
Fixed share overflow warnings mining lbry with Ryzen (SHA).
v3.10.5
AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.

View File

@@ -463,6 +463,38 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
return 0;
}
// Update and final when inlen is a multiple of 64 bytes
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen )
{
__m256i *in = (__m256i*)input;
__m256i *buf = (__m256i*)S->buf;
while( inlen > BLAKE2S_BLOCKBYTES )
{
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
blake2s_8way_compress( S, buf );
S->buflen = 0;
in += ( BLAKE2S_BLOCKBYTES >> 2 );
}
// last block
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node ) S->f[1] = ~0U;
S->f[0] = ~0U;
blake2s_8way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m256i( out, i ) = S->h[ i ];
return 0;
}
#endif // __AVX2__
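
A minimal usage sketch for the new function (hypothetical caller; assumes the
input is 8x32-bit interleaved and, per the comment above, inlen is a multiple
of 64 bytes):

   blake2s_8way_state S;
   blake2s_8way_init( &S, 32 );                   // 32 byte (256 bit) digests
   blake2s_8way_full_blocks( &S, out, in, 128 );  // two full blocks per lane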

View File

@@ -95,8 +95,8 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
uint64_t inlen );
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
// const void *input, uint64_t inlen );
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen );
#endif

algo/echo/echo-hash-4way.c (new file, 559 lines)
View File

@@ -0,0 +1,559 @@
#if defined(__AVX512VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
#include "echo-hash-4way.h"
/*
#include <memory.h>
#include "miner.h"
#include "hash_api.h"
//#include "vperm.h"
#include <immintrin.h>
*/
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif
*/
// not used
/*
const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
*/
/*
MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
*/
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
// do these need to be reversed?
#define mul2mask \
m512_const4_32( 0x00001b00, 0, 0, 0 )
#define lsbmask m512_const1_32( 0x01010101 )
#define ECHO_SUBBYTES( state, i, j ) \
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
k1 = _mm512_add_epi32( k1, m512_one_32 )
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
{ \
const int j1 = ( j+1 ) & 3; \
const int j2 = ( j+2 ) & 3; \
const int j3 = ( j+3 ) & 3; \
s2 = _mm512_add_epi8( state1[ 0 ][ j ], state1[ 0 ][ j ] ); \
t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = s2; \
state2[ 1 ][ j ] = state1[ 0 ][ j ]; \
state2[ 2 ][ j ] = state1[ 0 ][ j ]; \
state2[ 3 ][ j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] ); \
s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 );\
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 2 ][ j2 ] ); \
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
_mm512_xor_si512( s2, state1[ 3 ][ j3 ] ) ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
} while(0)
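// Editor's note: the add/srli/and/shuffle/xor sequence in ECHO_MIXBYTES is
// the usual branch-free doubling in GF(2^8): x+x shifts each byte left, and
// the shuffle against mul2mask folds in the AES reduction byte 0x1b wherever
// the byte's high bit was set. Scalar sketch of what one byte lane computes
// (illustrative only, not part of the commit):
//
//   static inline unsigned char gf256_double( unsigned char x )
//   {  // xtime: multiply by 2 in GF(2^8) with the AES polynomial
//      return (unsigned char)( ( x << 1 ) ^ ( ( x >> 7 ) ? 0x1b : 0 ) );
//   }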
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
ECHO_SUBBYTES(_state, 2, 0);\
ECHO_SUBBYTES(_state, 3, 0);\
ECHO_SUBBYTES(_state, 0, 1);\
ECHO_SUBBYTES(_state, 1, 1);\
ECHO_SUBBYTES(_state, 2, 1);\
ECHO_SUBBYTES(_state, 3, 1);\
ECHO_SUBBYTES(_state, 0, 2);\
ECHO_SUBBYTES(_state, 1, 2);\
ECHO_SUBBYTES(_state, 2, 2);\
ECHO_SUBBYTES(_state, 3, 2);\
ECHO_SUBBYTES(_state, 0, 3);\
ECHO_SUBBYTES(_state, 1, 3);\
ECHO_SUBBYTES(_state, 2, 3);\
ECHO_SUBBYTES(_state, 3, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES(_state2, 0, 0);\
ECHO_SUBBYTES(_state2, 1, 0);\
ECHO_SUBBYTES(_state2, 2, 0);\
ECHO_SUBBYTES(_state2, 3, 0);\
ECHO_SUBBYTES(_state2, 0, 1);\
ECHO_SUBBYTES(_state2, 1, 1);\
ECHO_SUBBYTES(_state2, 2, 1);\
ECHO_SUBBYTES(_state2, 3, 1);\
ECHO_SUBBYTES(_state2, 0, 2);\
ECHO_SUBBYTES(_state2, 1, 2);\
ECHO_SUBBYTES(_state2, 2, 2);\
ECHO_SUBBYTES(_state2, 3, 2);\
ECHO_SUBBYTES(_state2, 0, 3);\
ECHO_SUBBYTES(_state2, 1, 3);\
ECHO_SUBBYTES(_state2, 2, 3);\
ECHO_SUBBYTES(_state2, 3, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
#define SAVESTATE(dst, src)\
dst[0][0] = src[0][0];\
dst[0][1] = src[0][1];\
dst[0][2] = src[0][2];\
dst[0][3] = src[0][3];\
dst[1][0] = src[1][0];\
dst[1][1] = src[1][1];\
dst[1][2] = src[1][2];\
dst[1][3] = src[1][3];\
dst[2][0] = src[2][0];\
dst[2][1] = src[2][1];\
dst[2][2] = src[2][2];\
dst[2][3] = src[2][3];\
dst[3][0] = src[3][0];\
dst[3][1] = src[3][1];\
dst[3][2] = src[3][2];\
dst[3][3] = src[3][3]
void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg,
unsigned int uBlockCount )
{
unsigned int r, b, i, j;
__m512i t1, t2, s2, k1;
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
// unroll
for ( i = 0; i < 4; i++ )
for ( j = 0; j < ctx->uHashSize / 256; j++ )
_state[ i ][ j ] = ctx->state[ i ][ j ];
for ( b = 0; b < uBlockCount; b++ )
{
ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 );
// load message, make aligned, remove loadu
for( j = ctx->uHashSize / 256; j < 4; j++ )
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ j ] = _mm512_loadu_si512(
(__m512i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
}
}
// save state
SAVESTATE( _statebackup, _state );
k1 = ctx->k;
for ( r = 0; r < ctx->uRounds / 2; r++ )
{
ECHO_ROUND_UNROLL2;
}
if ( ctx->uHashSize == 256 )
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 1 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 2 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 3 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 0 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 1 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 2 ] ) ;
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 3 ] );
}
}
else
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 2 ] );
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
_state[ i ][ 3 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 0 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 2 ] );
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
_statebackup[ i ][ 1 ] );
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
_statebackup[ i ][ 3 ] );
}
}
pmsg += ctx->uBlockLength;
}
SAVESTATE(ctx->state, _state);
}
int echo_4way_init( echo_4way_context *ctx, int nHashSize )
{
int i, j;
ctx->k = m512_zero;
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
switch( nHashSize )
{
case 256:
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = m512_const4_32( 0, 0, 0, 0x100 );
ctx->const1536 = m512_const4_32( 0, 0, 0, 0x600 );
break;
case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = m512_const4_32( 0, 0, 0, 0x200 );
ctx->const1536 = m512_const4_32( 0, 0, 0, 0x400 );
break;
default:
return BAD_HASHBITLEN;
}
for( i = 0; i < 4; i++ )
for( j = 0; j < nHashSize / 256; j++ )
ctx->state[ i ][ j ] = ctx->hashsize;
for( i = 0; i < 4; i++ )
for( j = nHashSize / 256; j < 4; j++ )
ctx->state[ i ][ j ] = m512_zero;
return SUCCESS;
}
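// Editor's note on the constants chosen above: hashsize is the digest length
// in bits (0x100 = 256, 0x200 = 512) and const1536 is the block length in
// bits (192 bytes -> 0x600 = 1536, 128 bytes -> 0x400 = 1024), each
// replicated into all four 128-bit lanes; ctx->k accumulates the number of
// bits compressed so far.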
int echo_4way_update( echo_4way_context *state, const BitSequence *data, DataLength databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
uByteLength = (unsigned int)(databitlen / 8);
if ( ( state->uBufferBytes + uByteLength ) >= state->uBlockLength )
{
if ( state->uBufferBytes != 0 )
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
// Process buffer
echo_4way_compress( state, state->buffer, 1 );
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
uByteLength -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = uByteLength / state->uBlockLength;
uRemainingBytes = uByteLength % state->uBlockLength;
if ( uBlockCount > 0 )
{
echo_4way_compress( state, data, uBlockCount );
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if ( uRemainingBytes > 0 )
{
memcpy( state->buffer, (void*)data, uRemainingBytes );
}
state->uBufferBytes = uRemainingBytes;
}
else
{
memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
state->uBufferBytes += uByteLength;
}
return 0;
}
int echo_4way_close( echo_4way_context *state, BitSequence *hashval )
{
__m512i remainingbits;
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
// Pad with 0x80
state->buffer[ state->uBufferBytes++ ] = 0x80;
// Enough buffer space for padding in this block?
if ( ( state->uBlockLength - state->uBufferBytes ) >= 18)
{
// Pad with zeros
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - ( state->uBufferBytes + 18 ) );
// Hash size
*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
= state->uHashSize;
// Processed bits
*( ( DataLength*)( state->buffer + state->uBlockLength - 16 ) )
= state->processed_bits;
*( ( DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
// Last block contains message bits?
if ( state->uBufferBytes == 1 )
{
state->k = _mm512_xor_si512( state->k, state->k );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
else
{
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
// Compress
echo_4way_compress( state, state->buffer, 1 );
}
else
{
// Fill with zero and compress
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - state->uBufferBytes );
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1 );
// Last block
memset( state->buffer, 0, state->uBlockLength - 18 );
// Hash size
*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
= state->uHashSize;
// Processed bits
*( (DataLength*)( state->buffer + state->uBlockLength - 16 ) )
= state->processed_bits;
*( (DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
// Compress the last block
state->k = _mm512_xor_si512(state->k, state->k);
state->k = _mm512_sub_epi64(state->k, state->const1536);
echo_4way_compress(state, state->buffer, 1);
}
// Store the hash value
_mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0 ] );
_mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0 ] );
if ( state->uHashSize == 512 )
{
_mm512_storeu_si512((__m512i*)hashval + 2, state->state[ 2 ][ 0 ]);
_mm512_storeu_si512((__m512i*)hashval + 3, state->state[ 3 ][ 0 ]);
}
return 0;
}
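// Editor's sketch of the final-block layout produced by the padding above,
// per lane: [ message bytes | 0x80 | zeros | hash size (2 bytes) |
// processed bits (8 bytes) | 8 zero bytes ], hence the requirement that at
// least 18 bytes remain free after the 0x80 pad byte.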
int echo_4way_update_close( echo_4way_context *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
uByteLength = (unsigned int)(databitlen / 8);
if ( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
{
if ( state->uBufferBytes != 0 )
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
// Process buffer
echo_4way_compress( state, state->buffer, 1 );
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
uByteLength -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = uByteLength / state->uBlockLength;
uRemainingBytes = uByteLength % state->uBlockLength;
if ( uBlockCount > 0 )
{
echo_4way_compress( state, data, uBlockCount );
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if ( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
else
{
memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
state->uBufferBytes += uByteLength;
}
__m512i remainingbits;
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
// Pad with 0x80
state->buffer[ state->uBufferBytes++ ] = 0x80;
// Enough buffer space for padding in this block?
if ( (state->uBlockLength - state->uBufferBytes) >= 18 )
{
// Pad with zeros
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - (state->uBufferBytes + 18) );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) )
= state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Last block contains message bits?
if( state->uBufferBytes == 1 )
{
state->k = _mm512_xor_si512( state->k, state->k );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
else
{
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
// Compress
echo_4way_compress( state, state->buffer, 1 );
}
else
{
// Fill with zero and compress
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - state->uBufferBytes );
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1 );
// Last block
memset( state->buffer, 0, state->uBlockLength - 18 );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Compress the last block
state->k = _mm512_xor_si512( state->k, state->k );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1 );
}
// Store the hash value
_mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0 ] );
_mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0 ] );
if ( state->uHashSize == 512 )
{
_mm512_storeu_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
_mm512_storeu_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
}
return 0;
}
#endif

View File

@@ -0,0 +1,36 @@
#if !defined(ECHO_HASH_4WAY_H__)
#define ECHO_HASH_4WAY_H__ 1
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
typedef struct
{
__m512i state[4][4];
__m512i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes
__m512i k;
__m512i hashsize;
__m512i const1536;
unsigned int uRounds;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
unsigned int processed_bits;
} echo_4way_context __attribute__ ((aligned (64)));
int echo_4way_init( echo_4way_context *state, int hashbitlen );
int echo_4way_update( echo_4way_context *state, const void *data,
                      unsigned int databitlen );
int echo_4way_close( echo_4way_context *state, void *hashval );
int echo_4way_update_close( echo_4way_context *state, void *hashval,
                            const void *data, int databitlen );
#endif
#endif
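
A minimal (hypothetical) call sequence for the 4-way API declared above; data
and hashval are 4x128-bit interleaved and update lengths are given in bits:

   echo_4way_context ctx;
   echo_4way_init( &ctx, 512 );              // 512-bit digests
   echo_4way_update( &ctx, data, 80 * 8 );   // 80 bytes per lane, in bits
   echo_4way_close( &ctx, hash );            // four interleaved 64-byte digests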

View File

@@ -9,6 +9,7 @@
//#ifndef NO_AES_NI
// Not to be confused with AVX512VAES
#define VAES
// #define VAVX
// #define VVPERM

View File

@@ -1,15 +1,206 @@
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#if defined (ALLIUM_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"
#if defined (ALLIUM_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
hashState_groestl256 groestl;
} allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_8way_init( &allium_8way_ctx.keccak );
cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &allium_8way_ctx.skein );
init_groestl256( &allium_8way_ctx.groestl, 32 );
return true;
}
void allium_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
/*
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
*/
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
/*
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
*/
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
}
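// Editor's note: LYRA2RE_2WAY processes two lanes per call, so the eight
// lanes above take four interleave/hash/deinterleave round trips. A
// hypothetical helper capturing the repeated pattern (sketch only, names
// taken from this file):
//
//   static inline void lyra2re_2way_pair( void *v, uint32_t *ha, uint32_t *hb )
//   {
//      intrlv_2x256( v, ha, hb, 256 );
//      LYRA2RE_2WAY( v, 32, v, 32, 1, 8, 8 );
//      dintrlv_2x256( ha, hb, v, 256 );
//   }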
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 8;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
allium_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (ALLIUM_4WAY)
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;

View File

@@ -129,7 +129,11 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_8WAY)
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
@@ -142,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};
@@ -170,7 +174,11 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
#if defined (ALLIUM_8WAY)
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
gate->scanhash = (void*)&scanhash_allium_8way;
gate->hash = (void*)&allium_8way_hash;
#elif defined (ALLIUM_4WAY)
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
@@ -179,7 +187,7 @@ bool register_allium_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};

View File

@@ -85,17 +85,25 @@ bool init_lyra2rev2_ctx();
/////////////////////////
#if defined(__SSE2__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
#define LYRA2Z_8WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1
#elif defined(__SSE2__)
#define LYRA2Z_4WAY 1
#endif
#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8
#if defined(LYRA2Z_8WAY)
#if defined(LYRA2Z_16WAY)
void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();
#elif defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
@@ -144,13 +152,22 @@ bool lyra2h_thread_init();
//////////////////////////////////
#if defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define ALLIUM_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY 1
#endif
bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_4WAY)
#if defined(ALLIUM_8WAY)
void allium_8way_hash( void *state, const void *input );
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_8way_ctx();
#elif defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,

View File

@@ -26,14 +26,17 @@
#include "lyra2.h"
#include "sponge.h"
// LYRA2RE 8 cols 8 rows used by lyea2re, allium, phi2, x22i, x25x.
// LYRA2RE 8 cols 8 rows used by lyra2re, allium, phi2, x22i, x25x,
// dynamic matrix allocation.
//
// LYRA2REV2 4 cols 4 rows used by lyra2rev2.
// LYRA2REV2 4 cols 4 rows used by lyra2rev2 and x21s, static matrix
// allocation.
//
// LYRA2REV3 4 cols 4 rows with an extra twist in calculating
// rowa in the wandering phase. Used by lyra2rev3.
// rowa in the wandering phase. Used by lyra2rev3. Static matrix
// allocation.
//
// LYRA2Z various cols & rows and supports 80 input. Used by lyra2z,
// LYRA2Z various cols & rows and supports 80 byte input. Used by lyra2z,
// lyra2z330, lyra2h,
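// Editor's sketch of the parameter tuples as used by callers in this commit
// (kLen and pwdlen are 32 bytes; the trailing arguments are timeCost, nRows,
// nCols):
//
//   LYRA2RE_2WAY( hash, 32, hash, 32, 1, 8, 8 );          // lyra2re, allium
//   LYRA2Z_2WAY( matrix, hash, 32, hash, 32, 8, 8, 8 );   // lyra2z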
@@ -60,7 +63,7 @@
*/
// For lyra2rev3.
// convert a simple offset to an index into interleaved data.
// convert a simple offset to an index into 2x4 u64 interleaved data.
// good for state and 4 row matrix.
// index = ( int( off / 4 ) * 2 ) + ( off mod 4 )
@@ -202,12 +205,8 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
// hard coded for 32 byte input as well as matrix size.
// Other required versions include 80 byte input and different block
// sizez
// sizes.
//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
// const void *pwd, const uint64_t pwdlen, const void *salt,
// const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
// const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[32];
@@ -335,159 +334,111 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
return 0;
}
#endif // AVX512
#if 0
//////////////////////////////////////////////////
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
int LYRA2Z_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//========================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
// int64_t i; //auxiliary iteration counter
uint64_t _ALIGN(256) state[32];
int64_t row = 2;
int64_t prev = 1;
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau;
int64_t step = 1;
int64_t window = 2;
int64_t gap = 1;
//=======================================================================/
//======= Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//==== Getting the password + salt + basil padded with 10*1 ============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
uint64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 *
sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
uint64_t *ptr = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &saltlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &timeCost, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nRows, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nCols, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy( ptr, pw, 2*pwdlen );   // password, both lanes interleaved
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen );   // salt = password, both lanes interleaved
ptr += pwdlen>>2;
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// now build the rest interleaving on the fly.
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
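// Editor's note: the basil is built directly in 2-way interleaved form,
// four u64 words per lane per group: ptr[0..3] and ptr[8..11] belong to
// lane 0, ptr[4..7] and ptr[12..15] to lane 1. ptr[10]/ptr[14] carry the
// leading 0x80 pad byte and ptr[11]/ptr[15] place the closing 0x01 in the
// last byte of the block.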
//=================== Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
// if (state == NULL) {
// return -1;
// }
// initState( state );
uint64_t *ptrWord = wholeMatrix;
//============================== Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
uint64_t *ptrWord = wholeMatrix;
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput,
BLOCK_LEN_BLAKE2_SAFE_INT64 );
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
BLOCK_LEN_BLAKE2_SAFE_INT64 );
/*
for ( i = 0; i < nBlocksInput; i++ )
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
//Initializes M[0] and M[1]
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
//Checks if all rows in the window where visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
&wholeMatrix[ 2* row*ROW_LEN_INT64],
nCols );
} while (row < nRows);
rowa0 = (rowa0 + step) & (window - 1);
prev = row;
row++;
//======================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for ( tau = 1; tau <= timeCost; tau++ )
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
if ( rowa0 == 0 )
{
step = window + gap;
window *= 2;
gap = -gap;
}
} while ( row < nRows );
row = 0;
for ( tau = 1; tau <= timeCost; tau++ )
{
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
do {
//Selects a pseudorandom index row*
//----------------------------------------------------------------------
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-----------------------------------------------------------------
do
{
rowa0 = state[ 0 ] % nRows;
rowa1 = state[ 4 ] % nRows;
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//---------------------------------------------------------------
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//--------------------------------------------------------------------
row = (row + step) % nRows;
} while (row != 0);
}
}
//========================= Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
//Squeezes the key
squeeze( state, K, kLen );
//Squeezes the key
squeeze_2way( state, K, (unsigned int) kLen );
return 0;
return 0;
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
////////////////////////////////////////////////////
// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
@@ -495,7 +446,7 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
uint64_t _ALIGN(256) state[32];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa0 = 0;
@@ -517,25 +468,15 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
if (wholeMatrix == NULL)
return -1;
#if defined(__AVX2__)
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__SSE2__)
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
memset( wholeMatrix, 0, i );
#endif
memset_zero_512( (__m512i*)wholeMatrix, i>>5 );
uint64_t *ptrWord = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
@@ -558,66 +499,8 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
/*
byte *ptrByte = (byte*) wholeMatrix;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
// - (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
*/
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

View File

@@ -62,6 +62,8 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
@@ -69,6 +71,9 @@ int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
#endif
#endif /* LYRA2_H_ */

View File

@@ -1,13 +1,240 @@
#include "lyra2-gate.h"
#ifdef LYRA2Z_4WAY
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
#if defined(LYRA2Z_16WAY)
__thread uint64_t* lyra2z_16way_matrix;
bool lyra2z_16way_thread_init()
{
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_16way_context l2z_16way_blake_mid;
void lyra2z_16way_midstate( const void* input )
{
blake256_16way_init( &l2z_16way_blake_mid );
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
}
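// Editor's note: the first 64 of the 80 header bytes are nonce-independent,
// so the blake256 midstate above is computed once per work item; each scan
// iteration then hashes only the final 16 bytes (see lyra2z_16way_hash).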
void lyra2z_16way_hash( void *state, const void *input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
blake256_16way_close( &ctx_blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
memcpy( state+256, hash8, 32 );
memcpy( state+288, hash9, 32 );
memcpy( state+320, hash10, 32 );
memcpy( state+352, hash11, 32 );
memcpy( state+384, hash12, 32 );
memcpy( state+416, hash13, 32 );
memcpy( state+448, hash14, 32 );
memcpy( state+480, hash15, 32 );
}
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
lyra2z_16way_midstate( vdata );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n ) );
lyra2z_16way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 16; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(LYRA2Z_8WAY)
__thread uint64_t* lyra2z_8way_matrix;
bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(LYRA2Z_4WAY)
__thread uint64_t* lyra2z_4way_matrix;
bool lyra2z_4way_thread_init()
@@ -85,100 +312,3 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
#endif
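// Illustration only, not part of the commit: the 8-way lyra2z hash above
// stores eight contiguous 32-byte lane hashes (the memcpy sequence), so
// the scanhash loop addresses lane i's result as hash + (i << 3) and
// pre-tests its word 7 against Htarg before the full 256-bit fulltest().
// lane_hash32() is a hypothetical helper sketching that layout.
#include <stdint.h>
static inline const uint32_t *lane_hash32( const uint32_t *hash, int lane )
{
   return hash + (lane << 3);   // 8 x 32-bit words per 256-bit lane hash
}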

View File

@@ -10,7 +10,140 @@
#define LBRY_MIDSTATE 64
#define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE)
#if defined(LBRY_16WAY)
static __thread sha256_16way_context sha256_16w_mid;
void lbry_16way_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) vhashA[16<<4];
uint32_t _ALIGN(64) vhashB[16<<4];
uint32_t _ALIGN(64) vhashC[16<<4];
uint32_t _ALIGN(64) h0[32];
uint32_t _ALIGN(64) h1[32];
uint32_t _ALIGN(64) h2[32];
uint32_t _ALIGN(64) h3[32];
uint32_t _ALIGN(64) h4[32];
uint32_t _ALIGN(64) h5[32];
uint32_t _ALIGN(64) h6[32];
uint32_t _ALIGN(64) h7[32];
uint32_t _ALIGN(64) h8[32];
uint32_t _ALIGN(64) h9[32];
uint32_t _ALIGN(64) h10[32];
uint32_t _ALIGN(64) h11[32];
uint32_t _ALIGN(64) h12[32];
uint32_t _ALIGN(64) h13[32];
uint32_t _ALIGN(64) h14[32];
uint32_t _ALIGN(64) h15[32];
sha256_16way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_8way_context ctx_sha512;
ripemd160_16way_context ctx_ripemd;
memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 8-way 64 bit twice.
dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
h8, h9, h10, h11, h12, h13, h14, h15, vhashA, 256 );
intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashA, 32 );
sha512_8way_close( &ctx_sha512, vhashA );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashB, 32 );
sha512_8way_close( &ctx_sha512, vhashB );
// back to 16-way 32 bit
dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 );
intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
h8, h9, h10, h11, h12, h13, h14, h15, 512 );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA, 32 );
ripemd160_16way_close( &ctx_ripemd, vhashB );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 );
ripemd160_16way_close( &ctx_ripemd, vhashC );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashB, 20 );
sha256_16way_update( &ctx_sha256, vhashC, 20 );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, output );
}
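// Scalar reference for the chain vectorized above (sketch only; the
// primitives below stand for generic one-shot sha256/sha512/ripemd160
// implementations, not functions from this codebase):
//
//    sha256d( header[112] )    -> h32[32]   // double sha256
//    sha512( h32 )             -> h64[64]
//    ripemd160( h64[ 0..31] )  -> rA[20]
//    ripemd160( h64[32..63] )  -> rB[20]
//    sha256d( rA || rB )       -> final[32]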
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[32*16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
uint32_t edata[32] __attribute__ ((aligned (64)));
__m512i *noncev = (__m512i*)vdata + 27; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
sha256_16way_init( &sha256_16w_mid );
sha256_16way( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
do
{
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n ) );
lbry_16way_hash( hash, vdata );
for ( int i = 0; i < 16; i++ )
if ( unlikely( hash7[ i ] <= Htarg ) )
{
// deinterleave hash for lane
extr_lane_16x32( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[27] = n + i;
submit_lane_solution( work, lane_hash, mythr, i );
}
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(LBRY_8WAY)
static __thread sha256_8way_context sha256_8w_mid;
@@ -91,11 +224,6 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 27; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
@@ -106,33 +234,30 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
sha256_8way_init( &sha256_8w_mid );
sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32(
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
lbry_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( unlikely( hash7[ i ] <= Htarg ) )
{
// deinterleave hash for lane
extr_lane_8x32( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[27] = n + i;
submit_lane_solution( work, lane_hash, mythr, i );
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
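// Illustration only: hash7[] collects word 7 (the most significant 32
// bits of each lane's little-endian 256-bit hash), so hash7[i] <= Htarg
// is a cheap necessary condition checked before extracting the lane and
// running the full comparison. maybe_solution() is a hypothetical scalar
// sketch of that filter.
#include <stdint.h>
#include <stdbool.h>
static inline bool maybe_solution( const uint32_t hash[8],
                                   const uint32_t target[8] )
{
   return hash[7] <= target[7];   // filter only; fulltest() decides
}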

View File

@@ -98,16 +98,23 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT;
#elif defined (LBRY_8WAY)
gate->scanhash = (void*)&scanhash_lbry_8way;
gate->hash = (void*)&lbry_8way_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT;
#elif defined (LBRY_4WAY)
gate->scanhash = (void*)&scanhash_lbry_4way;
gate->hash = (void*)&lbry_4way_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT;
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#endif
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;

View File

@@ -4,11 +4,20 @@
#include "algo-gate-api.h"
#include <stdint.h>
// 16 way needs sha256 16 way
//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// #define LBRY_16WAY
#if defined(__AVX2__)
#define LBRY_8WAY
#endif
/*
#if !defined(__SHA__)
#if defined(__AVX2__)
#define LBRY_8WAY
#endif
#endif
*/
#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26
@@ -18,7 +27,12 @@
bool register_lbry_algo( algo_gate_t* gate );
#if defined(LBRY_16WAY)
void lbry_16way_hash( void *state, const void *input );
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(LBRY_8WAY)
void lbry_8way_hash( void *state, const void *input );
int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,

View File

@@ -80,9 +80,6 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
// we need bigendian data...
swab32_array( endiandata, pdata, 32 );
for (int m=0; m < sizeof(masks)/sizeof(masks[0]); m++) {
if (Htarg <= htmax[m]) {
uint32_t mask = masks[m];
@@ -90,23 +87,11 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
pdata[27] = ++n;
be32enc(&endiandata[27], n);
lbry_hash(hash64, &endiandata);
if ( !(hash64[7] & mask) && fulltest(hash64, ptarget) ) {
pdata[27] = n;
submit_solution( work, hash64, mythr );
}
} while ( (n < max_nonce - 8) && !work_restart[thr_id].restart);
break;
}
}

View File

@@ -623,3 +623,303 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// RIPEMD-160 16 way
#define F16W_1(x, y, z) \
_mm512_xor_si512( _mm512_xor_si512( x, y ), z )
#define F16W_2(x, y, z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( y, z ), x ), z )
#define F16W_3(x, y, z) \
_mm512_xor_si512( _mm512_or_si512( x, mm512_not( y ) ), z )
#define F16W_4(x, y, z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( x, y ), z ), y )
#define F16W_5(x, y, z) \
_mm512_xor_si512( x, _mm512_or_si512( y, mm512_not( z ) ) )
#define RR_16W(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
_mm512_add_epi32( a, f( b ,c, d ) ), r ), \
m512_const1_64( k ) ), s ), e ); \
c = mm512_rol_32( c, 10 );\
} while (0)
#define ROUND1_16W(a, b, c, d, e, f, s, r, k) \
RR_16W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
#define ROUND2_16W(a, b, c, d, e, f, s, r, k) \
RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_16way_round( ripemd160_16way_context *sc )
{
const __m512i *in = (__m512i*)sc->buf;
__m512i *h = (__m512i*)sc->val;
register __m512i A1, B1, C1, D1, E1;
register __m512i A2, B2, C2, D2, E2;
__m512i tmp;
A1 = A2 = h[0];
B1 = B2 = h[1];
C1 = C2 = h[2];
D1 = D2 = h[3];
E1 = E2 = h[4];
ROUND1_16W( A, B, C, D, E, F16W_1, 11, in[ 0], 1 );
ROUND1_16W( E, A, B, C, D, F16W_1, 14, in[ 1], 1 );
ROUND1_16W( D, E, A, B, C, F16W_1, 15, in[ 2], 1 );
ROUND1_16W( C, D, E, A, B, F16W_1, 12, in[ 3], 1 );
ROUND1_16W( B, C, D, E, A, F16W_1, 5, in[ 4], 1 );
ROUND1_16W( A, B, C, D, E, F16W_1, 8, in[ 5], 1 );
ROUND1_16W( E, A, B, C, D, F16W_1, 7, in[ 6], 1 );
ROUND1_16W( D, E, A, B, C, F16W_1, 9, in[ 7], 1 );
ROUND1_16W( C, D, E, A, B, F16W_1, 11, in[ 8], 1 );
ROUND1_16W( B, C, D, E, A, F16W_1, 13, in[ 9], 1 );
ROUND1_16W( A, B, C, D, E, F16W_1, 14, in[10], 1 );
ROUND1_16W( E, A, B, C, D, F16W_1, 15, in[11], 1 );
ROUND1_16W( D, E, A, B, C, F16W_1, 6, in[12], 1 );
ROUND1_16W( C, D, E, A, B, F16W_1, 7, in[13], 1 );
ROUND1_16W( B, C, D, E, A, F16W_1, 9, in[14], 1 );
ROUND1_16W( A, B, C, D, E, F16W_1, 8, in[15], 1 );
ROUND1_16W( E, A, B, C, D, F16W_2, 7, in[ 7], 2 );
ROUND1_16W( D, E, A, B, C, F16W_2, 6, in[ 4], 2 );
ROUND1_16W( C, D, E, A, B, F16W_2, 8, in[13], 2 );
ROUND1_16W( B, C, D, E, A, F16W_2, 13, in[ 1], 2 );
ROUND1_16W( A, B, C, D, E, F16W_2, 11, in[10], 2 );
ROUND1_16W( E, A, B, C, D, F16W_2, 9, in[ 6], 2 );
ROUND1_16W( D, E, A, B, C, F16W_2, 7, in[15], 2 );
ROUND1_16W( C, D, E, A, B, F16W_2, 15, in[ 3], 2 );
ROUND1_16W( B, C, D, E, A, F16W_2, 7, in[12], 2 );
ROUND1_16W( A, B, C, D, E, F16W_2, 12, in[ 0], 2 );
ROUND1_16W( E, A, B, C, D, F16W_2, 15, in[ 9], 2 );
ROUND1_16W( D, E, A, B, C, F16W_2, 9, in[ 5], 2 );
ROUND1_16W( C, D, E, A, B, F16W_2, 11, in[ 2], 2 );
ROUND1_16W( B, C, D, E, A, F16W_2, 7, in[14], 2 );
ROUND1_16W( A, B, C, D, E, F16W_2, 13, in[11], 2 );
ROUND1_16W( E, A, B, C, D, F16W_2, 12, in[ 8], 2 );
ROUND1_16W( D, E, A, B, C, F16W_3, 11, in[ 3], 3 );
ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[10], 3 );
ROUND1_16W( B, C, D, E, A, F16W_3, 6, in[14], 3 );
ROUND1_16W( A, B, C, D, E, F16W_3, 7, in[ 4], 3 );
ROUND1_16W( E, A, B, C, D, F16W_3, 14, in[ 9], 3 );
ROUND1_16W( D, E, A, B, C, F16W_3, 9, in[15], 3 );
ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[ 8], 3 );
ROUND1_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
ROUND1_16W( A, B, C, D, E, F16W_3, 14, in[ 2], 3 );
ROUND1_16W( E, A, B, C, D, F16W_3, 8, in[ 7], 3 );
ROUND1_16W( D, E, A, B, C, F16W_3, 13, in[ 0], 3 );
ROUND1_16W( C, D, E, A, B, F16W_3, 6, in[ 6], 3 );
ROUND1_16W( B, C, D, E, A, F16W_3, 5, in[13], 3 );
ROUND1_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
ROUND1_16W( E, A, B, C, D, F16W_3, 7, in[ 5], 3 );
ROUND1_16W( D, E, A, B, C, F16W_3, 5, in[12], 3 );
ROUND1_16W( C, D, E, A, B, F16W_4, 11, in[ 1], 4 );
ROUND1_16W( B, C, D, E, A, F16W_4, 12, in[ 9], 4 );
ROUND1_16W( A, B, C, D, E, F16W_4, 14, in[11], 4 );
ROUND1_16W( E, A, B, C, D, F16W_4, 15, in[10], 4 );
ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 0], 4 );
ROUND1_16W( C, D, E, A, B, F16W_4, 15, in[ 8], 4 );
ROUND1_16W( B, C, D, E, A, F16W_4, 9, in[12], 4 );
ROUND1_16W( A, B, C, D, E, F16W_4, 8, in[ 4], 4 );
ROUND1_16W( E, A, B, C, D, F16W_4, 9, in[13], 4 );
ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 3], 4 );
ROUND1_16W( C, D, E, A, B, F16W_4, 5, in[ 7], 4 );
ROUND1_16W( B, C, D, E, A, F16W_4, 6, in[15], 4 );
ROUND1_16W( A, B, C, D, E, F16W_4, 8, in[14], 4 );
ROUND1_16W( E, A, B, C, D, F16W_4, 6, in[ 5], 4 );
ROUND1_16W( D, E, A, B, C, F16W_4, 5, in[ 6], 4 );
ROUND1_16W( C, D, E, A, B, F16W_4, 12, in[ 2], 4 );
ROUND1_16W( B, C, D, E, A, F16W_5, 9, in[ 4], 5 );
ROUND1_16W( A, B, C, D, E, F16W_5, 15, in[ 0], 5 );
ROUND1_16W( E, A, B, C, D, F16W_5, 5, in[ 5], 5 );
ROUND1_16W( D, E, A, B, C, F16W_5, 11, in[ 9], 5 );
ROUND1_16W( C, D, E, A, B, F16W_5, 6, in[ 7], 5 );
ROUND1_16W( B, C, D, E, A, F16W_5, 8, in[12], 5 );
ROUND1_16W( A, B, C, D, E, F16W_5, 13, in[ 2], 5 );
ROUND1_16W( E, A, B, C, D, F16W_5, 12, in[10], 5 );
ROUND1_16W( D, E, A, B, C, F16W_5, 5, in[14], 5 );
ROUND1_16W( C, D, E, A, B, F16W_5, 12, in[ 1], 5 );
ROUND1_16W( B, C, D, E, A, F16W_5, 13, in[ 3], 5 );
ROUND1_16W( A, B, C, D, E, F16W_5, 14, in[ 8], 5 );
ROUND1_16W( E, A, B, C, D, F16W_5, 11, in[11], 5 );
ROUND1_16W( D, E, A, B, C, F16W_5, 8, in[ 6], 5 );
ROUND1_16W( C, D, E, A, B, F16W_5, 5, in[15], 5 );
ROUND1_16W( B, C, D, E, A, F16W_5, 6, in[13], 5 );
ROUND2_16W( A, B, C, D, E, F16W_5, 8, in[ 5], 1 );
ROUND2_16W( E, A, B, C, D, F16W_5, 9, in[14], 1 );
ROUND2_16W( D, E, A, B, C, F16W_5, 9, in[ 7], 1 );
ROUND2_16W( C, D, E, A, B, F16W_5, 11, in[ 0], 1 );
ROUND2_16W( B, C, D, E, A, F16W_5, 13, in[ 9], 1 );
ROUND2_16W( A, B, C, D, E, F16W_5, 15, in[ 2], 1 );
ROUND2_16W( E, A, B, C, D, F16W_5, 15, in[11], 1 );
ROUND2_16W( D, E, A, B, C, F16W_5, 5, in[ 4], 1 );
ROUND2_16W( C, D, E, A, B, F16W_5, 7, in[13], 1 );
ROUND2_16W( B, C, D, E, A, F16W_5, 7, in[ 6], 1 );
ROUND2_16W( A, B, C, D, E, F16W_5, 8, in[15], 1 );
ROUND2_16W( E, A, B, C, D, F16W_5, 11, in[ 8], 1 );
ROUND2_16W( D, E, A, B, C, F16W_5, 14, in[ 1], 1 );
ROUND2_16W( C, D, E, A, B, F16W_5, 14, in[10], 1 );
ROUND2_16W( B, C, D, E, A, F16W_5, 12, in[ 3], 1 );
ROUND2_16W( A, B, C, D, E, F16W_5, 6, in[12], 1 );
ROUND2_16W( E, A, B, C, D, F16W_4, 9, in[ 6], 2 );
ROUND2_16W( D, E, A, B, C, F16W_4, 13, in[11], 2 );
ROUND2_16W( C, D, E, A, B, F16W_4, 15, in[ 3], 2 );
ROUND2_16W( B, C, D, E, A, F16W_4, 7, in[ 7], 2 );
ROUND2_16W( A, B, C, D, E, F16W_4, 12, in[ 0], 2 );
ROUND2_16W( E, A, B, C, D, F16W_4, 8, in[13], 2 );
ROUND2_16W( D, E, A, B, C, F16W_4, 9, in[ 5], 2 );
ROUND2_16W( C, D, E, A, B, F16W_4, 11, in[10], 2 );
ROUND2_16W( B, C, D, E, A, F16W_4, 7, in[14], 2 );
ROUND2_16W( A, B, C, D, E, F16W_4, 7, in[15], 2 );
ROUND2_16W( E, A, B, C, D, F16W_4, 12, in[ 8], 2 );
ROUND2_16W( D, E, A, B, C, F16W_4, 7, in[12], 2 );
ROUND2_16W( C, D, E, A, B, F16W_4, 6, in[ 4], 2 );
ROUND2_16W( B, C, D, E, A, F16W_4, 15, in[ 9], 2 );
ROUND2_16W( A, B, C, D, E, F16W_4, 13, in[ 1], 2 );
ROUND2_16W( E, A, B, C, D, F16W_4, 11, in[ 2], 2 );
ROUND2_16W( D, E, A, B, C, F16W_3, 9, in[15], 3 );
ROUND2_16W( C, D, E, A, B, F16W_3, 7, in[ 5], 3 );
ROUND2_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
ROUND2_16W( A, B, C, D, E, F16W_3, 11, in[ 3], 3 );
ROUND2_16W( E, A, B, C, D, F16W_3, 8, in[ 7], 3 );
ROUND2_16W( D, E, A, B, C, F16W_3, 6, in[14], 3 );
ROUND2_16W( C, D, E, A, B, F16W_3, 6, in[ 6], 3 );
ROUND2_16W( B, C, D, E, A, F16W_3, 14, in[ 9], 3 );
ROUND2_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
ROUND2_16W( E, A, B, C, D, F16W_3, 13, in[ 8], 3 );
ROUND2_16W( D, E, A, B, C, F16W_3, 5, in[12], 3 );
ROUND2_16W( C, D, E, A, B, F16W_3, 14, in[ 2], 3 );
ROUND2_16W( B, C, D, E, A, F16W_3, 13, in[10], 3 );
ROUND2_16W( A, B, C, D, E, F16W_3, 13, in[ 0], 3 );
ROUND2_16W( E, A, B, C, D, F16W_3, 7, in[ 4], 3 );
ROUND2_16W( D, E, A, B, C, F16W_3, 5, in[13], 3 );
ROUND2_16W( C, D, E, A, B, F16W_2, 15, in[ 8], 4 );
ROUND2_16W( B, C, D, E, A, F16W_2, 5, in[ 6], 4 );
ROUND2_16W( A, B, C, D, E, F16W_2, 8, in[ 4], 4 );
ROUND2_16W( E, A, B, C, D, F16W_2, 11, in[ 1], 4 );
ROUND2_16W( D, E, A, B, C, F16W_2, 14, in[ 3], 4 );
ROUND2_16W( C, D, E, A, B, F16W_2, 14, in[11], 4 );
ROUND2_16W( B, C, D, E, A, F16W_2, 6, in[15], 4 );
ROUND2_16W( A, B, C, D, E, F16W_2, 14, in[ 0], 4 );
ROUND2_16W( E, A, B, C, D, F16W_2, 6, in[ 5], 4 );
ROUND2_16W( D, E, A, B, C, F16W_2, 9, in[12], 4 );
ROUND2_16W( C, D, E, A, B, F16W_2, 12, in[ 2], 4 );
ROUND2_16W( B, C, D, E, A, F16W_2, 9, in[13], 4 );
ROUND2_16W( A, B, C, D, E, F16W_2, 12, in[ 9], 4 );
ROUND2_16W( E, A, B, C, D, F16W_2, 5, in[ 7], 4 );
ROUND2_16W( D, E, A, B, C, F16W_2, 15, in[10], 4 );
ROUND2_16W( C, D, E, A, B, F16W_2, 8, in[14], 4 );
ROUND2_16W( B, C, D, E, A, F16W_1, 8, in[12], 5 );
ROUND2_16W( A, B, C, D, E, F16W_1, 5, in[15], 5 );
ROUND2_16W( E, A, B, C, D, F16W_1, 12, in[10], 5 );
ROUND2_16W( D, E, A, B, C, F16W_1, 9, in[ 4], 5 );
ROUND2_16W( C, D, E, A, B, F16W_1, 12, in[ 1], 5 );
ROUND2_16W( B, C, D, E, A, F16W_1, 5, in[ 5], 5 );
ROUND2_16W( A, B, C, D, E, F16W_1, 14, in[ 8], 5 );
ROUND2_16W( E, A, B, C, D, F16W_1, 6, in[ 7], 5 );
ROUND2_16W( D, E, A, B, C, F16W_1, 8, in[ 6], 5 );
ROUND2_16W( C, D, E, A, B, F16W_1, 13, in[ 2], 5 );
ROUND2_16W( B, C, D, E, A, F16W_1, 6, in[13], 5 );
ROUND2_16W( A, B, C, D, E, F16W_1, 5, in[14], 5 );
ROUND2_16W( E, A, B, C, D, F16W_1, 15, in[ 0], 5 );
ROUND2_16W( D, E, A, B, C, F16W_1, 13, in[ 3], 5 );
ROUND2_16W( C, D, E, A, B, F16W_1, 11, in[ 9], 5 );
ROUND2_16W( B, C, D, E, A, F16W_1, 11, in[11], 5 );
tmp = _mm512_add_epi32( _mm512_add_epi32( h[1], C1 ), D2 );
h[1] = _mm512_add_epi32( _mm512_add_epi32( h[2], D1 ), E2 );
h[2] = _mm512_add_epi32( _mm512_add_epi32( h[3], E1 ), A2 );
h[3] = _mm512_add_epi32( _mm512_add_epi32( h[4], A1 ), B2 );
h[4] = _mm512_add_epi32( _mm512_add_epi32( h[0], B1 ), C2 );
h[0] = tmp;
}
void ripemd160_16way_init( ripemd160_16way_context *sc )
{
sc->val[0] = m512_const1_64( 0x6745230167452301 );
sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m512_const1_64( 0x1032547610325476 );
sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}
void ripemd160_16way( ripemd160_16way_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
size_t ptr;
const int block_size = 64;
ptr = (unsigned)sc->count_low & (block_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = block_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
if ( ptr == block_size )
{
ripemd160_16way_round( sc );
ptr = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
}
}
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
const int block_size = 64;
const int pad = block_size - 8;
ptr = (unsigned)sc->count_low & ( block_size - 1U);
sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_16way_round( sc );
memset_zero_512( sc->buf, pad>>2 );
}
else
memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad>>2 ] = _mm512_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
ripemd160_16way_round( sc );
for (u = 0; u < 5; u ++)
casti_m512i( dst, u ) = sc->val[u];
}
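// Illustration only: the context keeps the message length as two 32-bit
// byte counters; close() appends the 64-bit bit length, split into the
// low and high words computed above ( low = count_low << 3,
// high = (count_high << 3) | (count_low >> 29) ). md_bit_length() is a
// hypothetical scalar equivalent.
#include <stdint.h>
static inline uint64_t md_bit_length( uint32_t count_high, uint32_t count_low )
{
   return ( ( (uint64_t)count_high << 32 ) | count_low ) << 3;
}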
#endif // AVX512

View File

@@ -32,7 +32,21 @@ void ripemd160_8way_init( ripemd160_8way_context *sc );
void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct
{
__m512i buf[64>>2];
__m512i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_16way_context;
void ripemd160_16way_init( ripemd160_16way_context *sc );
void ripemd160_16way( ripemd160_16way_context *sc, const void *data,
size_t len );
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
#endif // AVX512
#endif // __AVX2__
#endif // __SSE4_2__
#endif // RIPEMD_HASH_4WAY_H__

View File

@@ -74,7 +74,8 @@ typedef struct {
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
#define sha256_8way sha256_8way_update
void sha256_8way_close( sha256_8way_context *sc, void *dst );
//#define SPH_SIZE_sha512 512

View File

@@ -100,9 +100,20 @@ c512( sph_shavite_big_context *sc, const void *msg )
p3 = h[3];
// round
// working proof of concept
/*
__m512i K = m512_const1_128( m[0] );
__m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
X = _mm512_aesenc_epi128( X, m512_zero );
k00 = _mm512_castsi512_si128( K );
x = _mm512_castsi512_si128( X );
*/
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );

View File

@@ -52,10 +52,10 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16RV2_8WAY)
gate->scanhash = (void*)&scanhash_x16rv2_8way;
gate->hash = (void*)&x16rv2_8way_hash;
#elif defined (X16RV2_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#else
@@ -205,10 +205,10 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16rt_8way_hash;
#elif defined (X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
@@ -222,10 +222,10 @@ bool register_x16rt_algo( algo_gate_t* gate )
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16rt_8way_hash;
#elif defined (X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
@@ -258,16 +258,23 @@ bool register_hex_algo( algo_gate_t* gate )
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X21S_8WAY)
gate->scanhash = (void*)&scanhash_x21s_8way;
gate->hash = (void*)&x21s_8way_hash;
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
#elif defined (X21S_4WAY)
gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#else
gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#endif
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;

View File

@@ -12,6 +12,24 @@
#define X16R_4WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16RV2_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RV2_4WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16RT_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RT_4WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X21S_4WAY 1
#endif
enum x16r_Algo {
BLAKE = 0,
BMW,
@@ -46,18 +64,13 @@ bool register_x16rt_algo( algo_gate_t* gate );
bool register_hex_algo( algo_gate_t* gate );
bool register_x21s_algo( algo_gate_t* gate );
// x16r, x16s
#if defined(X16R_8WAY)
void x16r_8way_hash( void *state, const void *input );
int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
@@ -65,31 +78,65 @@ void x16r_4way_hash( void *state, const void *input );
int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x16r_hash( void *state, const void *input );
int scanhash_x16r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
// x16Rv2
#if defined(X16RV2_8WAY)
void x16rv2_8way_hash( void *state, const void *input );
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RV2_4WAY)
void x16rv2_4way_hash( void *state, const void *input );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x16rv2_hash( void *state, const void *input );
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
// x16rt, veil
#if defined(X16RT_8WAY)
void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RT_4WAY)
void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x16rt_hash( void *state, const void *input );
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
// x21s
#if defined(X21S_8WAY)
void x21s_8way_hash( void *state, const void *input );
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_8way_thread_init();
#elif defined(X21S_4WAY)
void x21s_4way_hash( void *state, const void *input );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,

View File

@@ -24,7 +24,7 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16RT_8WAY)
union _x16rt_8way_context_overlay
{
@@ -407,7 +407,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RT_4WAY)
union _x16rt_4way_context_overlay
{

View File

@@ -31,7 +31,7 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16RV2_8WAY)
union _x16rv2_8way_context_overlay
{
@@ -497,10 +497,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RV2_4WAY)
union _x16rv2_4way_context_overlay
{

View File

@@ -1,13 +1,10 @@
/**
* x21s algo implementation
*
* Implementation by tpruvot@github Jan 2018
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -21,6 +18,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -38,6 +36,483 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X21S_8WAY)
static __thread uint64_t* x21s_8way_matrix;
union _x21s_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sha256_8way_context sha256;
} __attribute__ ((aligned (64)));
typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;
void x21s_8way_hash( void* output, const void* input )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
x21s_8way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
void *in4 = (void*) hash4;
void *in5 = (void*) hash5;
void *in6 = (void*) hash6;
void *in7 = (void*) hash7;
int size = 80;
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
if ( i == 0 )
blake512_8way_update( &ctx.blake, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_update( &ctx.blake, vhash, size );
}
blake512_8way_close( &ctx.blake, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)in4, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)in5, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)in6, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)in7, size<<3 );
break;
case SKEIN:
skein512_8way_init( &ctx.skein );
if ( i == 0 )
skein512_8way_update( &ctx.skein, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case JH:
jh512_8way_init( &ctx.jh );
if ( i == 0 )
jh512_8way_update( &ctx.jh, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
keccak512_8way_init( &ctx.keccak );
if ( i == 0 )
keccak512_8way_update( &ctx.keccak, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
keccak512_8way_update( &ctx.keccak, vhash, size );
}
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case LUFFA:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case CUBEHASH:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, size ); // size, not 64: cube may be first (80 byte input)
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, size );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in4, size );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in5, size );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in6, size );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in7, size );
sph_shavite512_close( &ctx.shavite, hash7 );
break;
case SIMD:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
(const BitSequence*)in4, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
(const BitSequence*)in5, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
(const BitSequence*)in6, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
(const BitSequence*)in7, size<<3 );
break;
case HAMSI:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in4, size );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in5, size );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in6, size );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in7, size );
sph_fugue512_close( &ctx.fugue, hash7 );
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in4, size );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in5, size );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in6, size );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in7, size );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
break;
case SHA_512:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, size );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
}
size = 64;
}
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash2, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash3, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash4, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash5, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash6, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash7, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash7 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash2, 64 );
sph_gost512_close( &ctx.gost, (void*) hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash3, 64 );
sph_gost512_close( &ctx.gost, (void*) hash3 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash4, 64 );
sph_gost512_close( &ctx.gost, (void*) hash4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash5, 64 );
sph_gost512_close( &ctx.gost, (void*) hash5 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash6, 64 );
sph_gost512_close( &ctx.gost, (void*) hash6 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash7, 64 );
sph_gost512_close( &ctx.gost, (void*) hash7 );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhash, 64 );
sha256_8way_close( &ctx.sha256, output );
}
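// The 2x256 interleave/deinterleave pairs above feed the 2-way Lyra2
// kernel two lanes at a time. A generic sketch of that pairing pattern
// (two_way() and lane[] are placeholders, not code from this commit):
//
//    for ( int i = 0; i < 8; i += 2 )
//    {
//       intrlv_2x256( vhash, lane[i], lane[i+1], 256 );
//       two_way( vhash );                    // e.g. LYRA2REV2_2WAY
//       dintrlv_2x256( lane[i], lane[i+1], vhash, 256 );
//    }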
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<3];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x21s_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( ( n < last_nonce ) && !(*restart) );
*hashes_done = n - first_nonce;
return 0;
}
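// Illustration only: with 8x64 interleaving the 32-bit nonce (header
// word 19) lands in the high half of 64-bit element 9 for every lane,
// which is why noncev points at (__m512i*)vdata + 9 and the blend above
// touches only alternating 32-bit slots. blend_nonces() is a
// hypothetical standalone equivalent using raw intrinsics.
#include <immintrin.h>
#include <stdint.h>
static inline __m512i blend_nonces( __m512i v, uint32_t n )
{
   const __m512i x = _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                       n+3, 0, n+2, 0, n+1, 0, n,   0 );
   // byte-swap each 32-bit lane to big-endian
   const __m512i bswapped = _mm512_shuffle_epi8( x, _mm512_set4_epi32(
                     0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203 ) );
   return _mm512_mask_blend_epi32( 0xAAAA, v, bswapped ); // odd slots only
}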
bool x21s_8way_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int64_t size = ROW_LEN_BYTES * 4; // nRows
x21s_8way_matrix = _mm_malloc( 2 * size, 64 );
return x21s_8way_matrix;
}
#elif defined (X21S_4WAY)
static __thread uint64_t* x21s_4way_matrix;
union _x21s_4way_context_overlay

View File

@@ -1,7 +1,4 @@
#include "x22i-gate.h"
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -12,6 +9,7 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -25,6 +23,426 @@
#include "algo/gost/sph_gost.h"
#include "algo/swifftx/swifftx.h"
#if defined(X22I_8WAY)
union _x22i_8way_ctx_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
hashState_echo echo;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sha256_8way_context sha256;
};
typedef union _x22i_8way_ctx_overlay x22i_8way_ctx_overlay;
void x22i_8way_hash( void *output, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8*4] __attribute__ ((aligned (64)));
uint64_t hash1[8*4] __attribute__ ((aligned (64)));
uint64_t hash2[8*4] __attribute__ ((aligned (64)));
uint64_t hash3[8*4] __attribute__ ((aligned (64)));
uint64_t hash4[8*4] __attribute__ ((aligned (64)));
uint64_t hash5[8*4] __attribute__ ((aligned (64)));
uint64_t hash6[8*4] __attribute__ ((aligned (64)));
uint64_t hash7[8*4] __attribute__ ((aligned (64)));
// unsigned char hash[64 * 4] __attribute__((aligned(64))) = {0};
unsigned char hashA0[64] __attribute__((aligned(64))) = {0};
unsigned char hashA1[64] __attribute__((aligned(32))) = {0};
unsigned char hashA2[64] __attribute__((aligned(32))) = {0};
unsigned char hashA3[64] __attribute__((aligned(32))) = {0};
unsigned char hashA4[64] __attribute__((aligned(64))) = {0};
unsigned char hashA5[64] __attribute__((aligned(32))) = {0};
unsigned char hashA6[64] __attribute__((aligned(32))) = {0};
unsigned char hashA7[64] __attribute__((aligned(32))) = {0};
x22i_8way_ctx_overlay ctx;
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)hash3, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)hash4, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)hash5, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)hash6, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash0,
(const BitSequence*)hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash1,
(const BitSequence*)hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash2,
(const BitSequence*)hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash3,
(const BitSequence*)hash3, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash4,
(const BitSequence*)hash4, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash5,
(const BitSequence*)hash5, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash6,
(const BitSequence*)hash6, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash7,
(const BitSequence*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8],
&hash4[8], &hash5[8], &hash6[8], &hash7[8], vhash );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash0[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash0[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash1[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash1[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash2[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash2[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash3[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash3[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash4[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash4[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash5[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash5[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash6[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash6[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash7[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash7[16] );
intrlv_8x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16],
&hash4[16], &hash5[16], &hash6[16], &hash7[16] );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24],
&hash4[24], &hash5[24], &hash6[24], &hash7[24], vhash );
ComputeSingleSWIFFTX((unsigned char*)hash0, (unsigned char*)hashA0);
ComputeSingleSWIFFTX((unsigned char*)hash1, (unsigned char*)hashA1);
ComputeSingleSWIFFTX((unsigned char*)hash2, (unsigned char*)hashA2);
ComputeSingleSWIFFTX((unsigned char*)hash3, (unsigned char*)hashA3);
ComputeSingleSWIFFTX((unsigned char*)hash4, (unsigned char*)hashA4);
ComputeSingleSWIFFTX((unsigned char*)hash5, (unsigned char*)hashA5);
ComputeSingleSWIFFTX((unsigned char*)hash6, (unsigned char*)hashA6);
ComputeSingleSWIFFTX((unsigned char*)hash7, (unsigned char*)hashA7);
intrlv_8x32_512( vhashA, hashA0, hashA1, hashA2, hashA3,
hashA4, hashA5, hashA6, hashA7 );
memset( vhash, 0, 64*8 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
memset( hashA0, 0, 64 );
memset( hashA1, 0, 64 );
memset( hashA2, 0, 64 );
memset( hashA3, 0, 64 );
memset( hashA4, 0, 64 );
memset( hashA5, 0, 64 );
memset( hashA6, 0, 64 );
memset( hashA7, 0, 64 );
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash0, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA0);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash1, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA1);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash2, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA2);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash3, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA3);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash4, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA4);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash5, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA5);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash6, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA6);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash7, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA7);
memset( hash0, 0, 64 );
memset( hash1, 0, 64 );
memset( hash2, 0, 64 );
memset( hash3, 0, 64 );
memset( hash4, 0, 64 );
memset( hash5, 0, 64 );
memset( hash6, 0, 64 );
memset( hash7, 0, 64 );
intrlv_2x256( vhash, hashA0, hashA1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hashA2, hashA3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hashA4, hashA5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hashA6, hashA7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash2, 64 );
sph_gost512_close( &ctx.gost, (void*) hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash3, 64 );
sph_gost512_close( &ctx.gost, (void*) hash3 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash4, 64 );
sph_gost512_close( &ctx.gost, (void*) hash4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash5, 64 );
sph_gost512_close( &ctx.gost, (void*) hash5 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash6, 64 );
sph_gost512_close( &ctx.gost, (void*) hash6 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash7, 64 );
sph_gost512_close( &ctx.gost, (void*) hash7 );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhash, 64 );
sha256_8way_close( &ctx.sha256, output );
}
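// Hedged sketch of the 2x256 interleave used with LYRA2RE_2WAY above,
// assuming 256-bit lane granularity: chunk i of lane l lands at
// dst + (i*2 + l)*32 bytes, so for the 256-bit inputs here it is just
// the concatenation dst = lane0 || lane1. Not the library routine,
// only an illustration of the assumed layout.
static void intrlv_2x256_sketch( void *dst, const void *lane0,
                                 const void *lane1, int bit_len )
{
   uint64_t *d = (uint64_t*)dst;
   const uint64_t *s0 = (const uint64_t*)lane0;
   const uint64_t *s1 = (const uint64_t*)lane1;
   for ( int i = 0; i < bit_len/256; i++ )
   {
      for ( int j = 0; j < 4; j++ )  d[ i*8 + j ]     = s0[ i*4 + j ];
      for ( int j = 0; j < 4; j++ )  d[ i*8 + 4 + j ] = s1[ i*4 + j ];
   }
}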
int scanhash_x22i_8way( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 8;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x08ff;
InitializeSWIFFTX();
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x22i_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
   if ( unlikely( hash7[ lane ] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
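// Hedged sketch of the lane extraction assumed above: in an 8-way
// 32-bit interleaved buffer, dword i of lane l sits at src[ i*8 + l ],
// so pulling one lane's 256-bit hash (what extr_lane_8x32 is used for
// here) is a strided copy. Illustration only, not the library code.
static void extr_lane_8x32_sketch( uint32_t *dst, const uint32_t *src,
                                   int lane, int bit_len )
{
   for ( int i = 0; i < bit_len/32; i++ )
      dst[i] = src[ i*8 + lane ];
}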
#elif defined(X22I_4WAY)
union _x22i_4way_ctx_overlay
{
blake512_4way_context blake;

View File

@@ -2,27 +2,39 @@
bool register_x22i_algo( algo_gate_t* gate )
{
#if defined (X22I_4WAY)
#if defined (X22I_8WAY)
gate->scanhash = (void*)&scanhash_x22i_8way;
gate->hash = (void*)&x22i_8way_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
#elif defined (X22I_4WAY)
gate->scanhash = (void*)&scanhash_x22i_4way;
gate->hash = (void*)&x22i_4way_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#else
gate->scanhash = (void*)&scanhash_x22i;
gate->hash = (void*)&x22i_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
return true;
};
bool register_x25x_algo( algo_gate_t* gate )
{
#if defined (X22I_4WAY)
#if defined (X25X_8WAY)
gate->scanhash = (void*)&scanhash_x25x_8way;
gate->hash = (void*)&x25x_8way_hash;
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
#elif defined (X25X_4WAY)
gate->scanhash = (void*)&scanhash_x25x_4way;
gate->hash = (void*)&x25x_4way_hash;
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#else
gate->scanhash = (void*)&scanhash_x25x;
gate->hash = (void*)&x25x_hash;
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
return true;
};

View File

@@ -6,30 +6,64 @@
#include <stdint.h>
#include <unistd.h>
#if defined(__AVX2__) && defined(__AES__)
#define X22I_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X22I_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X22I_4WAY 1
#endif
bool register_x22i__algo( algo_gate_t* gate );
bool register_x22i_algo( algo_gate_t* gate );
#if defined(X22I_4WAY)
#if defined(X22I_8WAY)
void x22i_8way_hash( void *state, const void *input );
int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X22I_4WAY)
void x22i_4way_hash( void *state, const void *input );
int scanhash_x22i_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x25x_4way_hash( void *state, const void *input );
int scanhash_x25x_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#else
void x22i_hash( void *state, const void *input );
int scanhash_x22i( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
// X25X 8-way has serious problems: it crashes merely from enlarging the
// buffers, with no other change. It may be related to how the 2-dimensional
// hash arrays are accessed (see the pointer-decay sketch below).
//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// #define X25X_8WAY 1
#if defined(__AVX2__) && defined(__AES__)
#define X25X_4WAY 1
#endif
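// Hedged illustration of the 2-dim array hazard mentioned above: for
// unsigned char h[25][64], h[i] and &h[i] evaluate to the same address
// but have different types, so a cast silently hides a 64x difference
// in pointer stride. Demo only, not part of the algorithm.
#include <stdio.h>

static void decay_demo( void )
{
   unsigned char h[25][64];
   unsigned char *row        = h[1];    // decays to unsigned char*
   unsigned char (*prow)[64] = &h[1];   // pointer to a whole 64-byte row
   printf( "same address: %d\n", (void*)row == (void*)prow );
   printf( "row+1 step: %td, prow+1 step: %td\n",
           (unsigned char*)(row + 1) - (unsigned char*)row,       // 1
           (unsigned char*)(prow + 1) - (unsigned char*)prow );   // 64
}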
bool register_x25x_algo( algo_gate_t* gate );
#if defined(X25X_8WAY)
void x25x_8way_hash( void *state, const void *input );
int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X25X_4WAY)
void x25x_4way_hash( void *state, const void *input );
int scanhash_x25x_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x25x_hash( void *state, const void *input );
int scanhash_x25x( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif // X22I_GATE_H__

View File

@@ -1,6 +1,6 @@
#include "x22i-gate.h"
#if defined(X22I_4WAY)
#if defined(X25X_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
@@ -88,276 +88,282 @@ void x25x_4way_hash( void *output, const void *input )
unsigned char hash2[25][64] __attribute__((aligned(64))) = {0};
unsigned char hash3[25][64] __attribute__((aligned(64))) = {0};
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
unsigned char vhashA[24][64*4] __attribute__ ((aligned (64)));
// Doubling the size of vhashX breaks everything. It may be related to how
// the array is referenced: vhashX vs vhashX[0] vs &vhashX[0]. Changing the
// notation did allow the larger buffer to compile, but failures still
// appeared further along.
// unsigned char vhashX[24][64*8] __attribute__ ((aligned (64)));
unsigned char vhashX[24][64*4] __attribute__ ((aligned (64)));
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
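// Hedged sketch of the 4-way 32-bit interleave assumed by
// intrlv_4x32_512: dword i of lane l lands at dst[ i*4 + l ], which is
// why each vhashX row above is 64*4 bytes (four 64-byte lane rows).
// Illustration of the layout only, not the library routine.
static void intrlv_4x32_512_sketch( uint32_t *dst, const uint32_t *s0,
           const uint32_t *s1, const uint32_t *s2, const uint32_t *s3 )
{
   for ( int i = 0; i < 16; i++ )   // 16 dwords = 512 bits per lane
   {
      dst[ i*4 + 0 ] = s0[i];
      dst[ i*4 + 1 ] = s1[i];
      dst[ i*4 + 2 ] = s2[i];
      dst[ i*4 + 3 ] = s3[i];
   }
}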
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64_512( &hash0[0], &hash1[0], &hash2[0], &hash3[0], vhash );
dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64_512( &hash0[1], &hash1[1], &hash2[1], &hash3[1], vhash );
dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)&hash0[2],
(const char*)&hash0[1], 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0[2],
(const char*)hash0[1], 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)&hash1[2],
(const char*)&hash1[1], 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash1[2],
(const char*)hash1[1], 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)&hash2[2],
(const char*)&hash2[1], 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash2[2],
(const char*)hash2[1], 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)&hash3[2],
(const char*)&hash3[1], 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash3[2],
(const char*)hash3[1], 512 );
intrlv_4x64_512( vhash, &hash0[2], &hash1[2], &hash2[2], &hash3[2] );
intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64_512( &hash0[3], &hash1[3], &hash2[3], &hash3[3], vhash );
dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64_512( &hash0[4], &hash1[4], &hash2[4], &hash3[4], vhash );
dintrlv_4x64_512( hash0[4], hash1[4], hash2[4], hash3[4], vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64_512( &hash0[5], &hash1[5], &hash2[5], &hash3[5], vhash );
dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash0[6],
(const BitSequence*)&hash0[5], 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0[6],
(const BitSequence*)hash0[5], 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash1[6],
(const BitSequence*)&hash1[5], 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1[6],
(const BitSequence*)hash1[5], 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash2[6],
(const BitSequence*)&hash2[5], 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2[6],
(const BitSequence*)hash2[5], 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash3[6],
(const BitSequence*)&hash3[5], 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3[6],
(const BitSequence*)hash3[5], 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) &hash0[7],
(const byte*)&hash0[6], 64 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0[7],
(const byte*)hash0[6], 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) &hash1[7],
(const byte*)&hash1[6], 64 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1[7],
(const byte*)hash1[6], 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) &hash2[7],
(const byte*)&hash2[6], 64 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2[7],
(const byte*)hash2[6], 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) &hash3[7],
(const byte*)&hash3[6], 64 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3[7],
(const byte*)hash3[6], 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) &hash0[7], 64);
sph_shavite512_close(&ctx.shavite, &hash0[8]);
sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
sph_shavite512_close(&ctx.shavite, hash0[8]);
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) &hash1[7], 64);
sph_shavite512_close(&ctx.shavite, &hash1[8]);
sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64);
sph_shavite512_close(&ctx.shavite, hash1[8]);
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) &hash2[7], 64);
sph_shavite512_close(&ctx.shavite, &hash2[8]);
sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64);
sph_shavite512_close(&ctx.shavite, hash2[8]);
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) &hash3[7], 64);
sph_shavite512_close(&ctx.shavite, &hash3[8]);
sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64);
sph_shavite512_close(&ctx.shavite, hash3[8]);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence*)&hash0[9],
(const BitSequence*)&hash0[8], 512 );
update_final_sd( &ctx.simd, (BitSequence*)hash0[9],
(const BitSequence*)hash0[8], 512 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence*)&hash1[9],
(const BitSequence*)&hash1[8], 512 );
update_final_sd( &ctx.simd, (BitSequence*)hash1[9],
(const BitSequence*)hash1[8], 512 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence*)&hash2[9],
(const BitSequence*)&hash2[8], 512 );
update_final_sd( &ctx.simd, (BitSequence*)hash2[9],
(const BitSequence*)hash2[8], 512 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence*)&hash3[9],
(const BitSequence*)&hash3[8], 512 );
update_final_sd( &ctx.simd, (BitSequence*)hash3[9],
(const BitSequence*)hash3[8], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)&hash0[10],
(const BitSequence*)&hash0[9], 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash0[10],
(const BitSequence*)hash0[9], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)&hash1[10],
(const BitSequence*)&hash1[9], 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash1[10],
(const BitSequence*)hash1[9], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)&hash2[10],
(const BitSequence*)&hash2[9], 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash2[10],
(const BitSequence*)hash2[9], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)&hash3[10],
(const BitSequence*)&hash3[9], 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash3[10],
(const BitSequence*)hash3[9], 512 );
intrlv_4x64_512( vhash, &hash0[10], &hash1[10], &hash2[10], &hash3[10] );
intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( &hash0[11], &hash1[11], &hash2[11], &hash3[11], vhash );
dintrlv_4x64_512( hash0[11], hash1[11], hash2[11], hash3[11], vhash );
sph_fugue512_init(&ctx.fugue);
sph_fugue512(&ctx.fugue, (const void*) &hash0[11], 64);
sph_fugue512_close(&ctx.fugue, &hash0[12]);
sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64);
sph_fugue512_close(&ctx.fugue, hash0[12]);
sph_fugue512_init(&ctx.fugue);
sph_fugue512(&ctx.fugue, (const void*) &hash1[11], 64);
sph_fugue512_close(&ctx.fugue, &hash1[12]);
sph_fugue512(&ctx.fugue, (const void*) hash1[11], 64);
sph_fugue512_close(&ctx.fugue, hash1[12]);
sph_fugue512_init(&ctx.fugue);
sph_fugue512(&ctx.fugue, (const void*) &hash2[11], 64);
sph_fugue512_close(&ctx.fugue, &hash2[12]);
sph_fugue512(&ctx.fugue, (const void*) hash2[11], 64);
sph_fugue512_close(&ctx.fugue, hash2[12]);
sph_fugue512_init(&ctx.fugue);
sph_fugue512(&ctx.fugue, (const void*) &hash3[11], 64);
sph_fugue512_close(&ctx.fugue, &hash3[12]);
sph_fugue512(&ctx.fugue, (const void*) hash3[11], 64);
sph_fugue512_close(&ctx.fugue, hash3[12]);
intrlv_4x32_512( vhash, &hash0[12], &hash1[12], &hash2[12], &hash3[12] );
intrlv_4x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12] );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32_512( &hash0[13], &hash1[13], &hash2[13], &hash3[13], vhash );
dintrlv_4x32_512( hash0[13], hash1[13], hash2[13], hash3[13], vhash );
sph_whirlpool_init(&ctx.whirlpool);
sph_whirlpool (&ctx.whirlpool, (const void*) &hash0[13], 64);
sph_whirlpool_close(&ctx.whirlpool, &hash0[14]);
sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64);
sph_whirlpool_close(&ctx.whirlpool, hash0[14]);
sph_whirlpool_init(&ctx.whirlpool);
sph_whirlpool (&ctx.whirlpool, (const void*) &hash1[13], 64);
sph_whirlpool_close(&ctx.whirlpool, &hash1[14]);
sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64);
sph_whirlpool_close(&ctx.whirlpool, hash1[14]);
sph_whirlpool_init(&ctx.whirlpool);
sph_whirlpool (&ctx.whirlpool, (const void*) &hash2[13], 64);
sph_whirlpool_close(&ctx.whirlpool, &hash2[14]);
sph_whirlpool (&ctx.whirlpool, (const void*) hash2[13], 64);
sph_whirlpool_close(&ctx.whirlpool, hash2[14]);
sph_whirlpool_init(&ctx.whirlpool);
sph_whirlpool (&ctx.whirlpool, (const void*) &hash3[13], 64);
sph_whirlpool_close(&ctx.whirlpool, &hash3[14]);
sph_whirlpool (&ctx.whirlpool, (const void*) hash3[13], 64);
sph_whirlpool_close(&ctx.whirlpool, hash3[14]);
intrlv_4x64_512( vhash, &hash0[14], &hash1[14], &hash2[14], &hash3[14] );
intrlv_4x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14] );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64_512( &hash0[15], &hash1[15], &hash2[15], &hash3[15], vhash );
dintrlv_4x64_512( hash0[15], hash1[15], hash2[15], hash3[15], vhash );
ComputeSingleSWIFFTX((unsigned char*)&hash0[12], (unsigned char*)&hash0[16]);
ComputeSingleSWIFFTX((unsigned char*)&hash1[12], (unsigned char*)&hash1[16]);
ComputeSingleSWIFFTX((unsigned char*)&hash2[12], (unsigned char*)&hash2[16]);
ComputeSingleSWIFFTX((unsigned char*)&hash3[12], (unsigned char*)&hash3[16]);
ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]);
ComputeSingleSWIFFTX((unsigned char*)hash1[12], (unsigned char*)hash1[16]);
ComputeSingleSWIFFTX((unsigned char*)hash2[12], (unsigned char*)hash2[16]);
ComputeSingleSWIFFTX((unsigned char*)hash3[12], (unsigned char*)hash3[16]);
intrlv_4x32_512( &vhashA, &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
intrlv_4x32_512( vhashX[0], hash0[16], hash1[16], hash2[16], hash3[16] );
memset( vhash, 0, 64*4 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, 64 );
haval256_5_4way( &ctx.haval, vhashX[0], 64 );
haval256_5_4way_close( &ctx.haval, vhash );
dintrlv_4x32_512( &hash0[17], &hash1[17], &hash2[17], &hash3[17], vhash );
dintrlv_4x32_512( hash0[17], hash1[17], hash2[17], hash3[17], vhash );
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) &hash0[17], 64);
sph_tiger_close(&ctx.tiger, (void*) &hash0[18]);
sph_tiger (&ctx.tiger, (const void*) hash0[17], 64);
sph_tiger_close(&ctx.tiger, (void*) hash0[18]);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) &hash1[17], 64);
sph_tiger_close(&ctx.tiger, (void*) &hash1[18]);
sph_tiger (&ctx.tiger, (const void*) hash1[17], 64);
sph_tiger_close(&ctx.tiger, (void*) hash1[18]);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) &hash2[17], 64);
sph_tiger_close(&ctx.tiger, (void*) &hash2[18]);
sph_tiger (&ctx.tiger, (const void*) hash2[17], 64);
sph_tiger_close(&ctx.tiger, (void*) hash2[18]);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) &hash3[17], 64);
sph_tiger_close(&ctx.tiger, (void*) &hash3[18]);
sph_tiger (&ctx.tiger, (const void*) hash3[17], 64);
sph_tiger_close(&ctx.tiger, (void*) hash3[18]);
LYRA2RE( (void*)&hash0[19], 32, (const void*)&hash0[18], 32,
(const void*)&hash0[18], 32, 1, 4, 4 );
LYRA2RE( (void*)&hash1[19], 32, (const void*)&hash1[18], 32,
(const void*)&hash1[18], 32, 1, 4, 4 );
LYRA2RE( (void*)&hash2[19], 32, (const void*)&hash2[18], 32,
(const void*)&hash2[18], 32, 1, 4, 4 );
LYRA2RE( (void*)&hash3[19], 32, (const void*)&hash3[18], 32,
(const void*)&hash3[18], 32, 1, 4, 4 );
LYRA2RE( (void*)hash0[19], 32, (const void*)hash0[18], 32,
(const void*)hash0[18], 32, 1, 4, 4 );
LYRA2RE( (void*)hash1[19], 32, (const void*)hash1[18], 32,
(const void*)hash1[18], 32, 1, 4, 4 );
LYRA2RE( (void*)hash2[19], 32, (const void*)hash2[18], 32,
(const void*)hash2[18], 32, 1, 4, 4 );
LYRA2RE( (void*)hash3[19], 32, (const void*)hash3[18], 32,
(const void*)hash3[18], 32, 1, 4, 4 );
sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) &hash0[19], 64);
sph_gost512_close(&ctx.gost, (void*) &hash0[20]);
sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
sph_gost512_close(&ctx.gost, (void*) hash0[20]);
sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) &hash1[19], 64);
sph_gost512_close(&ctx.gost, (void*) &hash1[20]);
sph_gost512 (&ctx.gost, (const void*) hash1[19], 64);
sph_gost512_close(&ctx.gost, (void*) hash1[20]);
sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) &hash2[19], 64);
sph_gost512_close(&ctx.gost, (void*) &hash2[20]);
sph_gost512 (&ctx.gost, (const void*) hash2[19], 64);
sph_gost512_close(&ctx.gost, (void*) hash2[20]);
sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) &hash3[19], 64);
sph_gost512_close(&ctx.gost, (void*) &hash3[20]);
sph_gost512 (&ctx.gost, (const void*) hash3[19], 64);
sph_gost512_close(&ctx.gost, (void*) hash3[20]);
intrlv_4x32_512( vhashA, &hash0[20], &hash1[20], &hash2[20], &hash3[20] );
intrlv_4x32_512( vhashX[0], hash0[20], hash1[20], hash2[20], hash3[20] );
memset( vhash, 0, 64*4 );
sha256_4way_init( &ctx.sha256 );
sha256_4way( &ctx.sha256, vhashA, 64 );
sha256_4way( &ctx.sha256, vhashX[0], 64 );
sha256_4way_close( &ctx.sha256, vhash );
dintrlv_4x32_512( &hash0[21], &hash1[21], &hash2[21], &hash3[21], vhash );
dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash );
sph_panama_init(&ctx.panama);
sph_panama (&ctx.panama, (const void*) &hash0[21], 64 );
sph_panama_close(&ctx.panama, (void*) &hash0[22]);
sph_panama (&ctx.panama, (const void*) hash0[21], 64 );
sph_panama_close(&ctx.panama, (void*) hash0[22]);
sph_panama_init(&ctx.panama);
sph_panama (&ctx.panama, (const void*) &hash1[21], 64 );
sph_panama_close(&ctx.panama, (void*) &hash1[22]);
sph_panama (&ctx.panama, (const void*) hash1[21], 64 );
sph_panama_close(&ctx.panama, (void*) hash1[22]);
sph_panama_init(&ctx.panama);
sph_panama (&ctx.panama, (const void*) &hash2[21], 64 );
sph_panama_close(&ctx.panama, (void*) &hash2[22]);
sph_panama (&ctx.panama, (const void*) hash2[21], 64 );
sph_panama_close(&ctx.panama, (void*) hash2[22]);
sph_panama_init(&ctx.panama);
sph_panama (&ctx.panama, (const void*) &hash3[21], 64 );
sph_panama_close(&ctx.panama, (void*) &hash3[22]);
sph_panama (&ctx.panama, (const void*) hash3[21], 64 );
sph_panama_close(&ctx.panama, (void*) hash3[22]);
laneHash(512, (const BitSequence*)&hash0[22], 512, (BitSequence*)&hash0[23]);
laneHash(512, (const BitSequence*)&hash1[22], 512, (BitSequence*)&hash1[23]);
laneHash(512, (const BitSequence*)&hash2[22], 512, (BitSequence*)&hash2[23]);
laneHash(512, (const BitSequence*)&hash3[22], 512, (BitSequence*)&hash3[23]);
laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]);
laneHash(512, (const BitSequence*)hash1[22], 512, (BitSequence*)hash1[23]);
laneHash(512, (const BitSequence*)hash2[22], 512, (BitSequence*)hash2[23]);
laneHash(512, (const BitSequence*)hash3[22], 512, (BitSequence*)hash3[23]);
x25x_shuffle( hash0 );
x25x_shuffle( hash1 );
x25x_shuffle( hash2 );
x25x_shuffle( hash3 );
intrlv_4x32_512( &vhashA[ 0], &hash0[ 0], &hash1[ 0], &hash2[ 0], &hash3[ 0] );
intrlv_4x32_512( &vhashA[ 1], &hash0[ 1], &hash1[ 1], &hash2[ 1], &hash3[ 1] );
intrlv_4x32_512( &vhashA[ 2], &hash0[ 2], &hash1[ 2], &hash2[ 2], &hash3[ 2] );
intrlv_4x32_512( &vhashA[ 3], &hash0[ 3], &hash1[ 3], &hash2[ 3], &hash3[ 3] );
intrlv_4x32_512( &vhashA[ 4], &hash0[ 4], &hash1[ 4], &hash2[ 4], &hash3[ 4] );
intrlv_4x32_512( &vhashA[ 5], &hash0[ 5], &hash1[ 5], &hash2[ 5], &hash3[ 5] );
intrlv_4x32_512( &vhashA[ 6], &hash0[ 6], &hash1[ 6], &hash2[ 6], &hash3[ 6] );
intrlv_4x32_512( &vhashA[ 7], &hash0[ 7], &hash1[ 7], &hash2[ 7], &hash3[ 7] );
intrlv_4x32_512( &vhashA[ 8], &hash0[ 8], &hash1[ 8], &hash2[ 8], &hash3[ 8] );
intrlv_4x32_512( &vhashA[ 9], &hash0[ 9], &hash1[ 9], &hash2[ 9], &hash3[ 9] );
intrlv_4x32_512( &vhashA[10], &hash0[10], &hash1[10], &hash2[10], &hash3[10] );
intrlv_4x32_512( &vhashA[11], &hash0[11], &hash1[11], &hash2[11], &hash3[11] );
intrlv_4x32_512( &vhashA[12], &hash0[12], &hash1[12], &hash2[12], &hash3[12] );
intrlv_4x32_512( &vhashA[13], &hash0[13], &hash1[13], &hash2[13], &hash3[13] );
intrlv_4x32_512( &vhashA[14], &hash0[14], &hash1[14], &hash2[14], &hash3[14] );
intrlv_4x32_512( &vhashA[15], &hash0[15], &hash1[15], &hash2[15], &hash3[15] );
intrlv_4x32_512( &vhashA[16], &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
intrlv_4x32_512( &vhashA[17], &hash0[17], &hash1[17], &hash2[17], &hash3[17] );
intrlv_4x32_512( &vhashA[18], &hash0[18], &hash1[18], &hash2[18], &hash3[18] );
intrlv_4x32_512( &vhashA[19], &hash0[19], &hash1[19], &hash2[19], &hash3[19] );
intrlv_4x32_512( &vhashA[20], &hash0[20], &hash1[20], &hash2[20], &hash3[20] );
intrlv_4x32_512( &vhashA[21], &hash0[21], &hash1[21], &hash2[21], &hash3[21] );
intrlv_4x32_512( &vhashA[22], &hash0[22], &hash1[22], &hash2[22], &hash3[22] );
intrlv_4x32_512( &vhashA[23], &hash0[23], &hash1[23], &hash2[23], &hash3[23] );
intrlv_4x32_512( vhashX[ 0], hash0[ 0], hash1[ 0], hash2[ 0], hash3[ 0] );
intrlv_4x32_512( vhashX[ 1], hash0[ 1], hash1[ 1], hash2[ 1], hash3[ 1] );
intrlv_4x32_512( vhashX[ 2], hash0[ 2], hash1[ 2], hash2[ 2], hash3[ 2] );
intrlv_4x32_512( vhashX[ 3], hash0[ 3], hash1[ 3], hash2[ 3], hash3[ 3] );
intrlv_4x32_512( vhashX[ 4], hash0[ 4], hash1[ 4], hash2[ 4], hash3[ 4] );
intrlv_4x32_512( vhashX[ 5], hash0[ 5], hash1[ 5], hash2[ 5], hash3[ 5] );
intrlv_4x32_512( vhashX[ 6], hash0[ 6], hash1[ 6], hash2[ 6], hash3[ 6] );
intrlv_4x32_512( vhashX[ 7], hash0[ 7], hash1[ 7], hash2[ 7], hash3[ 7] );
intrlv_4x32_512( vhashX[ 8], hash0[ 8], hash1[ 8], hash2[ 8], hash3[ 8] );
intrlv_4x32_512( vhashX[ 9], hash0[ 9], hash1[ 9], hash2[ 9], hash3[ 9] );
intrlv_4x32_512( vhashX[10], hash0[10], hash1[10], hash2[10], hash3[10] );
intrlv_4x32_512( vhashX[11], hash0[11], hash1[11], hash2[11], hash3[11] );
intrlv_4x32_512( vhashX[12], hash0[12], hash1[12], hash2[12], hash3[12] );
intrlv_4x32_512( vhashX[13], hash0[13], hash1[13], hash2[13], hash3[13] );
intrlv_4x32_512( vhashX[14], hash0[14], hash1[14], hash2[14], hash3[14] );
intrlv_4x32_512( vhashX[15], hash0[15], hash1[15], hash2[15], hash3[15] );
intrlv_4x32_512( vhashX[16], hash0[16], hash1[16], hash2[16], hash3[16] );
intrlv_4x32_512( vhashX[17], hash0[17], hash1[17], hash2[17], hash3[17] );
intrlv_4x32_512( vhashX[18], hash0[18], hash1[18], hash2[18], hash3[18] );
intrlv_4x32_512( vhashX[19], hash0[19], hash1[19], hash2[19], hash3[19] );
intrlv_4x32_512( vhashX[20], hash0[20], hash1[20], hash2[20], hash3[20] );
intrlv_4x32_512( vhashX[21], hash0[21], hash1[21], hash2[21], hash3[21] );
intrlv_4x32_512( vhashX[22], hash0[22], hash1[22], hash2[22], hash3[22] );
intrlv_4x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23] );
blake2s_4way_init( &ctx.blake2s, 32 );
blake2s_4way_full_blocks( &ctx.blake2s, vhash, vhashA, 64*24 );
dintrlv_4x32( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash, 256 );
blake2s_4way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
/*
dintrlv_4x32( hash0[24], hash1[24], hash2[24], hash3[24], vhash, 256 );
memcpy(output, &hash0[24], 32);
memcpy(output+32, &hash1[24], 32);
memcpy(output+64, &hash2[24], 32);
memcpy(output+96, &hash3[24], 32);
memcpy(output, hash0[24], 32);
memcpy(output+32, hash1[24], 32);
memcpy(output+64, hash2[24], 32);
memcpy(output+96, hash3[24], 32);
*/
}
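// Hedged note on the output layout: blake2s_4way_full_blocks leaves the
// result 4x32 interleaved, so dword w of lane l is at out[ w*4 + l ].
// That is why the scanner below sets hash7 = &hash[7<<2] and tests
// hash7[lane] against the target. Helper is illustrative only.
static inline uint32_t lane_word_4x32( const uint32_t *out, int lane, int w )
{
   return out[ (w << 2) + lane ];
}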
int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
@@ -365,11 +371,14 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 4;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
@@ -385,6 +394,16 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x25x_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
/*
for ( int i = 0; i < 4; i++ )
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
@@ -392,10 +411,11 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
*/
n += 4;
} while ( likely( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ) );
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce + 1;
*hashes_done = n - first_nonce;
return 0;
}

configure
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.5.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.6.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.10.5'
PACKAGE_STRING='cpuminer-opt 3.10.5'
PACKAGE_VERSION='3.10.6'
PACKAGE_STRING='cpuminer-opt 3.10.6'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.10.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.10.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.10.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.10.6:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.10.5
cpuminer-opt configure 3.10.6
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.10.5, which was
It was created by cpuminer-opt $as_me 3.10.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.10.5'
VERSION='3.10.6'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.10.5, which was
This file was extended by cpuminer-opt $as_me 3.10.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.10.5
cpuminer-opt config.status 3.10.6
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.10.5])
AC_INIT([cpuminer-opt], [3.10.6])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -872,6 +872,7 @@ static uint64_t accept_sum = 0;
static uint64_t reject_sum = 0;
static double norm_diff_sum = 0.;
static uint32_t last_block_height = 0;
static bool new_job = false;
static double last_targetdiff = 0.;
static double ref_rate_hi = 0.;
static double ref_rate_lo = 1e100;
@@ -887,6 +888,7 @@ struct share_stats_t
double share_diff;
double stratum_diff;
double target_diff;
char job_id[32];
};
#define s_stats_size 8
@@ -1093,8 +1095,9 @@ static int share_result( int result, struct work *null_work,
rejected_share_count, solved_block_count );
if ( have_stratum && !opt_quiet )
applog2( LOG_INFO, "Share diff %.3g (%5f%%), block %d",
my_stats.share_diff, share_ratio, stratum.block_height );
applog2( LOG_INFO, "Share diff %.3g (%5f%%), block %d, job %s",
my_stats.share_diff, share_ratio, stratum.block_height,
my_stats.job_id );
if ( reason )
{
@@ -1762,6 +1765,7 @@ void work_set_target_ratio( struct work* work, uint32_t* hash )
share_stats[ s_put_ptr ].net_diff = net_diff;
share_stats[ s_put_ptr ].stratum_diff = stratum_diff;
share_stats[ s_put_ptr ].target_diff = work->targetdiff;
strncpy( share_stats[ s_put_ptr ].job_id, work->job_id, 31 );
share_stats[ s_put_ptr ].job_id[31] = 0;   // job_id[32]: keep it terminated
s_put_ptr = stats_ptr_incr( s_put_ptr );
@@ -2586,28 +2590,38 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|| ( last_block_height != sctx->block_height ) )
{
double hr = 0.;
new_job = false;
pthread_mutex_lock( &stats_lock );
for ( int i = 0; i < opt_n_threads; i++ )
hr += thr_hashrates[i];
global_hashrate = hr;
pthread_mutex_unlock( &stats_lock );
if ( stratum_diff != sctx->job.diff )
applog( LOG_BLUE, "New stratum difficulty" );
if ( last_block_height != sctx->block_height )
applog( LOG_BLUE, "New block" );
if ( !opt_quiet )
{
if ( stratum_diff != sctx->job.diff )
applog( LOG_BLUE, "New stratum diff %g, block %d, job %s",
sctx->job.diff, sctx->block_height, g_work->job_id );
else if ( last_block_height != sctx->block_height )
applog( LOG_BLUE, "New block %d, job %s", sctx->block_height,
g_work->job_id );
else
applog( LOG_BLUE,"New job %s.", g_work->job_id );
}
// Update data and calculate new estimates.
stratum_diff = sctx->job.diff;
last_block_height = stratum.block_height;
last_targetdiff = g_work->targetdiff;
applog2( LOG_INFO, "%s %s block %d", short_url,
algo_names[opt_algo], stratum.block_height );
applog2( LOG_INFO, "Diff: net %g, stratum %g, target %g",
net_diff, stratum_diff, last_targetdiff );
if ( !opt_quiet )
{
applog2( LOG_INFO, "%s %s block %d", short_url,
algo_names[opt_algo], stratum.block_height );
applog2( LOG_INFO, "Diff: net %g, stratum %g, target %g",
net_diff, stratum_diff, last_targetdiff );
}
if ( hr > 0. )
{
@@ -2619,10 +2633,13 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
sprintf_et( share_ttf, last_targetdiff * diff_to_hash / hr );
scale_hash_for_display ( &hr, hr_units );
applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
hr, hr_units, block_ttf, share_ttf );
if ( !opt_quiet )
{
applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
hr, hr_units, block_ttf, share_ttf );
}
}
}
} // new diff/block
}
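// Hedged arithmetic sketch of the TTF ("time to find") estimate printed
// above: expected seconds = difficulty * diff_to_hash / hashrate, where
// diff_to_hash is assumed to be 2^32 for Bitcoin-style diff-1 work.
// Example: diff 0.25 at 5 MH/s -> 0.25 * 2^32 / 5e6 ~= 215 s.
static double ttf_seconds( double diff, double hashrate )
{
   const double diff_to_hash = 4294967296.0;   // 2^32, an assumption
   return diff * diff_to_hash / hashrate;
}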
void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
@@ -2700,25 +2717,23 @@ static void *stratum_thread(void *userdata )
if ( stratum.job.job_id
&& ( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) )
{
new_job = true;
pthread_mutex_lock(&g_work_lock);
algo_gate.stratum_gen_work( &stratum, &g_work );
time(&g_work_time);
pthread_mutex_unlock(&g_work_lock);
restart_threads();
/*
if ( stratum.job.clean || jsonrpc_2 )
{
static uint32_t last_block_height;
if ( last_block_height != stratum.block_height )
{
last_block_height = stratum.block_height;
if ( !opt_quiet && last_block_height && new_job
&& ( last_block_height == stratum.block_height ) )
{
new_job = false;
applog( LOG_BLUE,"New job %s", g_work.job_id );
}
}
else
*/
if (opt_debug && !opt_quiet)
else if (opt_debug && !opt_quiet)
{
applog( LOG_BLUE, "%s asks job %d for block %d", short_url,
strtoul( stratum.job.job_id, NULL, 16 ), stratum.block_height );
@@ -2960,9 +2975,7 @@ void parse_arg(int key, char *arg )
show_usage_and_exit(1);
opt_retries = v;
break;
// case 'R':
// applog(LOG_WARNING,"\n-R is no longer valid, use --retry-pause instead.");
case 1025:
case 1025:
v = atoi(arg);
if (v < 1 || v > 9999) /* sanity check */
show_usage_and_exit(1);
@@ -3018,11 +3031,14 @@ void parse_arg(int key, char *arg )
*hp++ = '@';
} else
hp = ap;
if (ap != arg) {
if (strncasecmp(arg, "http://", 7) &&
strncasecmp(arg, "https://", 8) &&
strncasecmp(arg, "stratum+tcp://", 14)) {
fprintf(stderr, "unknown protocol -- '%s'\n", arg);
if ( ap != arg )
{
if ( strncasecmp( arg, "http://", 7 )
&& strncasecmp( arg, "https://", 8 )
&& strncasecmp( arg, "stratum+tcp://", 14 )
&& strncasecmp( arg, "stratum+tcps://", 15 ) )
{
fprintf(stderr, "unknown protocol -- '%s'\n", arg);
show_usage_and_exit(1);
}
free(rpc_url);
@@ -3427,7 +3443,7 @@ bool check_cpu_capability ()
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES " );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha ) printf( " SHA" );
printf("\nAlgo features:");
@@ -3439,7 +3455,7 @@ bool check_cpu_capability ()
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES " );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
}
printf("\n");
@@ -3619,7 +3635,9 @@ int main(int argc, char *argv[])
pthread_mutex_init( &stratum.sock_lock, NULL );
pthread_mutex_init( &stratum.work_lock, NULL );
flags = !opt_benchmark && strncmp( rpc_url, "https:", 6 )
flags = !opt_benchmark
&& ( strncmp( rpc_url, "https:", 6 )
&& strncasecmp( rpc_url, "stratum+tcps://", 15 ) )
// skip SSL init only when the URL is neither https nor stratum+tcps
? ( CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL )
: CURL_GLOBAL_ALL;
if ( curl_global_init( flags ) )

View File

@@ -113,6 +113,9 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \
| ( (uint64_t)(i0) & 0xffffffff ) ) )
// { m128_1, m128_1, m128_0, m128_0 }
#define m512_const_2x128( v1, v0 ) \
_mm512_mask_blend_epi64( 0x0f, m512_const1_128( v1 ), m512_const1_128( v0 ) )
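// Hedged usage sketch for m512_const_2x128: per the lane-order comment
// above, v1 is broadcast to the upper two 128-bit lanes and v0 to the
// lower two. Constants are arbitrary demo values.
static inline __m512i demo_const_2x128( void )
{
   const __m128i v1 = _mm_set_epi64x( 0x1111111111111111LL,
                                      0x2222222222222222LL );
   const __m128i v0 = _mm_set_epi64x( 0x3333333333333333LL,
                                      0x4444444444444444LL );
   return m512_const_2x128( v1, v0 );   // = { v1, v1, v0, v0 }
}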
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )

util.c
View File

@@ -1069,7 +1069,7 @@ double target_to_diff(uint32_t* target)
#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK)
#endif
static bool send_line(curl_socket_t sock, char *s)
static bool send_line( struct stratum_ctx *sctx, char *s )
{
size_t sent = 0;
int len;
@@ -1077,24 +1077,35 @@ static bool send_line(curl_socket_t sock, char *s)
len = (int) strlen(s);
s[len++] = '\n';
while (len > 0) {
while ( len > 0 )
{
struct timeval timeout = {0, 0};
int n;
fd_set wd;
FD_ZERO(&wd);
FD_SET(sock, &wd);
if (select((int) (sock + 1), NULL, &wd, NULL, &timeout) < 1)
FD_ZERO( &wd );
FD_SET( sctx->sock, &wd );
if ( select( (int) ( sctx->sock + 1 ), NULL, &wd, NULL, &timeout ) < 1 )
return false;
n = send(sock, s + sent, len, 0);
if (n < 0) {
if (!socket_blocks())
return false;
n = 0;
}
#if LIBCURL_VERSION_NUM >= 0x071802
CURLcode rc = curl_easy_send(sctx->curl, s + sent, len, (size_t *)&n);
if ( rc != CURLE_OK )
{
if ( rc != CURLE_AGAIN )
#else
n = send(sctx->sock, s + sent, len, 0);
if ( n < 0 )
{
if ( !socket_blocks() )
#endif
return false;
n = 0;
}
sent += n;
len -= n;
}
}
return true;
}
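// Hedged sketch, not the committed code: curl_easy_send() reports the
// transferred byte count through a size_t*, so the (size_t *)&n cast
// above aliases a 4-byte int with an 8-byte size_t on LP64 targets.
// A size_t temporary avoids that:
static bool send_all( CURL *curl, const char *buf, size_t len )
{
   size_t done = 0;
   while ( done < len )
   {
      size_t n = 0;
      CURLcode rc = curl_easy_send( curl, buf + done, len - done, &n );
      if ( rc == CURLE_AGAIN )  continue;   // caller should select() first
      if ( rc != CURLE_OK )     return false;
      done += n;
   }
   return true;
}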
@@ -1107,7 +1118,7 @@ bool stratum_send_line(struct stratum_ctx *sctx, char *s)
applog(LOG_DEBUG, "> %s", s);
pthread_mutex_lock(&sctx->sock_lock);
ret = send_line(sctx->sock, s);
ret = send_line( sctx, s );
pthread_mutex_unlock(&sctx->sock_lock);
return ret;
@@ -1167,14 +1178,27 @@ char *stratum_recv_line(struct stratum_ctx *sctx)
ssize_t n;
memset(s, 0, RBUFSIZE);
n = recv(sctx->sock, s, RECVSIZE, 0);
#if LIBCURL_VERSION_NUM >= 0x071802
CURLcode rc = curl_easy_recv(sctx->curl, s, RECVSIZE, (size_t *)&n);
if (rc == CURLE_OK && !n) {
ret = false;
break;
}
if (rc != CURLE_OK) {
if (rc != CURLE_AGAIN || !socket_full(sctx->sock, 1)) {
#else
n = recv(sctx->sock, s, RECVSIZE, 0);
if (!n) {
ret = false;
break;
}
if (n < 0) {
if (!socket_blocks() || !socket_full(sctx->sock, 1)) {
ret = false;
#endif
ret = false;
break;
}
} else
@@ -1244,7 +1268,9 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
}
free(sctx->curl_url);
sctx->curl_url = (char*) malloc( strlen(url) + 1 );
sprintf(sctx->curl_url, "http%s", strstr(url, "://"));
sprintf( sctx->curl_url, "http%s", strstr( url, "s://" )
? strstr( url, "s://" )
: strstr (url, "://" ) );
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
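// Hedged sketch of the scheme rewrite above: matching "s://" first
// preserves the TLS 's', so "stratum+tcps://x" maps to "https://x"
// while "stratum+tcp://x" maps to "http://x". Helper name is
// hypothetical, for illustration only.
static char *to_curl_url( const char *url )
{
   const char *tail = strstr( url, "s://" ) ? strstr( url, "s://" )
                                            : strstr( url, "://" );
   if ( !tail ) return NULL;                   // caller validated the scheme
   char *out = malloc( strlen( tail ) + 5 );   // "http" + tail + NUL
   sprintf( out, "http%s", tail );
   return out;
}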
@@ -1254,7 +1280,9 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
if (opt_proxy) {
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
if (opt_proxy) {
curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
}
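// Hedged note: the two VERIFY options above disable certificate
// checking, so TLS stratum is encrypted but not authenticated. A
// stricter sketch keeps verification on and points libcurl at a CA
// bundle (the path is an assumption, typical on Debian-family systems):
static void enable_tls_verify( CURL *curl )
{
   curl_easy_setopt( curl, CURLOPT_SSL_VERIFYPEER, 1L );
   curl_easy_setopt( curl, CURLOPT_SSL_VERIFYHOST, 2L );
   curl_easy_setopt( curl, CURLOPT_CAINFO,
                     "/etc/ssl/certs/ca-certificates.crt" );
}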
@@ -1954,7 +1982,9 @@ static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params)
return false;
url = (char*) malloc(32 + strlen(host));
sprintf(url, "stratum+tcp://%s:%d", host, port);
strncpy( url, sctx->url, 15 );
sprintf( strstr( url, "://" ) + 3, "%s:%d", host, port );
if (!opt_redirect) {
applog(LOG_INFO, "Ignoring request to reconnect to %s", url);
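// Hedged sketch of the reconnect-URL rebuild above: strncpy copies at
// most 15 bytes of the old URL ("stratum+tcp://" is 14 chars,
// "stratum+tcps://" is 15) and leaves the buffer unterminated until
// sprintf appends the tail; copying the scheme by measured length stays
// within defined behavior. Helper name is hypothetical.
static char *rebuild_url( const char *old_url, const char *host, int port )
{
   const char *sep = strstr( old_url, "://" );
   size_t n = sep ? (size_t)( sep - old_url ) + 3 : 0;
   char *url = malloc( n + strlen( host ) + 16 );   // ":65535" + NUL fits
   memcpy( url, old_url, n );                       // scheme incl. "://"
   sprintf( url + n, "%s:%d", host, port );
   return url;
}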