Compare commits


3 Commits

Author     SHA1        Message  Date
Jay D Dee  b47cfaa720  v3.10.7  2019-12-28 15:00:29 -05:00
Jay D Dee  241bc26767  v3.10.6  2019-12-25 01:26:26 -05:00
Jay D Dee  c65b0ff7a6  v3.10.5  2019-12-21 13:19:29 -05:00
107 changed files with 9007 additions and 2311 deletions

View File

@@ -84,6 +84,7 @@ cpuminer_SOURCES = \
algo/cubehash/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/echo/sph_echo.c \
algo/echo/echo-hash-4way.c \
algo/echo/aes_ni/hash.c\
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \

View File

@@ -7,9 +7,11 @@ Security warning
----------------
Miner programs are often flagged as malware by antivirus programs. This is
-a false positive, they are flagged simply because they are cryptocurrency
-miners. The source code is open for anyone to inspect. If you don't trust
-the software, don't use it.
+usually a false positive, they are flagged simply because they are
+cryptocurrency miners. However, some malware has been spread using the
+cover that miners are known to be subject to false positives. Always be on
+alert. The source code of cpuminer-opt is open for anyone to inspect.
+If you don't trust the software, don't download it.
The cryptographic hashing code has been taken from trusted sources but has been
modified for speed at the expense of accepted security practices. This
@@ -33,6 +35,26 @@ not supported. FreeBSD YMMV.
Change Log
----------
v3.10.7

AVX512 for x25x, lbry, x13bcd (bcd).

v3.10.6

Added support for SSL stratum: stratum+tcps://
Added job id reporting again, but leaner, suppressed with --quiet.
AVX512 for x21s, x22i, lyra2z, allium.
Fixed share overflow warnings mining lbry with Ryzen (SHA).

v3.10.5

AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.
Faster hmq1725 AVX2.

v3.10.4

AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
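The SSL stratum support added in v3.10.6 is used by giving the pool URL the stratum+tcps:// scheme; a hypothetical invocation (pool host, port and credentials are placeholders):

   cpuminer -a x22i -o stratum+tcps://pool.example.com:4433 -u WALLET_ADDRESS -p x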

View File

@@ -317,6 +317,7 @@ const char* const algo_alias_map[][2] =
{ "argon2d-crds", "argon2d250" },
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },

View File

@@ -104,7 +104,7 @@ typedef struct {
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
-#define blake256_8way blake256_8way_update
+//#define blake256_8way blake256_8way_update
void blake256_8way_close(void *cc, void *dst);
// 14 rounds, blake, decred

View File

@@ -842,7 +842,8 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
}
static void
-blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len )
+blake32_4way( blake_4way_small_context *ctx, const void *data,
+              size_t len )
{
__m128i *buf = (__m128i*)ctx->buf;
size_t bptr = ctx->ptr<<2;
@@ -1237,7 +1238,7 @@ blake256_4way_init(void *ctx)
}
void
-blake256_4way(void *ctx, const void *data, size_t len)
+blake256_4way_update(void *ctx, const void *data, size_t len)
{
blake32_4way(ctx, data, len);
}

View File

@@ -463,6 +463,38 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
return 0;
}
// Update and final when inlen is a multiple of 64 bytes
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen )
{
__m256i *in = (__m256i*)input;
__m256i *buf = (__m256i*)S->buf;
while( inlen > BLAKE2S_BLOCKBYTES )
{
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
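      // the comparison is 1 exactly when t[0] just wrapped, carrying into
      // the high word of the 64-bit byte counter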
blake2s_8way_compress( S, buf );
S->buflen = 0;
in += ( BLAKE2S_BLOCKBYTES >> 2 );
}
// last block
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node ) S->f[1] = ~0U;
S->f[0] = ~0U;
blake2s_8way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m256i( out, i ) = S->h[ i ];
return 0;
}
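// Illustrative sketch (not from this commit): one-shot use when each
// lane's input is an exact non-zero multiple of the 64 byte block size.
//
//   blake2s_8way_state S;
//   blake2s_8way_init( &S, 32 );                       // 32 byte digests
//   blake2s_8way_full_blocks( &S, out, in_8x32, 64 );  // one block per lane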
#endif // __AVX2__

View File

@@ -14,7 +14,6 @@
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#include "simd-utils.h"
@@ -95,8 +94,8 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
uint64_t inlen );
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
-//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
-//                       const void *input, uint64_t inlen );
+int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
+                              const void *input, uint64_t inlen );
#endif
@@ -132,6 +131,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
}
#endif
-#endif // __SSE4_2__
+#endif // __SSE2__
#endif

View File

@@ -874,6 +874,57 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
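// The DH1L/DH1R/DH2L/DH2R macros below fold BMW's sixteen output
// expressions dH[0..15] into a common pattern; the fully expanded
// originals are preserved in the comment block that follows.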
#define DH1L( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
_mm256_srli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
#define DH1R( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
_mm256_slli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
#define DH2L( m, rl, sl, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
#define DH2R( m, rl, sr, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
#undef DH1L
#undef DH1R
#undef DH2L
#undef DH2R
/*
dH[ 0] = _mm256_add_epi32(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
@@ -954,6 +1005,7 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
_mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
_mm256_xor_si256( qt[22], qt[15] ) ) );
*/
}
static const __m256i final_s8[16] =

View File

@@ -41,7 +41,6 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 8; lane++ )
   if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
-// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
@@ -66,7 +65,7 @@ void bmw512hash_4way(void *state, const void *input)
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
-   bmw512_4way( &ctx, input, 80 );
+   bmw512_4way_update( &ctx, input, 80 );
bmw512_4way_close( &ctx, state );
}

algo/echo/echo-hash-4way.c  (new file, 559 lines)
View File

@@ -0,0 +1,559 @@
#if defined(__AVX512VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
#include "echo-hash-4way.h"
/*
#include <memory.h>
#include "miner.h"
#include "hash_api.h"
//#include "vperm.h"
#include <immintrin.h>
*/
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif
*/
// not used
/*
const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
*/
/*
MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
*/
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
// do these need to be reversed?
#define mul2mask \
m512_const4_32( 0x00001b00, 0, 0, 0 )
#define lsbmask m512_const1_32( 0x01010101 )
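// ECHO's BIG.SubWords step: two AES rounds per 128-bit word, keyed first
// with the running counter k1 and then with zero (no salt).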
#define ECHO_SUBBYTES( state, i, j ) \
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
k1 = _mm512_add_epi32( k1, m512_one_32 )
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
{ \
const int j1 = ( j+1 ) & 3; \
const int j2 = ( j+2 ) & 3; \
const int j3 = ( j+3 ) & 3; \
   s2 = _mm512_add_epi8( state1[ 0 ][ j ], state1[ 0 ][ j ] ); \
   t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
   t1 = _mm512_and_si512( t1, lsbmask ); \
   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
   s2 = _mm512_xor_si512( s2, t2 ); \
   state2[ 0 ][ j ] = s2; \
   state2[ 1 ][ j ] = state1[ 0 ][ j ]; \
   state2[ 2 ][ j ] = state1[ 0 ][ j ]; \
   state2[ 3 ][ j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] ); \
s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 );\
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 2 ][ j2 ] ); \
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 )
} while(0)
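// Illustrative note (not from this commit): each stanza in ECHO_MIXBYTES
// doubles a byte vector in GF(2^8). The byte-wise add gives x<<1, the
// shift-and-mask extracts each byte's former top bit, and the shuffle
// against mul2mask turns that bit into the 0x1b reduction constant. A
// scalar sketch of the per-byte operation (helper name hypothetical):
//
//   static inline unsigned char gf256_double( unsigned char x )
//   {
//      return (unsigned char)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0 ) );
//   }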
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
ECHO_SUBBYTES(_state, 2, 0);\
ECHO_SUBBYTES(_state, 3, 0);\
ECHO_SUBBYTES(_state, 0, 1);\
ECHO_SUBBYTES(_state, 1, 1);\
ECHO_SUBBYTES(_state, 2, 1);\
ECHO_SUBBYTES(_state, 3, 1);\
ECHO_SUBBYTES(_state, 0, 2);\
ECHO_SUBBYTES(_state, 1, 2);\
ECHO_SUBBYTES(_state, 2, 2);\
ECHO_SUBBYTES(_state, 3, 2);\
ECHO_SUBBYTES(_state, 0, 3);\
ECHO_SUBBYTES(_state, 1, 3);\
ECHO_SUBBYTES(_state, 2, 3);\
ECHO_SUBBYTES(_state, 3, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES(_state2, 0, 0);\
ECHO_SUBBYTES(_state2, 1, 0);\
ECHO_SUBBYTES(_state2, 2, 0);\
ECHO_SUBBYTES(_state2, 3, 0);\
ECHO_SUBBYTES(_state2, 0, 1);\
ECHO_SUBBYTES(_state2, 1, 1);\
ECHO_SUBBYTES(_state2, 2, 1);\
ECHO_SUBBYTES(_state2, 3, 1);\
ECHO_SUBBYTES(_state2, 0, 2);\
ECHO_SUBBYTES(_state2, 1, 2);\
ECHO_SUBBYTES(_state2, 2, 2);\
ECHO_SUBBYTES(_state2, 3, 2);\
ECHO_SUBBYTES(_state2, 0, 3);\
ECHO_SUBBYTES(_state2, 1, 3);\
ECHO_SUBBYTES(_state2, 2, 3);\
ECHO_SUBBYTES(_state2, 3, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
#define SAVESTATE(dst, src)\
dst[0][0] = src[0][0];\
dst[0][1] = src[0][1];\
dst[0][2] = src[0][2];\
dst[0][3] = src[0][3];\
dst[1][0] = src[1][0];\
dst[1][1] = src[1][1];\
dst[1][2] = src[1][2];\
dst[1][3] = src[1][3];\
dst[2][0] = src[2][0];\
dst[2][1] = src[2][1];\
dst[2][2] = src[2][2];\
dst[2][3] = src[2][3];\
dst[3][0] = src[3][0];\
dst[3][1] = src[3][1];\
dst[3][2] = src[3][2];\
dst[3][3] = src[3][3]
void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg,
unsigned int uBlockCount )
{
unsigned int r, b, i, j;
__m512i t1, t2, s2, k1;
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
// unroll
for ( i = 0; i < 4; i++ )
for ( j = 0; j < ctx->uHashSize / 256; j++ )
_state[ i ][ j ] = ctx->state[ i ][ j ];
for ( b = 0; b < uBlockCount; b++ )
{
ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 );
// load message, make aligned, remove loadu
for( j = ctx->uHashSize / 256; j < 4; j++ )
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ j ] = _mm512_loadu_si512(
(__m512i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
}
}
// save state
SAVESTATE( _statebackup, _state );
k1 = ctx->k;
for ( r = 0; r < ctx->uRounds / 2; r++ )
{
ECHO_ROUND_UNROLL2;
}
if ( ctx->uHashSize == 256 )
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 1 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 2 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 3 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 0 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 1 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 2 ] ) ;
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 3 ] );
}
}
else
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 2 ] );
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
_state[ i ][ 3 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 0 ] );
         _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 2 ] );
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
_statebackup[ i ][ 1 ] );
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
_statebackup[ i ][ 3 ] );
}
}
pmsg += ctx->uBlockLength;
}
SAVESTATE(ctx->state, _state);
}
int echo_4way_init( echo_4way_context *ctx, int nHashSize )
{
int i, j;
ctx->k = m512_zero;
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
switch( nHashSize )
{
case 256:
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
         ctx->hashsize = m512_const4_32( 0, 0, 0, 0x100 );
         ctx->const1536 = m512_const4_32( 0, 0, 0, 0x600 );
break;
case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
         ctx->hashsize = m512_const4_32( 0, 0, 0, 0x200 );
         ctx->const1536 = m512_const4_32( 0, 0, 0, 0x400 );
break;
default:
return BAD_HASHBITLEN;
}
for( i = 0; i < 4; i++ )
for( j = 0; j < nHashSize / 256; j++ )
ctx->state[ i ][ j ] = ctx->hashsize;
for( i = 0; i < 4; i++ )
for( j = nHashSize / 256; j < 4; j++ )
ctx->state[ i ][ j ] = m512_zero;
return SUCCESS;
}
int echo_4way_update( echo_4way_context *state, const BitSequence *data, DataLength databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
uByteLength = (unsigned int)(databitlen / 8);
if ( ( state->uBufferBytes + uByteLength ) >= state->uBlockLength )
{
if ( state->uBufferBytes != 0 )
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
// Process buffer
echo_4way_compress( state, state->buffer, 1 );
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
uByteLength -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = uByteLength / state->uBlockLength;
uRemainingBytes = uByteLength % state->uBlockLength;
if ( uBlockCount > 0 )
{
echo_4way_compress( state, data, uBlockCount );
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if ( uRemainingBytes > 0 )
{
memcpy( state->buffer, (void*)data, uRemainingBytes );
}
state->uBufferBytes = uRemainingBytes;
}
else
{
memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
state->uBufferBytes += uByteLength;
}
return 0;
}
int echo_4way_close( echo_4way_context *state, BitSequence *hashval )
{
__m512i remainingbits;
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
// Pad with 0x80
state->buffer[ state->uBufferBytes++ ] = 0x80;
// Enough buffer space for padding in this block?
if ( ( state->uBlockLength - state->uBufferBytes ) >= 18)
{
// Pad with zeros
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - ( state->uBufferBytes + 18 ) );
// Hash size
*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
= state->uHashSize;
// Processed bits
*( ( DataLength*)( state->buffer + state->uBlockLength - 16 ) )
= state->processed_bits;
*( ( DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
// Last block contains message bits?
if ( state->uBufferBytes == 1 )
{
state->k = _mm512_xor_si512( state->k, state->k );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
else
{
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
// Compress
echo_4way_compress( state, state->buffer, 1 );
}
else
{
// Fill with zero and compress
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - state->uBufferBytes );
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1 );
// Last block
memset( state->buffer, 0, state->uBlockLength - 18 );
// Hash size
*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
= state->uHashSize;
// Processed bits
*( (DataLength*)( state->buffer + state->uBlockLength - 16 ) )
= state->processed_bits;
*( (DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
// Compress the last block
state->k = _mm512_xor_si512(state->k, state->k);
state->k = _mm512_sub_epi64(state->k, state->const1536);
echo_4way_compress(state, state->buffer, 1);
}
// Store the hash value
_mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0][ 0 ]);
_mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1][ 0 ]);
if ( state->uHashSize == 512 )
{
_mm512_storeu_si512((__m512i*)hashval + 2, state->state[ 2 ][ 0 ]);
_mm512_storeu_si512((__m512i*)hashval + 3, state->state[ 3 ][ 0 ]);
}
return 0;
}
int echo_4way_update_close( echo_4way_context *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
uByteLength = (unsigned int)(databitlen / 8);
if ( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
{
if ( state->uBufferBytes != 0 )
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
// Process buffer
echo_4way_compress( state, state->buffer, 1 );
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
uByteLength -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = uByteLength / state->uBlockLength;
uRemainingBytes = uByteLength % state->uBlockLength;
if ( uBlockCount > 0 )
{
echo_4way_compress( state, data, uBlockCount );
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if ( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
else
{
memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
state->uBufferBytes += uByteLength;
}
__m512i remainingbits;
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
// Pad with 0x80
state->buffer[ state->uBufferBytes++ ] = 0x80;
// Enough buffer space for padding in this block?
if ( (state->uBlockLength - state->uBufferBytes) >= 18 )
{
// Pad with zeros
      memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - (state->uBufferBytes + 18) );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) )
= state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Last block contains message bits?
if( state->uBufferBytes == 1 )
{
state->k = _mm512_xor_si512( state->k, state->k );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
else
{
         state->k = _mm512_add_epi64( state->k, remainingbits );
         state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
// Compress
echo_4way_compress( state, state->buffer, 1 );
}
else
{
// Fill with zero and compress
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - state->uBufferBytes );
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1 );
// Last block
memset( state->buffer, 0, state->uBlockLength - 18 );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Compress the last block
state->k = _mm512_xor_si512( state->k, state->k );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1) ;
}
// Store the hash value
_mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
_mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );
if ( state->uHashSize == 512 )
{
_mm512_storeu_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
_mm512_storeu_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
}
return 0;
}
#endif
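A minimal usage sketch of the new 4-way interface (buffer names hypothetical; the update length is given in bits and the four 128-bit lanes are interleaved):

   echo_4way_context ctx;
   echo_4way_init( &ctx, 512 );              // ECHO-512
   echo_4way_update( &ctx, vdata, 80 * 8 );  // 80 bytes per lane, as bits
   echo_4way_close( &ctx, vhash );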

View File

@@ -0,0 +1,36 @@
#if !defined(ECHO_HASH_4WAY_H__)
#define ECHO_HASH_4WAY_H__ 1
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
typedef struct
{
__m512i state[4][4];
__m512i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes
__m512i k;
__m512i hashsize;
__m512i const1536;
unsigned int uRounds;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
unsigned int processed_bits;
} echo_4way_context __attribute__ ((aligned (64)));
int echo_4way_init( echo_4way_context *state, int hashbitlen );
int echo_4way_update( echo_4way_context *state, const void *data,
unsigned int databitlen);
int echo_4way_close( echo_4way_context *state, void *hashval );
int echo_4way_update_close( echo_4way_context *state, void *hashval,
const void *data, int databitlen );
#endif
#endif

View File

@@ -9,6 +9,7 @@
//#ifndef NO_AES_NI
// Not to be confused with AVX512VAES
#define VAES
// #define VAVX
// #define VVPERM

View File

@@ -45,7 +45,7 @@ void myriad_4way_hash( void *output, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
-   sha256_4way( &ctx.sha, vhash, 64 );
+   sha256_4way_update( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, output );
}

View File

@@ -1171,7 +1171,8 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->h[7] = m256_const1_64( 0x6769756d2042656c );
}
-void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
+void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
+                           size_t len )
{
__m256i *vdata = (__m256i*)data;

View File

@@ -62,7 +62,7 @@ typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
size_t len );
-#define hamsi512_4way hamsi512_4way_update
+//#define hamsi512_4way hamsi512_4way_update
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

View File

@@ -38,7 +38,7 @@
#define SPH_XCAT_(a, b) a ## b
static void
-SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
( haval_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;

View File

@@ -0,0 +1,115 @@
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
/*
* Helper code, included (three times !) by HAVAL implementation.
*
* TODO: try to merge this with md_helper.c.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update)
( haval_8way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
unsigned current;
current = (unsigned)sc->count_low & 127U;
while ( len > 0 )
{
unsigned clen;
uint32_t clow, clow2;
clen = 128U - current;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (current>>2), vdata, clen>>2 );
vdata += clen>>2;
current += clen;
len -= clen;
if ( current == 128U )
{
DSTATE_8W;
IN_PREPARE_8W(sc->buf);
RSTATE_8W;
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
WSTATE_8W;
current = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high ++;
}
}
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
void *dst)
{
unsigned current;
DSTATE_8W;
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = m256_one_32;
current += 4;
RSTATE_8W;
if ( current > 116UL )
{
memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
do
{
IN_PREPARE_8W(sc->buf);
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
} while (0);
current = 0;
}
uint32_t t1, t2;
memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
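   // e.g. HAVAL-256/5: t1 = 0x01 | (5 << 3) = 0x29, t2 = 8 << 3 = 0x40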
sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{
IN_PREPARE_8W(sc->buf);
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
} while (0);
WSTATE_8W;
haval_8way_out( sc, dst );
}

View File

@@ -40,7 +40,7 @@
#include <string.h>
#include "haval-hash-4way.h"
-// won't compile with sse4.2
+// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way.
//#if defined (__SSE4_2__)
#if defined(__AVX__)
@@ -479,9 +479,9 @@ haval ## xxx ## _ ## y ## _4way_init(void *cc) \
} \
\
void \
-haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
+haval ## xxx ## _ ## y ## _4way_update (void *cc, const void *data, size_t len) \
{ \
-   haval ## y ## _4way(cc, data, len); \
+   haval ## y ## _4way_update(cc, data, len); \
} \
} \
\
void \
@@ -518,6 +518,301 @@ do { \
#define INMSG(i) msg[i]
#if defined(__AVX2__)
// Haval-256 8 way 32 bit avx2
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( x0, \
_mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
_mm256_and_si256( x3, x6 ) ) ) ) \
#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x2, \
_mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
_mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
_mm256_xor_si256( x6, x0 ) ) ) ), \
_mm256_xor_si256( \
_mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
_mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_xor_si256( x6, x0 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), x0 ) )
#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_or_si256( x4, x6 ) ), x5 ) ), \
_mm256_and_si256( x4, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
_mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
_mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x0, \
mm256_not( _mm256_xor_si256( \
_mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), \
_mm256_and_si256( x3, x6 ) ) )
#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x1, x0, x3, x5, x6, x2, x4)
#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x4, x2, x1, x0, x5, x3, x6)
#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x6, x1, x2, x3, x4, x5, x0)
#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x2, x6, x1, x4, x5, x3, x0)
#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x3, x5, x2, x0, x1, x6, x4)
#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x1, x4, x3, x6, x0, x2, x5)
#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \
F4_8W(x6, x4, x0, x5, x2, x1, x3)
#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x3, x4, x1, x0, x5, x2, x6)
#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x6, x2, x1, x0, x3, x4, x5)
#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x2, x6, x0, x4, x3, x1, x5)
#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \
F4_8W(x1, x5, x3, x2, x0, x4, x6)
#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \
F5_8W(x2, x5, x0, x6, x4, x3, x1)
#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
mm256_ror_32( x7, 11 ) ), \
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
} while (0)
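// Illustrative note (not from this commit): STEP_8W applies HAVAL's word
// update in all eight AVX2 lanes at once; per lane it computes
//
//   x7' = ror32( f, 7 ) + ror32( x7, 11 ) + w + c
//
// where f = FPn_p(x6..x0), w is the message word, c the round constant,
// and ror32( v, n ) = ( v >> n ) | ( v << ( 32 - n ) ).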
#define PASS1_8W(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
} \
} while (0)
#define PASSG_8W(p, n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
in(MP ## p[pass_count + 0]), \
RK ## p[pass_count + 0]); \
STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
in(MP ## p[pass_count + 1]), \
RK ## p[pass_count + 1]); \
STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
in(MP ## p[pass_count + 2]), \
RK ## p[pass_count + 2]); \
STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
in(MP ## p[pass_count + 3]), \
RK ## p[pass_count + 3]); \
STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
in(MP ## p[pass_count + 4]), \
RK ## p[pass_count + 4]); \
STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
in(MP ## p[pass_count + 5]), \
RK ## p[pass_count + 5]); \
STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
in(MP ## p[pass_count + 6]), \
RK ## p[pass_count + 6]); \
STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
in(MP ## p[pass_count + 7]), \
RK ## p[pass_count + 7]); \
} \
} while (0)
#define PASS2_8W(n, in) PASSG_8W(2, n, in)
#define PASS3_8W(n, in) PASSG_8W(3, n, in)
#define PASS4_8W(n, in) PASSG_8W(4, n, in)
#define PASS5_8W(n, in) PASSG_8W(5, n, in)
#define SAVE_STATE_8W \
__m256i u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
u0 = s0; \
u1 = s1; \
u2 = s2; \
u3 = s3; \
u4 = s4; \
u5 = s5; \
u6 = s6; \
u7 = s7; \
} while (0)
#define UPDATE_STATE_8W \
do { \
s0 = _mm256_add_epi32( s0, u0 ); \
s1 = _mm256_add_epi32( s1, u1 ); \
s2 = _mm256_add_epi32( s2, u2 ); \
s3 = _mm256_add_epi32( s3, u3 ); \
s4 = _mm256_add_epi32( s4, u4 ); \
s5 = _mm256_add_epi32( s5, u5 ); \
s6 = _mm256_add_epi32( s6, u6 ); \
s7 = _mm256_add_epi32( s7, u7 ); \
} while (0)
#define CORE_8W5(in) do { \
SAVE_STATE_8W; \
PASS1_8W(5, in); \
PASS2_8W(5, in); \
PASS3_8W(5, in); \
PASS4_8W(5, in); \
PASS5_8W(5, in); \
UPDATE_STATE_8W; \
} while (0)
#define DSTATE_8W __m256i s0, s1, s2, s3, s4, s5, s6, s7
#define RSTATE_8W \
do { \
s0 = sc->s0; \
s1 = sc->s1; \
s2 = sc->s2; \
s3 = sc->s3; \
s4 = sc->s4; \
s5 = sc->s5; \
s6 = sc->s6; \
s7 = sc->s7; \
} while (0)
#define WSTATE_8W \
do { \
sc->s0 = s0; \
sc->s1 = s1; \
sc->s2 = s2; \
sc->s3 = s3; \
sc->s4 = s4; \
sc->s5 = s5; \
sc->s6 = s6; \
sc->s7 = s7; \
} while (0)
static void
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = m256_const1_32( 0x243F6A88UL );
sc->s1 = m256_const1_32( 0x85A308D3UL );
sc->s2 = m256_const1_32( 0x13198A2EUL );
sc->s3 = m256_const1_32( 0x03707344UL );
sc->s4 = m256_const1_32( 0xA4093822UL );
sc->s5 = m256_const1_32( 0x299F31D0UL );
sc->s6 = m256_const1_32( 0x082EFA98UL );
sc->s7 = m256_const1_32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;
sc->count_low = 0;
}
#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata)
#define INW_8W(i) load_ptr_8w[ i ]
static void
haval_8way_out( haval_8way_context *sc, void *dst )
{
__m256i *buf = (__m256i*)dst;
DSTATE_8W;
RSTATE_8W;
buf[0] = s0;
buf[1] = s1;
buf[2] = s2;
buf[3] = s3;
buf[4] = s4;
buf[5] = s5;
buf[6] = s6;
buf[7] = s7;
}
#undef PASSES
#define PASSES 5
#include "haval-8way-helper.c"
#define API_8W(xxx, y) \
void \
haval ## xxx ## _ ## y ## _8way_init(void *cc) \
{ \
haval_8way_init(cc, xxx >> 5, y); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \
{ \
haval ## y ## _8way_update(cc, data, len); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \
{ \
haval ## y ## _8way_close(cc, dst); \
} \
API_8W(256, 5)
#define RVAL_8W \
do { \
s0 = val[0]; \
s1 = val[1]; \
s2 = val[2]; \
s3 = val[3]; \
s4 = val[4]; \
s5 = val[5]; \
s6 = val[6]; \
s7 = val[7]; \
} while (0)
#define WVAL_8W \
do { \
val[0] = s0; \
val[1] = s1; \
val[2] = s2; \
val[3] = s3; \
val[4] = s4; \
val[5] = s5; \
val[6] = s6; \
val[7] = s7; \
} while (0)
#define INMSG_8W(i) msg[i]
#endif // AVX2
#ifdef __cplusplus
}
#endif

View File

@@ -59,7 +59,7 @@
*/
#ifndef HAVAL_HASH_4WAY_H__
-#define HAVAL_HASH_4WAY_H__
+#define HAVAL_HASH_4WAY_H__ 1
#if defined(__AVX__)
@@ -84,10 +84,30 @@ typedef haval_4way_context haval256_5_4way_context;
void haval256_5_4way_init( void *cc );
-void haval256_5_4way( void *cc, const void *data, size_t len );
+void haval256_5_4way_update( void *cc, const void *data, size_t len );
+//#define haval256_5_4way haval256_5_4way_update
void haval256_5_4way_close( void *cc, void *dst );
#if defined(__AVX2__)
typedef struct {
__m256i buf[32];
__m256i s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
uint32_t count_high, count_low;
} haval_8way_context __attribute__ ((aligned (64)));
typedef haval_8way_context haval256_5_8way_context;
void haval256_5_8way_init( void *cc );
void haval256_5_8way_update( void *cc, const void *data, size_t len );
void haval256_5_8way_close( void *cc, void *dst );
#endif // AVX2
#ifdef __cplusplus
}
#endif

View File

@@ -103,14 +103,12 @@ typedef jh_4way_context jh512_4way_context;
void jh256_4way_init( jh_4way_context *sc);
void jh256_4way_update(void *cc, const void *data, size_t len);
-#define jh256_4way jh256_4way_update
void jh256_4way_close(void *cc, void *dst);
void jh512_4way_init( jh_4way_context *sc );
void jh512_4way_update(void *cc, const void *data, size_t len);
-#define jh512_4way jh512_4way_update
void jh512_4way_close(void *cc, void *dst);

View File

@@ -33,7 +33,7 @@ void jha_hash_4way( void *out, const void *input )
keccak512_4way_context ctx_keccak;
keccak512_4way_init( &ctx_keccak );
-   keccak512_4way( &ctx_keccak, input, 80 );
+   keccak512_4way_update( &ctx_keccak, input, 80 );
keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
@@ -58,7 +58,7 @@ void jha_hash_4way( void *out, const void *input )
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx_skein );
-   skein512_4way( &ctx_skein, vhash, 64 );
+   skein512_4way_update( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhashB );
for ( int i = 0; i < 8; i++ )
@@ -69,7 +69,7 @@ void jha_hash_4way( void *out, const void *input )
blake512_4way_close( &ctx_blake, vhashA );
jh512_4way_init( &ctx_jh );
-   jh512_4way( &ctx_jh, vhash, 64 );
+   jh512_4way_update( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhashB );
for ( int i = 0; i < 8; i++ )

View File

@@ -99,14 +99,12 @@ typedef keccak64_ctx_m256i keccak512_4way_context;
void keccak256_4way_init(void *cc);
void keccak256_4way_update(void *cc, const void *data, size_t len);
void keccak256_4way_close(void *cc, void *dst);
-#define keccak256_4way keccak256_4way_update
void keccak512_4way_init(void *cc);
void keccak512_4way_update(void *cc, const void *data, size_t len);
void keccak512_4way_close(void *cc, void *dst);
void keccak512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
-#define keccak512_4way keccak512_4way_update
#endif

View File

@@ -1,15 +1,178 @@
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
-#if defined (ALLIUM_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"
+#if defined (ALLIUM_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
hashState_groestl256 groestl;
} allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_8way_init( &allium_8way_ctx.keccak );
cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &allium_8way_ctx.skein );
init_groestl256( &allium_8way_ctx.groestl, 32 );
return true;
}
void allium_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
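   // Lyra2 runs two lanes per call: the 256-bit hashes are paired,
   // interleaved 2x256, run through LYRA2RE_2WAY, then de-interleaved.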
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
}
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 8;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
allium_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (ALLIUM_4WAY)
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
@@ -41,11 +204,11 @@ void allium_4way_hash( void *state, const void *input )
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
-   blake256_4way( &ctx.blake, input + (64<<2), 16 );
+   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
rintrlv_4x32_4x64( vhash64, vhash32, 256 );
-   keccak256_4way( &ctx.keccak, vhash64, 32 );
+   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -70,7 +233,7 @@ void allium_4way_hash( void *state, const void *input )
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
-   skein256_4way( &ctx.skein, vhash64, 32 );
+   skein256_4way_update( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

View File

@@ -78,8 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2rev3;
gate->hash = (void*)&lyra2rev3_hash;
#endif
-// gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
-   gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
+   gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
opt_target_factor = 256.0;
return true;
@@ -95,10 +94,14 @@ bool lyra2rev2_thread_init()
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
+#if defined (LYRA2REV2_8WAY)
+   l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way
+   init_lyra2rev2_8way_ctx();
+#elif defined (LYRA2REV2_4WAY)
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
-#if defined (LYRA2REV2_4WAY)
   init_lyra2rev2_4way_ctx();
#else
+   l2v2_wholeMatrix = _mm_malloc( size, 64 );
   init_lyra2rev2_ctx();
#endif
return l2v2_wholeMatrix;
@@ -106,14 +109,17 @@ bool lyra2rev2_thread_init()
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
-#if defined (LYRA2REV2_4WAY)
+#if defined (LYRA2REV2_8WAY)
+   gate->scanhash = (void*)&scanhash_lyra2rev2_8way;
+   gate->hash = (void*)&lyra2rev2_8way_hash;
+#elif defined (LYRA2REV2_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
-   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
opt_target_factor = 256.0;
return true;
@@ -123,7 +129,11 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
bool register_lyra2z_algo( algo_gate_t* gate )
{
-#if defined(LYRA2Z_8WAY)
+#if defined(LYRA2Z_16WAY)
+   gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
+   gate->scanhash = (void*)&scanhash_lyra2z_16way;
+   gate->hash = (void*)&lyra2z_16way_hash;
+#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
@@ -136,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
-   gate->optimizations = SSE42_OPT | AVX2_OPT;
+   gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};
@@ -164,7 +174,11 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
-#if defined (ALLIUM_4WAY)
+#if defined (ALLIUM_8WAY)
+   gate->miner_thread_init = (void*)&init_allium_8way_ctx;
+   gate->scanhash = (void*)&scanhash_allium_8way;
+   gate->hash = (void*)&allium_8way_hash;
+#elif defined (ALLIUM_4WAY)
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
@@ -173,7 +187,7 @@ bool register_allium_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
-   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};

View File

@@ -5,12 +5,10 @@
#include <stdint.h>
#include "lyra2.h"
/*
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
*/
#if defined(__AVX2__)
#define LYRA2REV3_8WAY 1
#elif defined(__SSE2__)
#define LYRA2REV3_4WAY 1
@@ -52,15 +50,24 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
-#if defined(__AVX2__)
-  #define LYRA2REV2_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define LYRA2REV2_8WAY 1
+#elif defined(__AVX2__)
+  #define LYRA2REV2_4WAY 1
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
-#if defined(LYRA2REV2_4WAY)
+#if defined(LYRA2REV2_8WAY)
+void lyra2rev2_8way_hash( void *state, const void *input );
+int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr );
+bool init_lyra2rev2_8way_ctx();
+#elif defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -78,17 +85,25 @@ bool init_lyra2rev2_ctx();
/////////////////////////
-#if defined(__SSE2__)
-  #define LYRA2Z_4WAY
-#endif
-#if defined(__AVX2__)
-  #define LYRA2Z_8WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define LYRA2Z_16WAY 1
+#elif defined(__AVX2__)
+  #define LYRA2Z_8WAY 1
+#elif defined(__SSE2__)
+  #define LYRA2Z_4WAY 1
#endif
#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8
-#if defined(LYRA2Z_8WAY)
+#if defined(LYRA2Z_16WAY)
+void lyra2z_16way_hash( void *state, const void *input );
+int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+bool lyra2z_16way_thread_init();
+#elif defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
@@ -137,13 +152,22 @@ bool lyra2h_thread_init();
//////////////////////////////////
-#if defined(__AVX2__) && defined(__AES__)
-  #define ALLIUM_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define ALLIUM_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define ALLIUM_4WAY 1
#endif
bool register_allium_algo( algo_gate_t* gate );
-#if defined(ALLIUM_4WAY)
+#if defined(ALLIUM_8WAY)
+void allium_8way_hash( void *state, const void *input );
+int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+bool init_allium_8way_ctx();
+#elif defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,

File diff suppressed because it is too large.

View File

@@ -327,7 +327,6 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

View File

@@ -62,12 +62,17 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
//int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
// uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
#endif

View File

@@ -20,7 +20,7 @@ static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
-   blake256_4way( &l2h_4way_blake_mid, input, 64 );
+   blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )

View File

@@ -1,13 +1,150 @@
#include "lyra2-gate.h"
#include <memory.h>
-#if defined (LYRA2REV2_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
+#if defined (LYRA2REV2_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
bmw256_8way_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
bool init_lyra2rev2_8way_ctx()
{
keccak256_8way_init( &l2v2_8way_ctx.keccak );
cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_8way_ctx.skein );
bmw256_8way_init( &l2v2_8way_ctx.bmw );
return true;
}
void lyra2rev2_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
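The function above feeds the AVX512 Lyra2 core in lane pairs: interleave two 256-bit hashes, run one 2-way pass, de-interleave. A hypothetical helper (not in the source) that condenses the four repeated stanzas:
// Hypothetical helper showing the lane-pairing pattern used above.
static inline void l2v2_pair( uint64_t *matrix, uint32_t *ha, uint32_t *hb )
{
   uint32_t vh[2*8] __attribute__ ((aligned (64)));    // 2 lanes x 256 bits
   intrlv_2x256( vh, ha, hb, 256 );                    // pack the pair
   LYRA2REV2_2WAY( matrix, vh, 32, vh, 32, 1, 4, 4 );  // one 2-way pass
   dintrlv_2x256( ha, hb, vh, 256 );                   // unpack the pair
}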
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &l2v2_8way_ctx.blake );
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
lyra2rev2_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (LYRA2REV2_4WAY)
typedef struct {
blake256_4way_context blake;
@@ -39,12 +176,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way_update( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -64,7 +201,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way_update( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -80,7 +217,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
@@ -105,7 +242,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
do
{

View File

@@ -28,21 +28,21 @@ void lyra2rev3_16way_hash( void *state, const void *input )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
@@ -53,54 +53,30 @@ void lyra2rev3_16way_hash( void *state, const void *input )
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
//printf("Lyra1 lane 0\n");
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
//printf("cube\n");
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
@@ -118,46 +94,37 @@ printf("\n");
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
//printf("Lyra2...\n");
/*
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
//printf("bmw\n");
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
//printf("done\n");
}
@@ -166,12 +133,13 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<4];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
@@ -181,7 +149,7 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
do
{
@@ -204,7 +172,7 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
}
}
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
@@ -241,7 +209,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64*8), 16 );
blake256_8way_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -284,7 +252,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
@@ -309,7 +277,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
do
{
@@ -366,8 +334,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
blake256_4way_update( &ctx.blake, input + (64*4), 16 );
blake256_4way_close( &ctx.blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -390,7 +357,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
@@ -415,7 +382,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
do
{

View File

@@ -1,13 +1,240 @@
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
#if defined(LYRA2Z_16WAY)
__thread uint64_t* lyra2z_16way_matrix;
bool lyra2z_16way_thread_init()
{
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_16way_context l2z_16way_blake_mid;
void lyra2z_16way_midstate( const void* input )
{
blake256_16way_init( &l2z_16way_blake_mid );
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
}
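Only the last 16 of the 80 header bytes change with the nonce, so the Blake-256 work over the first 64 bytes is done once per scan and the saved context is cloned each iteration, as lyra2z_16way_hash below does:
// Per-nonce use of the midstate (pattern excerpted from below, not new code):
// clone the saved context, then hash only the 16 nonce-bearing tail bytes.
blake256_16way_context ctx;
memcpy( &ctx, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
blake256_16way_update( &ctx, input + (64*16), 16 );   // tail, 16 lanes wide
blake256_16way_close( &ctx, vhash );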
void lyra2z_16way_hash( void *state, const void *input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
blake256_16way_close( &ctx_blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
memcpy( state+256, hash8, 32 );
memcpy( state+288, hash9, 32 );
memcpy( state+320, hash10, 32 );
memcpy( state+352, hash11, 32 );
memcpy( state+384, hash12, 32 );
memcpy( state+416, hash13, 32 );
memcpy( state+448, hash14, 32 );
memcpy( state+480, hash15, 32 );
}
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
lyra2z_16way_midstate( vdata );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n ) );
lyra2z_16way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 16; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(LYRA2Z_8WAY)
__thread uint64_t* lyra2z_8way_matrix;
bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way_update( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(LYRA2Z_4WAY)
__thread uint64_t* lyra2z_4way_matrix;
bool lyra2z_4way_thread_init()
@@ -20,7 +247,7 @@ static __thread blake256_4way_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
blake256_4way_init( &l2z_4way_blake_mid );
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
}
void lyra2z_4way_hash( void *state, const void *input )
@@ -33,7 +260,7 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -85,100 +312,3 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
#endif

View File

@@ -31,38 +31,35 @@
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{
const int len_m256i = len / 32;
const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
__m512i* state = (__m512i*)State;
__m512i* out = (__m512i*)Out;
int i;
//printf("squeeze 1, len= %d, full %d\n", len,fullBlocks);
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
//printf("squeeze 1, %d\n",i);
memcpy_512( out, state, BLOCK_LEN_M256I*2 );
//printf("squeeze 2\n");
memcpy_512( out, state, BLOCK_LEN_M256I );
LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
//printf("squeeze 2\n");
out += BLOCK_LEN_M256I;
}
//Squeezes remaining bytes
memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I );
}
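A worked example of the length bookkeeping, assuming BLOCK_LEN_M256I is 3 (one 12-word Lyra2 block spans three 256-bit vectors) and the usual 32-byte digest per lane; both values are assumptions taken from the surrounding Lyra2 code, not stated in this hunk:
// len = 32 bytes per lane  =>  len_m256i = 32/32 = 1
// fullBlocks = 1/3 = 0     =>  the full-block loop is skipped entirely
// tail: memcpy_512( out, state, 1 % 3 ) copies one __m512i,
//       i.e. 32 bytes from each of the two interleaved lanes.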
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
const uint64_t *In1 )
{
register __m512i state0, state1, state2, state3;
__m512i in[3];
casti_m256i( in, 0 ) = casti_m256i( In0, 0 );
casti_m256i( in, 1 ) = casti_m256i( In1, 1 );
casti_m256i( in, 2 ) = casti_m256i( In0, 2 );
casti_m256i( in, 3 ) = casti_m256i( In1, 3 );
casti_m256i( in, 4 ) = casti_m256i( In0, 4 );
casti_m256i( in, 5 ) = casti_m256i( In1, 5 );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
state2 = _mm512_load_si512( (__m512i*)State + 2 );
@@ -136,7 +133,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
{
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );
out[0] = state0;
out[1] = state1;
out[2] = state2;
@@ -195,7 +192,6 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
{
int i;
register __m512i state0, state1, state2, state3;
__m512i* in = (__m512i*)rowIn;
__m512i* inout = (__m512i*)rowInOut;
@@ -228,11 +224,12 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
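// Why 0x03 became 0x0303: _mm512_mask_blend_epi32 takes one mask bit
// per 32-bit element, sixteen per 512-bit vector. This 2-way code packs
// two independent 256-bit lanes side by side, so the AVX2 single-lane
// selector 0x03 (elements 0-1, the low 64 bits) must be repeated for
// the upper lane: bits 0-1 and 8-9, i.e. 0x0303. Compare the AVX2
// original further down this page:
//    inout[0] = _mm256_xor_si256( inout[0],
//               _mm256_blend_epi32( t0, t2, 0x03 ) );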
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
@@ -249,21 +246,18 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
// big ugly workaround for pointer aliasing, use a union of pointers.
// Access matrix using m512i for in and out, m256i for inout.
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols)
{
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)rowIn;
__m256i *inout0 = (__m256i*)rowInOut0;
__m256i *inout1 = (__m256i*)rowInOut1;
__m512i *out = (__m512i*)rowOut;
__m512i io[3];
povly inout;
inout.v512 = &io[0];
__m512i t0, t1, t2;
@@ -286,23 +280,10 @@ inline void reducedDuplexRow_2way( uint64_t *State, povly matrix,
_mm_prefetch( inout0 + 6, _MM_HINT_T0 );
_mm_prefetch( inout1 + 6, _MM_HINT_T0 );
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
inout.v256[0] = inout0[0];
inout.v256[1] = inout1[1];
@@ -311,21 +292,6 @@ printf("RDRV0 IO1 %016lx %016lx %016lx %016lx\n",io1[12],io1[13],io1[14],io1[153
inout.v256[4] = inout0[4];
inout.v256[5] = inout1[5];
state0 = _mm512_xor_si512( state0,
_mm512_add_epi64( in[0], inout.v512[0] ) );
state1 = _mm512_xor_si512( state1,
@@ -333,76 +299,40 @@ printf("RDRV1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
state2 = _mm512_xor_si512( state2,
_mm512_add_epi64( in[2], inout.v512[2] ) );
//printf("RDR: round\n");
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
//printf("RDR 3\n");
//M[rowOut][col] = M[rowOut][col] XOR rand
out[0] = _mm512_xor_si512( out[0], state0 );
out[1] = _mm512_xor_si512( out[1], state1 );
out[2] = _mm512_xor_si512( out[2], state2 );
// if inout is the same row as out it was just overwritten, reload.
if ( rowOut == rowInOut0 )
{
inout.v256[0] = inout0[0];
inout.v256[2] = inout0[2];
inout.v256[4] = inout0[4];
}
if ( rowOut == rowInOut1 )
{
inout.v256[1] = inout1[1];
inout.v256[3] = inout1[3];
inout.v256[5] = inout1[5];
}
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
//printf("RDR 4\n");
/*
//uint64_t *io = (uint64_t*)&inout;
printf("RDRV1 col= %d\n", i);
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
*/
// need to split inout for write
inout.v512[0] = _mm512_xor_si512( inout.v512[0],
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
inout.v512[1] = _mm512_xor_si512( inout.v512[1],
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
inout.v512[2] = _mm512_xor_si512( inout.v512[2],
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
inout0[0] = inout.v256[0];
inout1[1] = inout.v256[1];
@@ -410,9 +340,6 @@ printf("RDRV3 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[153]);
inout1[3] = inout.v256[3];
inout0[4] = inout.v256[4];
inout1[5] = inout.v256[5];
//printf("RDR 5\n");
//Goes to next block
in += BLOCK_LEN_M256I;

View File

@@ -375,7 +375,10 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
{
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );
//printf("S RSR0 col= %d, out= %x\n",i,out);
out[0] = state0;
out[1] = state1;
out[2] = state2;
@@ -706,11 +709,34 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
out[1] = _mm256_xor_si256( state1, in[1] );
out[2] = _mm256_xor_si256( state2, in[2] );
//M[row*][col] = M[row*][col] XOR rotW(rand)
t0 = _mm256_permute4x64_epi64( state0, 0x93 );
t1 = _mm256_permute4x64_epi64( state1, 0x93 );
t2 = _mm256_permute4x64_epi64( state2, 0x93 );
inout[0] = _mm256_xor_si256( inout[0],
_mm256_blend_epi32( t0, t2, 0x03 ) );
inout[1] = _mm256_xor_si256( inout[1],
@@ -718,7 +744,17 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
inout[2] = _mm256_xor_si256( inout[2],
_mm256_blend_epi32( t2, t1, 0x03 ) );
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
//Output: goes to previous column
@@ -949,6 +985,22 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
_mm_prefetch( inout + 9, _MM_HINT_T0 );
_mm_prefetch( inout + 11, _MM_HINT_T0 );
//Absorbing "M[prev] [+] M[row*]"
state0 = _mm256_xor_si256( state0,
_mm256_add_epi64( in[0], inout[0] ) );

View File

@@ -203,29 +203,6 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
union _povly
{
__m512i *v512;
@@ -234,9 +211,28 @@ union _povly
};
typedef union _povly povly;
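The union gives one buffer two pointer types, so the same storage can be addressed as whole 2-lane blocks or as per-lane halves without strict-aliasing casts. A minimal usage sketch, following reducedDuplexRow_2way above:
__m512i io[3];               // one Lyra2 block, two lanes wide
povly inout;
inout.v512 = &io[0];         // whole-block access, 512-bit
inout.v256[0] = inout0[0];   // patch lane 0 from one matrix row...
inout.v256[1] = inout1[1];   // ...and lane 1 from another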
//---- Housekeeping
void initState_2way( uint64_t State[/*16*/] );
//---- Squeezes
void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len );
void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );
//---- Absorbs
void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
const uint64_t *In1 );
void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
const uint64_t nBlocks, const uint64_t block_len );
//---- Duplexes
void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols);
#endif

View File

@@ -133,7 +133,7 @@ void nist5hash_4way( void *out, const void *input )
keccak512_4way_context ctx_keccak;
blake512_4way_init( &ctx_blake );
blake512_4way_update( &ctx_blake, input, 80 );
blake512_4way_close( &ctx_blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -154,15 +154,15 @@ void nist5hash_4way( void *out, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way_init( &ctx_jh );
jh512_4way_update( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhash );
keccak512_4way_init( &ctx_keccak );
keccak512_4way_update( &ctx_keccak, vhash, 64 );
keccak512_4way_close( &ctx_keccak, vhash );
skein512_4way_init( &ctx_skein );
skein512_4way_update( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, out );
}

View File

@@ -54,10 +54,10 @@ void anime_4way_hash( void *state, const void *input )
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
bmw512_4way_update( &ctx.bmw, input, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
blake512_4way_update( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -92,7 +92,7 @@ void anime_4way_hash( void *state, const void *input )
if ( mm256_anybits0( vh_mask ) )
{
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
}
@@ -111,7 +111,7 @@ void anime_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -119,23 +119,23 @@ void anime_4way_hash( void *state, const void *input )
if ( mm256_anybits1( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way_update( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -143,13 +143,13 @@ void anime_4way_hash( void *state, const void *input )
if ( mm256_anybits1( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
}
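The stanzas above implement anime's data-dependent chaining under SIMD: bit 3 of each lane's running hash selects the next function, so both candidates are computed whenever any lane needs them and the results are merged per lane. The recurring shape, condensed (the bodies are placeholders, not source code):
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
if ( mm256_anybits1( vh_mask ) )
   { /* hash candidate A into vhashA for the flagged lanes */ }
if ( mm256_anybits0( vh_mask ) )
   { /* hash candidate B into vhashB for the other lanes */ }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );   // per-lane select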

File diff suppressed because it is too large

View File

@@ -2,7 +2,10 @@
bool register_hmq1725_algo( algo_gate_t* gate )
{
#if defined(HMQ1725_8WAY)
gate->scanhash = (void*)&scanhash_hmq1725_8way;
gate->hash = (void*)&hmq1725_8way_hash;
#elif defined(HMQ1725_4WAY)
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
@@ -10,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -4,13 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1
#endif
bool register_hmq1725_algo( algo_gate_t* gate );
#if defined(HMQ1725_8WAY)
void hmq1725_8way_hash( void *state, const void *input );
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(HMQ1725_4WAY)
void hmq1725_4way_hash( void *state, const void *input );
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,

View File

@@ -333,6 +333,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFFFF)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -346,6 +347,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFFF0)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -359,6 +361,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -372,6 +375,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFF000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -386,6 +390,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFF0000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -399,6 +404,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
hmq1725hash(hash64, endiandata);
if (fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -63,20 +63,6 @@ void quark_8way_hash( void *state, const void *input )
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
@@ -303,10 +289,10 @@ void quark_4way_hash( void *state, const void *input )
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -341,7 +327,7 @@ void quark_4way_hash( void *state, const void *input )
if ( mm256_anybits1( vh_mask ) )
{
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
}
@@ -360,7 +346,7 @@ void quark_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -368,24 +354,24 @@ void quark_4way_hash( void *state, const void *input )
if ( mm256_anybits0( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way_update( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
if ( mm256_anybits1( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -393,14 +379,14 @@ void quark_4way_hash( void *state, const void *input )
if ( mm256_anybits0( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
if ( mm256_anybits1( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
}

View File

@@ -7,16 +7,147 @@
#include "ripemd-hash-4way.h"
#define LBRY_INPUT_SIZE 112
#define LBRY_MIDSTATE 96
#define LBRY_TAIL ((LBRY_INPUT_SIZE) - (LBRY_MIDSTATE))
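The midstate now covers 96 of the 112 header bytes. The arithmetic, assuming the nonce position the scanhash code below uses (pdata[27]):
// LBRY header is 112 bytes; the nonce is pdata[27], byte offset 108.
// 108 >= 96, so bytes [0,96) are constant for the whole scan and their
// SHA-256 state is computed once per scan. Per nonce, only
// LBRY_TAIL = 112 - 96 = 16 bytes remain to be hashed.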
#if defined(LBRY_16WAY)
static __thread sha256_16way_context sha256_16w_mid;
void lbry_16way_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) vhashA[16<<4];
uint32_t _ALIGN(64) vhashB[16<<4];
uint32_t _ALIGN(64) vhashC[16<<4];
uint32_t _ALIGN(64) h0[32];
uint32_t _ALIGN(64) h1[32];
uint32_t _ALIGN(64) h2[32];
uint32_t _ALIGN(64) h3[32];
uint32_t _ALIGN(64) h4[32];
uint32_t _ALIGN(64) h5[32];
uint32_t _ALIGN(64) h6[32];
uint32_t _ALIGN(64) h7[32];
uint32_t _ALIGN(64) h8[32];
uint32_t _ALIGN(64) h9[32];
uint32_t _ALIGN(64) h10[32];
uint32_t _ALIGN(64) h11[32];
uint32_t _ALIGN(64) h12[32];
uint32_t _ALIGN(64) h13[32];
uint32_t _ALIGN(64) h14[32];
uint32_t _ALIGN(64) h15[32];
sha256_16way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_8way_context ctx_sha512;
ripemd160_16way_context ctx_ripemd;
memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 8-way 64 bit twice.
dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
h8, h9, h10, h11, h12, h13, h14, h15, vhashA, 256 );
intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashA, 32 );
sha512_8way_close( &ctx_sha512, vhashA );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashB, 32 );
sha512_8way_close( &ctx_sha512, vhashB );
// back to 8-way 32 bit
dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 );
intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
h8, h9, h10, h11, h12, h13, h14, h15, 512 );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA, 32 );
ripemd160_16way_close( &ctx_ripemd, vhashB );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 );
ripemd160_16way_close( &ctx_ripemd, vhashC );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashB, 20 );
sha256_16way_update( &ctx_sha256, vhashC, 20 );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, output );
}
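SHA-512 wants 64-bit lanes while SHA-256 and RIPEMD-160 work on 32-bit lanes, which is what the reshuffling above is for. Schematically (an informational summary, not code from the source):
// sha256 16x32 -> dintrlv_16x32 -> 16 linear hashes
// intrlv_8x64 lanes 0-7   \
// intrlv_8x64 lanes 8-15   > two sha512_8way passes
// dintrlv_8x64 both halves -> intrlv_16x32 -> ripemd160_16way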
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[32*16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t last_nonce = max_nonce - 16;
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 27; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
sha256_16way_init( &sha256_16w_mid );
sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
do
{
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
lbry_16way_hash( hash, vdata );
for ( int i = 0; i < 16; i++ )
if ( unlikely( hash7[ i ] <= Htarg ) )
{
// deinterleave hash for lane
extr_lane_16x32( lane_hash, hash, i, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[27] = n + i;
submit_lane_solution( work, lane_hash, mythr, i );
}
}
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
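The lane addressing above (hash7 = &(hash[7<<4]), extr_lane_16x32) follows the word-interleaved layout: word w of lane l in an N-lane buffer sits at index w*N + l, so hash[7<<4] is the high hash word of lane 0 and hash7[i] that of lane i. A plain-C sketch of the layout, with illustrative helper names:

#include <stdint.h>

// Word w of lane 'lane' in an nlanes-wide interleaved buffer is at w*nlanes + lane.
static void intrlv_lane( uint32_t *v, const uint32_t *s,
                         int lane, int nlanes, int nwords )
{
   for ( int w = 0; w < nwords; w++ )
      v[ w*nlanes + lane ] = s[ w ];
}

static void extr_lane( uint32_t *d, const uint32_t *v,
                       int lane, int nlanes, int nwords )
{
   for ( int w = 0; w < nwords; w++ )
      d[ w ] = v[ w*nlanes + lane ];
}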
#elif defined(LBRY_8WAY)
static __thread sha256_8way_context sha256_8w_mid;
void lbry_8way_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) vhashA[16<<3];
uint32_t _ALIGN(64) vhashB[16<<3];
uint32_t _ALIGN(64) vhashC[16<<3];
uint32_t _ALIGN(32) h0[32];
@@ -32,11 +163,11 @@ void lbry_8way_hash( void* output, const void* input )
ripemd160_8way_context ctx_ripemd;
memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 4-way 64 bit twice.
@@ -45,11 +176,11 @@ void lbry_8way_hash( void* output, const void* input )
intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way_update( &ctx_sha512, vhashA, 32 );
sha512_4way_close( &ctx_sha512, vhashA );
sha512_4way_init( &ctx_sha512 );
sha512_4way_update( &ctx_sha512, vhashB, 32 );
sha512_4way_close( &ctx_sha512, vhashB );
// back to 8-way 32 bit
@@ -58,20 +189,20 @@ void lbry_8way_hash( void* output, const void* input )
intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way_update( &ctx_ripemd, vhashA, 32 );
ripemd160_8way_close( &ctx_ripemd, vhashB );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 );
ripemd160_8way_close( &ctx_ripemd, vhashC );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhashB, 20 );
sha256_8way_update( &ctx_sha256, vhashC, 20 );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, output );
}
@@ -81,21 +212,16 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[32*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 27; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
@@ -106,33 +232,30 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
sha256_8way_init( &sha256_8w_mid );
sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32(
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
lbry_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( unlikely( hash7[ i ] <= Htarg ) )
{
// deinterleave hash for lane
extr_lane_8x32( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[27] = n + i;
submit_lane_solution( work, lane_hash, mythr, i );
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -98,16 +98,23 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT;
#elif defined (LBRY_8WAY)
gate->scanhash = (void*)&scanhash_lbry_8way;
gate->hash = (void*)&lbry_8way_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT;
#elif defined (LBRY_4WAY)
gate->scanhash = (void*)&scanhash_lbry_4way;
gate->hash = (void*)&lbry_4way_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT;
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#endif
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;

View File

@@ -4,11 +4,19 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LBRY_16WAY 1
#elif defined(__AVX2__)
#define LBRY_8WAY 1
#endif
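// Note: all four AVX512 subsets (F, VL, DQ, BW) must be present for the
// 16-way path, i.e. a full Skylake-X class AVX-512 implementation; CPUs
// with only AVX2 fall back to the 8-way code.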
/*
#if !defined(__SHA__)
#if defined(__AVX2__)
#define LBRY_8WAY
#endif
#endif
*/
#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26
@@ -18,18 +26,23 @@
bool register_lbry_algo( algo_gate_t* gate );
#if defined(LBRY_16WAY)
void lbry_16way_hash( void *state, const void *input );
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(LBRY_8WAY)
void lbry_8way_hash( void *state, const void *input );
int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
/*
#elif defined(LBRY_4WAY)
void lbry_4way_hash( void *state, const void *input );
int scanhash_lbry_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
*/
#else
void lbry_hash( void *state, const void *input );

View File

@@ -80,9 +80,6 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
// we need bigendian data...
swab32_array( endiandata, pdata, 32 );
for (int m=0; m < sizeof(masks)/sizeof(masks[0]); m++) {
if (Htarg <= htmax[m]) {
uint32_t mask = masks[m];
@@ -90,23 +87,11 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
pdata[27] = ++n;
be32enc(&endiandata[27], n);
lbry_hash(hash64, &endiandata);
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
pdata[27] = n;
submit_solution( work, hash64, mythr );
}
// see blake.c if else to understand the loop on htmax => mask
} while ( (n < max_nonce -8) && !work_restart[thr_id].restart);
break;
}
}

View File

@@ -259,7 +259,8 @@ void ripemd160_4way_init( ripemd160_4way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
size_t len )
{
__m128i *vdata = (__m128i*)data;
size_t ptr;
@@ -559,7 +560,8 @@ void ripemd160_8way_init( ripemd160_8way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
@@ -623,3 +625,303 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// RIPEMD-160 16 way
#define F16W_1(x, y, z) \
_mm512_xor_si512( _mm512_xor_si512( x, y ), z )
#define F16W_2(x, y, z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( y, z ), x ), z )
#define F16W_3(x, y, z) \
_mm512_xor_si512( _mm512_or_si512( x, mm512_not( y ) ), z )
#define F16W_4(x, y, z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( x, y ), z ), y )
#define F16W_5(x, y, z) \
_mm512_xor_si512( x, _mm512_or_si512( y, mm512_not( z ) ) )
#define RR_16W(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
_mm512_add_epi32( a, f( b ,c, d ) ), r ), \
m512_const1_64( k ) ), s ), e ); \
c = mm512_rol_32( c, 10 );\
} while (0)
#define ROUND1_16W(a, b, c, d, e, f, s, r, k) \
RR_16W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
#define ROUND2_16W(a, b, c, d, e, f, s, r, k) \
RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_16way_round( ripemd160_16way_context *sc )
{
const __m512i *in = (__m512i*)sc->buf;
__m512i *h = (__m512i*)sc->val;
register __m512i A1, B1, C1, D1, E1;
register __m512i A2, B2, C2, D2, E2;
__m512i tmp;
A1 = A2 = h[0];
B1 = B2 = h[1];
C1 = C2 = h[2];
D1 = D2 = h[3];
E1 = E2 = h[4];
ROUND1_16W( A, B, C, D, E, F16W_1, 11, in[ 0], 1 );
ROUND1_16W( E, A, B, C, D, F16W_1, 14, in[ 1], 1 );
ROUND1_16W( D, E, A, B, C, F16W_1, 15, in[ 2], 1 );
ROUND1_16W( C, D, E, A, B, F16W_1, 12, in[ 3], 1 );
ROUND1_16W( B, C, D, E, A, F16W_1, 5, in[ 4], 1 );
ROUND1_16W( A, B, C, D, E, F16W_1, 8, in[ 5], 1 );
ROUND1_16W( E, A, B, C, D, F16W_1, 7, in[ 6], 1 );
ROUND1_16W( D, E, A, B, C, F16W_1, 9, in[ 7], 1 );
ROUND1_16W( C, D, E, A, B, F16W_1, 11, in[ 8], 1 );
ROUND1_16W( B, C, D, E, A, F16W_1, 13, in[ 9], 1 );
ROUND1_16W( A, B, C, D, E, F16W_1, 14, in[10], 1 );
ROUND1_16W( E, A, B, C, D, F16W_1, 15, in[11], 1 );
ROUND1_16W( D, E, A, B, C, F16W_1, 6, in[12], 1 );
ROUND1_16W( C, D, E, A, B, F16W_1, 7, in[13], 1 );
ROUND1_16W( B, C, D, E, A, F16W_1, 9, in[14], 1 );
ROUND1_16W( A, B, C, D, E, F16W_1, 8, in[15], 1 );
ROUND1_16W( E, A, B, C, D, F16W_2, 7, in[ 7], 2 );
ROUND1_16W( D, E, A, B, C, F16W_2, 6, in[ 4], 2 );
ROUND1_16W( C, D, E, A, B, F16W_2, 8, in[13], 2 );
ROUND1_16W( B, C, D, E, A, F16W_2, 13, in[ 1], 2 );
ROUND1_16W( A, B, C, D, E, F16W_2, 11, in[10], 2 );
ROUND1_16W( E, A, B, C, D, F16W_2, 9, in[ 6], 2 );
ROUND1_16W( D, E, A, B, C, F16W_2, 7, in[15], 2 );
ROUND1_16W( C, D, E, A, B, F16W_2, 15, in[ 3], 2 );
ROUND1_16W( B, C, D, E, A, F16W_2, 7, in[12], 2 );
ROUND1_16W( A, B, C, D, E, F16W_2, 12, in[ 0], 2 );
ROUND1_16W( E, A, B, C, D, F16W_2, 15, in[ 9], 2 );
ROUND1_16W( D, E, A, B, C, F16W_2, 9, in[ 5], 2 );
ROUND1_16W( C, D, E, A, B, F16W_2, 11, in[ 2], 2 );
ROUND1_16W( B, C, D, E, A, F16W_2, 7, in[14], 2 );
ROUND1_16W( A, B, C, D, E, F16W_2, 13, in[11], 2 );
ROUND1_16W( E, A, B, C, D, F16W_2, 12, in[ 8], 2 );
ROUND1_16W( D, E, A, B, C, F16W_3, 11, in[ 3], 3 );
ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[10], 3 );
ROUND1_16W( B, C, D, E, A, F16W_3, 6, in[14], 3 );
ROUND1_16W( A, B, C, D, E, F16W_3, 7, in[ 4], 3 );
ROUND1_16W( E, A, B, C, D, F16W_3, 14, in[ 9], 3 );
ROUND1_16W( D, E, A, B, C, F16W_3, 9, in[15], 3 );
ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[ 8], 3 );
ROUND1_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
ROUND1_16W( A, B, C, D, E, F16W_3, 14, in[ 2], 3 );
ROUND1_16W( E, A, B, C, D, F16W_3, 8, in[ 7], 3 );
ROUND1_16W( D, E, A, B, C, F16W_3, 13, in[ 0], 3 );
ROUND1_16W( C, D, E, A, B, F16W_3, 6, in[ 6], 3 );
ROUND1_16W( B, C, D, E, A, F16W_3, 5, in[13], 3 );
ROUND1_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
ROUND1_16W( E, A, B, C, D, F16W_3, 7, in[ 5], 3 );
ROUND1_16W( D, E, A, B, C, F16W_3, 5, in[12], 3 );
ROUND1_16W( C, D, E, A, B, F16W_4, 11, in[ 1], 4 );
ROUND1_16W( B, C, D, E, A, F16W_4, 12, in[ 9], 4 );
ROUND1_16W( A, B, C, D, E, F16W_4, 14, in[11], 4 );
ROUND1_16W( E, A, B, C, D, F16W_4, 15, in[10], 4 );
ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 0], 4 );
ROUND1_16W( C, D, E, A, B, F16W_4, 15, in[ 8], 4 );
ROUND1_16W( B, C, D, E, A, F16W_4, 9, in[12], 4 );
ROUND1_16W( A, B, C, D, E, F16W_4, 8, in[ 4], 4 );
ROUND1_16W( E, A, B, C, D, F16W_4, 9, in[13], 4 );
ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 3], 4 );
ROUND1_16W( C, D, E, A, B, F16W_4, 5, in[ 7], 4 );
ROUND1_16W( B, C, D, E, A, F16W_4, 6, in[15], 4 );
ROUND1_16W( A, B, C, D, E, F16W_4, 8, in[14], 4 );
ROUND1_16W( E, A, B, C, D, F16W_4, 6, in[ 5], 4 );
ROUND1_16W( D, E, A, B, C, F16W_4, 5, in[ 6], 4 );
ROUND1_16W( C, D, E, A, B, F16W_4, 12, in[ 2], 4 );
ROUND1_16W( B, C, D, E, A, F16W_5, 9, in[ 4], 5 );
ROUND1_16W( A, B, C, D, E, F16W_5, 15, in[ 0], 5 );
ROUND1_16W( E, A, B, C, D, F16W_5, 5, in[ 5], 5 );
ROUND1_16W( D, E, A, B, C, F16W_5, 11, in[ 9], 5 );
ROUND1_16W( C, D, E, A, B, F16W_5, 6, in[ 7], 5 );
ROUND1_16W( B, C, D, E, A, F16W_5, 8, in[12], 5 );
ROUND1_16W( A, B, C, D, E, F16W_5, 13, in[ 2], 5 );
ROUND1_16W( E, A, B, C, D, F16W_5, 12, in[10], 5 );
ROUND1_16W( D, E, A, B, C, F16W_5, 5, in[14], 5 );
ROUND1_16W( C, D, E, A, B, F16W_5, 12, in[ 1], 5 );
ROUND1_16W( B, C, D, E, A, F16W_5, 13, in[ 3], 5 );
ROUND1_16W( A, B, C, D, E, F16W_5, 14, in[ 8], 5 );
ROUND1_16W( E, A, B, C, D, F16W_5, 11, in[11], 5 );
ROUND1_16W( D, E, A, B, C, F16W_5, 8, in[ 6], 5 );
ROUND1_16W( C, D, E, A, B, F16W_5, 5, in[15], 5 );
ROUND1_16W( B, C, D, E, A, F16W_5, 6, in[13], 5 );
ROUND2_16W( A, B, C, D, E, F16W_5, 8, in[ 5], 1 );
ROUND2_16W( E, A, B, C, D, F16W_5, 9, in[14], 1 );
ROUND2_16W( D, E, A, B, C, F16W_5, 9, in[ 7], 1 );
ROUND2_16W( C, D, E, A, B, F16W_5, 11, in[ 0], 1 );
ROUND2_16W( B, C, D, E, A, F16W_5, 13, in[ 9], 1 );
ROUND2_16W( A, B, C, D, E, F16W_5, 15, in[ 2], 1 );
ROUND2_16W( E, A, B, C, D, F16W_5, 15, in[11], 1 );
ROUND2_16W( D, E, A, B, C, F16W_5, 5, in[ 4], 1 );
ROUND2_16W( C, D, E, A, B, F16W_5, 7, in[13], 1 );
ROUND2_16W( B, C, D, E, A, F16W_5, 7, in[ 6], 1 );
ROUND2_16W( A, B, C, D, E, F16W_5, 8, in[15], 1 );
ROUND2_16W( E, A, B, C, D, F16W_5, 11, in[ 8], 1 );
ROUND2_16W( D, E, A, B, C, F16W_5, 14, in[ 1], 1 );
ROUND2_16W( C, D, E, A, B, F16W_5, 14, in[10], 1 );
ROUND2_16W( B, C, D, E, A, F16W_5, 12, in[ 3], 1 );
ROUND2_16W( A, B, C, D, E, F16W_5, 6, in[12], 1 );
ROUND2_16W( E, A, B, C, D, F16W_4, 9, in[ 6], 2 );
ROUND2_16W( D, E, A, B, C, F16W_4, 13, in[11], 2 );
ROUND2_16W( C, D, E, A, B, F16W_4, 15, in[ 3], 2 );
ROUND2_16W( B, C, D, E, A, F16W_4, 7, in[ 7], 2 );
ROUND2_16W( A, B, C, D, E, F16W_4, 12, in[ 0], 2 );
ROUND2_16W( E, A, B, C, D, F16W_4, 8, in[13], 2 );
ROUND2_16W( D, E, A, B, C, F16W_4, 9, in[ 5], 2 );
ROUND2_16W( C, D, E, A, B, F16W_4, 11, in[10], 2 );
ROUND2_16W( B, C, D, E, A, F16W_4, 7, in[14], 2 );
ROUND2_16W( A, B, C, D, E, F16W_4, 7, in[15], 2 );
ROUND2_16W( E, A, B, C, D, F16W_4, 12, in[ 8], 2 );
ROUND2_16W( D, E, A, B, C, F16W_4, 7, in[12], 2 );
ROUND2_16W( C, D, E, A, B, F16W_4, 6, in[ 4], 2 );
ROUND2_16W( B, C, D, E, A, F16W_4, 15, in[ 9], 2 );
ROUND2_16W( A, B, C, D, E, F16W_4, 13, in[ 1], 2 );
ROUND2_16W( E, A, B, C, D, F16W_4, 11, in[ 2], 2 );
ROUND2_16W( D, E, A, B, C, F16W_3, 9, in[15], 3 );
ROUND2_16W( C, D, E, A, B, F16W_3, 7, in[ 5], 3 );
ROUND2_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
ROUND2_16W( A, B, C, D, E, F16W_3, 11, in[ 3], 3 );
ROUND2_16W( E, A, B, C, D, F16W_3, 8, in[ 7], 3 );
ROUND2_16W( D, E, A, B, C, F16W_3, 6, in[14], 3 );
ROUND2_16W( C, D, E, A, B, F16W_3, 6, in[ 6], 3 );
ROUND2_16W( B, C, D, E, A, F16W_3, 14, in[ 9], 3 );
ROUND2_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
ROUND2_16W( E, A, B, C, D, F16W_3, 13, in[ 8], 3 );
ROUND2_16W( D, E, A, B, C, F16W_3, 5, in[12], 3 );
ROUND2_16W( C, D, E, A, B, F16W_3, 14, in[ 2], 3 );
ROUND2_16W( B, C, D, E, A, F16W_3, 13, in[10], 3 );
ROUND2_16W( A, B, C, D, E, F16W_3, 13, in[ 0], 3 );
ROUND2_16W( E, A, B, C, D, F16W_3, 7, in[ 4], 3 );
ROUND2_16W( D, E, A, B, C, F16W_3, 5, in[13], 3 );
ROUND2_16W( C, D, E, A, B, F16W_2, 15, in[ 8], 4 );
ROUND2_16W( B, C, D, E, A, F16W_2, 5, in[ 6], 4 );
ROUND2_16W( A, B, C, D, E, F16W_2, 8, in[ 4], 4 );
ROUND2_16W( E, A, B, C, D, F16W_2, 11, in[ 1], 4 );
ROUND2_16W( D, E, A, B, C, F16W_2, 14, in[ 3], 4 );
ROUND2_16W( C, D, E, A, B, F16W_2, 14, in[11], 4 );
ROUND2_16W( B, C, D, E, A, F16W_2, 6, in[15], 4 );
ROUND2_16W( A, B, C, D, E, F16W_2, 14, in[ 0], 4 );
ROUND2_16W( E, A, B, C, D, F16W_2, 6, in[ 5], 4 );
ROUND2_16W( D, E, A, B, C, F16W_2, 9, in[12], 4 );
ROUND2_16W( C, D, E, A, B, F16W_2, 12, in[ 2], 4 );
ROUND2_16W( B, C, D, E, A, F16W_2, 9, in[13], 4 );
ROUND2_16W( A, B, C, D, E, F16W_2, 12, in[ 9], 4 );
ROUND2_16W( E, A, B, C, D, F16W_2, 5, in[ 7], 4 );
ROUND2_16W( D, E, A, B, C, F16W_2, 15, in[10], 4 );
ROUND2_16W( C, D, E, A, B, F16W_2, 8, in[14], 4 );
ROUND2_16W( B, C, D, E, A, F16W_1, 8, in[12], 5 );
ROUND2_16W( A, B, C, D, E, F16W_1, 5, in[15], 5 );
ROUND2_16W( E, A, B, C, D, F16W_1, 12, in[10], 5 );
ROUND2_16W( D, E, A, B, C, F16W_1, 9, in[ 4], 5 );
ROUND2_16W( C, D, E, A, B, F16W_1, 12, in[ 1], 5 );
ROUND2_16W( B, C, D, E, A, F16W_1, 5, in[ 5], 5 );
ROUND2_16W( A, B, C, D, E, F16W_1, 14, in[ 8], 5 );
ROUND2_16W( E, A, B, C, D, F16W_1, 6, in[ 7], 5 );
ROUND2_16W( D, E, A, B, C, F16W_1, 8, in[ 6], 5 );
ROUND2_16W( C, D, E, A, B, F16W_1, 13, in[ 2], 5 );
ROUND2_16W( B, C, D, E, A, F16W_1, 6, in[13], 5 );
ROUND2_16W( A, B, C, D, E, F16W_1, 5, in[14], 5 );
ROUND2_16W( E, A, B, C, D, F16W_1, 15, in[ 0], 5 );
ROUND2_16W( D, E, A, B, C, F16W_1, 13, in[ 3], 5 );
ROUND2_16W( C, D, E, A, B, F16W_1, 11, in[ 9], 5 );
ROUND2_16W( B, C, D, E, A, F16W_1, 11, in[11], 5 );
tmp = _mm512_add_epi32( _mm512_add_epi32( h[1], C1 ), D2 );
h[1] = _mm512_add_epi32( _mm512_add_epi32( h[2], D1 ), E2 );
h[2] = _mm512_add_epi32( _mm512_add_epi32( h[3], E1 ), A2 );
h[3] = _mm512_add_epi32( _mm512_add_epi32( h[4], A1 ), B2 );
h[4] = _mm512_add_epi32( _mm512_add_epi32( h[0], B1 ), C2 );
h[0] = tmp;
}
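RR_16W broadcasts each 32-bit round constant as a 64-bit value holding it twice, so a single m512_const1_64 covers all sixteen lanes. The two broadcast forms are interchangeable, as this small standalone check illustrates (0x5A827999 is one of the standard RIPEMD-160 round constants):

#include <immintrin.h>
#include <stdio.h>

int main( void )
{
   __m512i a = _mm512_set1_epi64( 0x5A8279995A827999ULL );
   __m512i b = _mm512_set1_epi32( 0x5A827999 );
   // All sixteen 32-bit lanes compare equal.
   printf( "%s\n", _mm512_cmpeq_epi32_mask( a, b ) == 0xFFFF ? "same" : "differ" );
   return 0;
}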
void ripemd160_16way_init( ripemd160_16way_context *sc )
{
sc->val[0] = m512_const1_64( 0x6745230167452301 );
sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m512_const1_64( 0x1032547610325476 );
sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
size_t ptr;
const int block_size = 64;
ptr = (unsigned)sc->count_low & (block_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = block_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
if ( ptr == block_size )
{
ripemd160_16way_round( sc );
ptr = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
}
}
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
const int block_size = 64;
const int pad = block_size - 8;
ptr = (unsigned)sc->count_low & ( block_size - 1U);
sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_16way_round( sc );
memset_zero_512( sc->buf, pad>>2 );
}
else
memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad>>2 ] = _mm512_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
ripemd160_16way_round( sc );
for (u = 0; u < 5; u ++)
casti_m512i( dst, u ) = sc->val[u];
}
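The length encoding in the close routine is the usual 64-bit bit count split over two 32-bit words: low << 3 multiplies the byte count by eight, and the three bits shifted out of low carry into high. A standalone check of that identity:

#include <stdint.h>
#include <assert.h>

int main( void )
{
   uint32_t count_low = 0xE0000040, count_high = 1;   // arbitrary byte count
   uint64_t bits = ( ( (uint64_t)count_high << 32 ) | count_low ) << 3;
   uint32_t low  = count_low << 3;
   uint32_t high = ( count_high << 3 ) | ( count_low >> 29 );
   assert( bits == ( ( (uint64_t)high << 32 ) | low ) );
   return 0;
}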
#endif // AVX512

View File

@@ -16,7 +16,8 @@ typedef struct
} __attribute__ ((aligned (64))) ripemd160_4way_context;
void ripemd160_4way_init( ripemd160_4way_context *sc );
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
size_t len );
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
#if defined (__AVX2__)
@@ -26,13 +27,28 @@ typedef struct
__m256i buf[64>>2];
__m256i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_8way_context;
void ripemd160_8way_init( ripemd160_8way_context *sc );
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct
{
__m512i buf[64>>2];
__m512i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_16way_context;
void ripemd160_16way_init( ripemd160_16way_context *sc );
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
size_t len );
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
#endif // AVX512
#endif // __AVX2__
#endif // __SSE4_2__
#endif // RIPEMD_HASH_4WAY_H__

View File

@@ -41,13 +41,9 @@
#define SHA2_HASH_4WAY_H__ 1
#include <stddef.h>
#include "sph_types.h"
#include "simd-utils.h"
#if defined(__SSE2__)
//#if defined(__SSE4_2__)
//#define SPH_SIZE_sha256 256
// SHA-256 4 way
@@ -59,9 +55,12 @@ typedef struct {
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
#endif // SSE2
#if defined (__AVX2__)
// SHA-256 8 way
@@ -74,10 +73,29 @@ typedef struct {
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
//#define SPH_SIZE_sha512 512
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
typedef struct {
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
#endif // AVX512
#if defined (__AVX2__)
// SHA-512 4 way
@@ -91,9 +109,10 @@ typedef struct {
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
#define sha512_4way sha512_4way_update
void sha512_4way_close( sha512_4way_context *sc, void *dst );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
@@ -110,8 +129,6 @@ void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
#endif // AVX512
#endif // __AVX2__
#endif // __SSE2__
#endif // SHA256_4WAY_H__

View File

@@ -39,47 +39,31 @@
// SHA-256 32 bit
/*
static const uint32_t H256[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
*/
static const uint32_t K256[64] =
{
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
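// K256 holds the first 32 bits of the fractional parts of the cube roots
// of the first 64 primes, per FIPS 180-4.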
// SHA-256 4 way
@@ -248,7 +232,7 @@ void sha256_4way_init( sha256_4way_context *sc )
*/
}
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
size_t ptr;
@@ -273,7 +257,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
ptr = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
@@ -306,10 +290,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
sc->buf[ pad >> 2 ] =
mm128_bswap_32( m128_const1_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm128_bswap_32( m128_const1_32( low ) );
sha256_4way_round( sc, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
@@ -483,7 +465,7 @@ void sha256_8way_init( sha256_8way_context *sc )
*/
}
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
@@ -508,7 +490,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
ptr = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
@@ -549,5 +531,233 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
mm256_block_bswap_32( dst, sc->val );
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
#define CHx16(X, Y, Z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
#define MAJx16(X, Y, Z) \
_mm512_or_si512( _mm512_and_si512( X, Y ), \
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
#define BSG2_0x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) )
#define BSG2_1x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) )
#define SSG2_0x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) )
#define SSG2_1x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) )
#define SHA2x16_MEXP( a, b, c, d ) \
mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );
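// Rolling 16-entry message schedule: W[t] = SSG2_1(W[t-2]) + W[t-7] +
// SSG2_0(W[t-15]) + W[t-16] with indices reduced mod 16, so at j = 16
// the new W[0] is built from entries 14, 9, 1 and 0, as in the calls below.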
#define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
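// Scalar view of one round, with K256[j+i] as the round constant:
//    T1 = h + BSG2_1(e) + Ch(e,f,g) + K + W[i]
//    T2 = BSG2_0(a) + Maj(a,b,c)
//    d += T1;  h = T1 + T2
// Register rotation is done by permuting the macro arguments at each call.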
static void
sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
{
register __m512i A, B, C, D, E, F, G, H;
__m512i W[16];
mm512_block_bswap_32( W , in );
mm512_block_bswap_32( W+8, in+8 );
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = m512_const1_64( 0x6A09E6676A09E667 );
B = m512_const1_64( 0xBB67AE85BB67AE85 );
C = m512_const1_64( 0x3C6EF3723C6EF372 );
D = m512_const1_64( 0xA54FF53AA54FF53A );
E = m512_const1_64( 0x510E527F510E527F );
F = m512_const1_64( 0x9B05688C9B05688C );
G = m512_const1_64( 0x1F83D9AB1F83D9AB );
H = m512_const1_64( 0x5BE0CD195BE0CD19 );
}
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
if ( ctx->initialized )
{
r[0] = _mm512_add_epi32( r[0], A );
r[1] = _mm512_add_epi32( r[1], B );
r[2] = _mm512_add_epi32( r[2], C );
r[3] = _mm512_add_epi32( r[3], D );
r[4] = _mm512_add_epi32( r[4], E );
r[5] = _mm512_add_epi32( r[5], F );
r[6] = _mm512_add_epi32( r[6], G );
r[7] = _mm512_add_epi32( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) );
r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) );
r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) );
r[3] = _mm512_add_epi32( D, m512_const1_64( 0xA54FF53AA54FF53A ) );
r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) );
r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) );
r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) );
r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) );
}
}
void sha256_16way_init( sha256_16way_context *sc )
{
sc->initialized = false;
sc->count_high = sc->count_low = 0;
}
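// The IV is applied lazily: init() only clears the counters, and the first
// sha256_16way_round() call starts from the broadcast SHA-256 IV before
// setting 'initialized', so init avoids writing eight 64-byte vectors.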
void sha256_16way_update( sha256_16way_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
size_t ptr;
const int buf_size = 64;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha256_16way_round( sc, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
}
}
void sha256_16way_close( sha256_16way_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_16way_round( sc, sc->buf, sc->val );
memset_zero_512( sc->buf, pad >> 2 );
}
else
memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] =
mm512_bswap_32( m512_const1_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm512_bswap_32( m512_const1_32( low ) );
sha256_16way_round( sc, sc->buf, sc->val );
mm512_block_bswap_32( dst, sc->val );
}
#endif // AVX512
#endif // __AVX2__
#endif // __SSE2__

View File

@@ -15,19 +15,19 @@ void sha256q_8way_hash( void* output, const void* input )
sha256_8way_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
@@ -61,7 +61,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
// Need big endian data
mm256_bswap32_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way_update( &sha256_ctx8, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
@@ -108,19 +108,19 @@ void sha256q_4way_hash( void* output, const void* input )
sha256_4way_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
sha256_4way_update( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}
@@ -154,7 +154,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{

View File

@@ -15,15 +15,15 @@ void sha256t_8way_hash( void* output, const void* input )
sha256_8way_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
@@ -59,7 +59,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
// Need big endian data
mm256_bswap32_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way_update( &sha256_ctx8, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
@@ -101,15 +101,15 @@ void sha256t_4way_hash( void* output, const void* input )
sha256_4way_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
sha256_4way_update( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}
@@ -143,7 +143,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{

View File

@@ -37,55 +37,57 @@
#include "sha-hash-4way.h"
/*
static const uint64_t H512[8] =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
*/
static const uint64_t K512[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD,
0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019,
0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE,
0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210,
0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926,
0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001,
0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910,
0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207,
0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493,
0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
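// K512 holds the first 64 bits of the fractional parts of the cube roots
// of the first 80 primes, per FIPS 180-4.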

View File

@@ -97,7 +97,7 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void shabal512_4way_init( void *cc );
void shabal512_4way_update( void *cc, const void *data, size_t len );
//#define shabal512_4way shabal512_4way_update
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );

View File

@@ -100,9 +100,20 @@ c512( sph_shavite_big_context *sc, const void *msg )
p3 = h[3];
// round
// working proof of concept
/*
__m512i K = m512_const1_128( m[0] );
__m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
X = _mm512_aesenc_epi128( X, m512_zero );
k00 = _mm512_castsi512_si128( K );
x = _mm512_castsi512_si128( X );
*/
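// _mm512_aesenc_epi128 (VAES) applies one AES round to each 128-bit lane;
// the commented block replicates a single stream across all four lanes to
// validate the approach, and a full conversion would instead carry four
// independent Shavite streams in the lanes.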
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );

View File

@@ -18,76 +18,18 @@ void skeinhash_8way( void *state, const void *input )
uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx_skein;
//#if defined(__SHA__)
// uint32_t hash0[16] __attribute__ ((aligned (64)));
// uint32_t hash1[16] __attribute__ ((aligned (64)));
// uint32_t hash2[16] __attribute__ ((aligned (64)));
// uint32_t hash3[16] __attribute__ ((aligned (64)));
// uint32_t hash4[16] __attribute__ ((aligned (64)));
// uint32_t hash5[16] __attribute__ ((aligned (64)));
// uint32_t hash6[16] __attribute__ ((aligned (64)));
// uint32_t hash7[16] __attribute__ ((aligned (64)));
// SHA256_CTX ctx_sha256;
//#else
uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
sha256_8way_context ctx_sha256;
//#endif
skein512_8way_init( &ctx_skein );
skein512_8way_update( &ctx_skein, input, 80 );
skein512_8way_close( &ctx_skein, vhash64 );
/*
#if defined(__SHA__)
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash64, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 );
SHA256_Final( (unsigned char*)hash4, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 );
SHA256_Final( (unsigned char*)hash5, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 );
SHA256_Final( (unsigned char*)hash6, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 );
SHA256_Final( (unsigned char*)hash7, &ctx_sha256 );
intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
#else
*/
rintrlv_8x64_8x32( vhash32, vhash64, 512 );
// dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
// vhash64, 512 );
// intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
// hash7, 512 );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhash32, 64 );
sha256_8way_close( &ctx_sha256, state );
//#endif
}
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
@@ -176,7 +118,7 @@ void skeinhash_4way( void *state, const void *input )
rintrlv_4x64_4x32( vhash32, vhash64, 512 );
sha256_4way_init( &ctx_sha256 );
sha256_4way_update( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, state );
#endif
}

View File

@@ -93,12 +93,12 @@ typedef sph_skein_4way_big_context skein256_4way_context;
void skein512_4way_init( skein512_4way_context *sc );
void skein512_4way_update( void *cc, const void *data, size_t len );
void skein512_4way_close( void *cc, void *dst );
//#define skein512_4way skein512_4way_update
void skein256_4way_init( skein256_4way_context *sc );
void skein256_4way_update( void *cc, const void *data, size_t len );
void skein256_4way_close( void *cc, void *dst );
//#define skein256_4way skein256_4way_update
#ifdef __cplusplus
}

View File

@@ -68,11 +68,11 @@ void skein2hash_4way( void *output, const void *input )
uint64_t hash[16*4] __attribute__ ((aligned (64)));
skein512_4way_init( &ctx );
skein512_4way_update( &ctx, input, 80 );
skein512_4way_close( &ctx, hash );
skein512_4way_init( &ctx );
skein512_4way_update( &ctx, hash, 64 );
skein512_4way_close( &ctx, output );
}

View File

@@ -50,41 +50,138 @@
#include <string.h>
#include "sm3-hash-4way.h"
#ifdef __AVX2__
#define P0_8W(x) \
_mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 9 ), \
mm256_rol_32( x, 17 ) ) )
#define P1_8W(x) \
_mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 15 ), \
mm256_rol_32( x, 23 ) ) )
#define FF0_8W(x,y,z) \
_mm256_xor_si256( x, _mm256_xor_si256( y, z ) )
#define FF1_8W(x,y,z) \
_mm256_or_si256( _mm256_or_si256( _mm256_and_si256( x, y ), \
_mm256_and_si256( x, z ) ), \
_mm256_and_si256( y, z ) )
#define GG0_8W(x,y,z) FF0_8W(x,y,z)
#define GG1_8W(x,y,z) \
_mm256_or_si256( _mm256_and_si256( x, y ), \
_mm256_andnot_si256( x, z ) )
void sm3_8way_compress( __m256i *digest, __m256i *block )
{
__m256i W[68], W1[64];
__m256i A = digest[ 0 ];
__m256i B = digest[ 1 ];
__m256i C = digest[ 2 ];
__m256i D = digest[ 3 ];
__m256i E = digest[ 4 ];
__m256i F = digest[ 5 ];
__m256i G = digest[ 6 ];
__m256i H = digest[ 7 ];
__m256i SS1, SS2, TT1, TT2, T;
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm256_bswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm256_xor_si256( P1_8W( _mm256_xor_si256(
_mm256_xor_si256( W[ j-16 ], W[ j-9 ] ),
mm256_rol_32( W[ j-3 ], 15 ) ) ),
_mm256_xor_si256( mm256_rol_32( W[ j-13 ], 7 ), W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
W1[j] = _mm256_xor_si256( W[j], W[j+4] );
T = _mm256_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm256_rol_32( _mm256_add_epi32( E, _mm256_add_epi32(
mm256_rol_32( A, 12 ), mm256_rol_var_32( T, j ) ) ), 7 );
SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) );
TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
FF0_8W( A, B, C ), D ), SS2 ), W1[j] );
TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
GG0_8W( E, F, G ), H ), SS1 ), W[j] );
D = C;
C = mm256_rol_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm256_rol_32( F, 19 );
F = E;
E = P0_8W( TT2 );
}
T = _mm256_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32(A,12), E ), mm256_rol_var_32( T, j&31 ) ), 7 );
SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) );
TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
FF1_8W( A, B, C ), D ), SS2 ), W1[j] );
TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
GG1_8W( E, F, G ), H ), SS1 ), W[j] );
D = C;
C = mm256_rol_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm256_rol_32( F, 19 );
F = E;
E = P0_8W( TT2 );
}
digest[0] = _mm256_xor_si256( digest[0], A );
digest[1] = _mm256_xor_si256( digest[1], B );
digest[2] = _mm256_xor_si256( digest[2], C );
digest[3] = _mm256_xor_si256( digest[3], D );
digest[4] = _mm256_xor_si256( digest[4], E );
digest[5] = _mm256_xor_si256( digest[5], F );
digest[6] = _mm256_xor_si256( digest[6], G );
digest[7] = _mm256_xor_si256( digest[7], H );
}
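// SM3 outline: the 16 input words expand to W[0..67] and W1[j] = W[j] ^ W[j+4];
// rounds 0-15 use FF0/GG0 with T = 0x79CC4519, rounds 16-63 use FF1/GG1 with
// T = 0x7A879D8A, the constant rotated left by the round index mod 32.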
void sm3_8way_init( sm3_8way_ctx_t *ctx )
{
ctx->digest[0] = _mm256_set1_epi32( 0x7380166F );
ctx->digest[1] = _mm256_set1_epi32( 0x4914B2B9 );
ctx->digest[2] = _mm256_set1_epi32( 0x172442D7 );
ctx->digest[3] = _mm256_set1_epi32( 0xDA8A0600 );
ctx->digest[4] = _mm256_set1_epi32( 0xA96F30BC );
ctx->digest[5] = _mm256_set1_epi32( 0x163138AA );
ctx->digest[6] = _mm256_set1_epi32( 0xE38DEE4D );
ctx->digest[7] = _mm256_set1_epi32( 0xB0FB0E4E );
ctx->nblocks = 0;
ctx->num = 0;
}
void sm3_8way_update( void *cc, const void *data, size_t len )
{
sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc;
__m256i *block = (__m256i*)ctx->block;
__m256i *vdata = (__m256i*)data;
if ( ctx->num )
{
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
if ( len < left )
{
memcpy_256( block + (ctx->num >> 2), vdata , len>>2 );
ctx->num += len;
return;
}
else
{
memcpy_256( block + (ctx->num >> 2), vdata , left>>2 );
sm3_8way_compress( ctx->digest, block );
ctx->nblocks++;
vdata += left>>2;
len -= left;
@@ -92,49 +189,53 @@ void sm3_4way( void *cc, const void *data, size_t len )
}
while ( len >= SM3_BLOCK_SIZE )
{
sm3_8way_compress( ctx->digest, vdata );
ctx->nblocks++;
vdata += SM3_BLOCK_SIZE>>2;
len -= SM3_BLOCK_SIZE;
}
ctx->num = len;
if ( len )
memcpy_256( block, vdata, len>>2 );
}
void sm3_8way_close( void *cc, void *dst )
{
sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc;
__m256i *hash = (__m256i*)dst;
__m256i *count = (__m256i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
__m256i *block = (__m256i*)ctx->block;
int i;
block[ctx->num >> 2] = _mm256_set1_epi32( 0x80 );   // block is indexed in words; num is a byte count
if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
{
memset_zero_256( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
}
else
{
memset_zero_256( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
sm3_8way_compress( ctx->digest, block );
memset_zero_256( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm256_bswap_32(
_mm256_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm256_bswap_32( _mm256_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_8way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm256_bswap_32( ctx->digest[i] );
}
#endif
#if defined(__SSE2__)
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \
mm128_rol_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
@@ -227,5 +328,88 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
digest[7] = _mm_xor_si128( digest[7], H );
}
void sm3_4way_init( sm3_4way_ctx_t *ctx )
{
ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
ctx->nblocks = 0;
ctx->num = 0;
}
void sm3_4way_update( void *cc, const void *data, size_t len )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *block = (__m128i*)ctx->block;
__m128i *vdata = (__m128i*)data;
if ( ctx->num )
{
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
if ( len < left )
{
memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
ctx->num += len;
return;
}
else
{
memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
sm3_4way_compress( ctx->digest, block );
ctx->nblocks++;
vdata += left>>2;
len -= left;
}
}
while ( len >= SM3_BLOCK_SIZE )
{
sm3_4way_compress( ctx->digest, vdata );
ctx->nblocks++;
vdata += SM3_BLOCK_SIZE>>2;
len -= SM3_BLOCK_SIZE;
}
ctx->num = len;
if ( len )
memcpy_128( block, vdata, len>>2 );
}
void sm3_4way_close( void *cc, void *dst )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *hash = (__m128i*)dst;
__m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
__m128i *block = (__m128i*)ctx->block;
int i;
block[ctx->num >> 2] = _mm_set1_epi32( 0x80 );
if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
}
else
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( ( SM3_BLOCK_SIZE >> 2 ) - (ctx->num >> 2) - 1 ) );
sm3_4way_compress( ctx->digest, block );
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm128_bswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm128_bswap_32( ctx->digest[i] );
}
#endif
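For reference, the two count words written by sm3_4way_close / sm3_8way_close encode the total message length in bits, big-endian, per SM3's MD-strengthening padding: total bits = nblocks*512 + num*8, so the high 32 bits are nblocks >> 23 and the low 32 bits are (nblocks << 9) + (num << 3). A minimal scalar sketch of the same arithmetic (helper name hypothetical, not part of the commit):

/* How the close routines derive the two 32-bit length words from
   nblocks full 64-byte blocks plus num tail bytes. */
#include <stdint.h>

static void sm3_length_words( uint32_t nblocks, uint32_t num,
                              uint32_t *hi, uint32_t *lo )
{
   *hi = nblocks >> 23;                   /* upper half: (nblocks << 9) >> 32 */
   *lo = ( nblocks << 9 ) + ( num << 3 ); /* lower half of the 64-bit count   */
}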

View File

@@ -48,14 +48,13 @@
*/
#ifndef SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H 1
#define SM3_DIGEST_LENGTH 32
#define SM3_BLOCK_SIZE 64
#define SM3_CBLOCK (SM3_BLOCK_SIZE)
#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
@@ -65,7 +64,6 @@
extern "C" {
#endif
typedef struct {
__m128i block[16] __attribute__ ((aligned (64)));
__m128i digest[8];
@@ -74,15 +72,24 @@ typedef struct {
} sm3_4way_ctx_t;
void sm3_4way_init( sm3_4way_ctx_t *ctx );
//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
// size_t data_len );
//void sm3_4way_final( sm3_4way_ctx_t *ctx,
// unsigned char digest[SM3_DIGEST_LENGTH] );
void sm3_4way_compress( __m128i *digest, __m128i *block );
void sm3_4way(void *cc, const void *data, size_t len);
void sm3_4way_update(void *cc, const void *data, size_t len);
void sm3_4way_close(void *cc, void *dst);
#if defined(__AVX2__)
typedef struct {
__m256i block[16] __attribute__ ((aligned (64)));
__m256i digest[8];
uint32_t nblocks;
uint32_t num;
} sm3_8way_ctx_t;
void sm3_8way_init( sm3_8way_ctx_t *ctx );
void sm3_8way_update(void *cc, const void *data, size_t len);
void sm3_8way_close(void *cc, void *dst);
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -282,11 +282,11 @@ void c11_4way_hash( void *state, const void *input )
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -305,15 +305,15 @@ void c11_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 5 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// 6 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// Serial

View File

@@ -84,13 +84,13 @@ void timetravel_4way_hash(void *output, const void *input)
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_update( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_update( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -112,19 +112,19 @@ void timetravel_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_update( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_update( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );

View File

@@ -90,13 +90,13 @@ void timetravel10_4way_hash(void *output, const void *input)
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_update( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_update( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -118,19 +118,19 @@ void timetravel10_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_update( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_update( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );

View File

@@ -282,11 +282,11 @@ void x11_4way_hash( void *state, const void *input )
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -305,15 +305,15 @@ void x11_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

View File

@@ -85,12 +85,12 @@ void x11evo_4way_hash( void *state, const void *input )
switch ( idx )
{
case 0:
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
if ( i >= len-1 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
@@ -112,19 +112,19 @@ void x11evo_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
if ( i >= len-1 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
if ( i >= len-1 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
if ( i >= len-1 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );

View File

@@ -310,10 +310,10 @@ void x11gost_4way_hash( void *state, const void *input )
x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -333,13 +333,13 @@ void x11gost_4way_hash( void *state, const void *input )
// 4way
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial

View File

@@ -272,10 +272,10 @@ void x12_4way_hash( void *state, const void *input )
x12_4way_ctx_holder ctx;
memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -328,16 +328,16 @@ void x12_4way_hash( void *state, const void *input )
// Parallel 4way 64 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );

View File

@@ -225,11 +225,11 @@ void phi1612_4way_hash( void *state, const void *input )
memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );
// Skein parallel 4way
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Serial to the end

View File

@@ -168,7 +168,7 @@ void skunk_4way_hash( void *output, const void *input )
skunk_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) );
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

View File

@@ -321,11 +321,11 @@ void x13_4way_hash( void *state, const void *input )
memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -344,15 +344,15 @@ void x13_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
@@ -416,7 +416,7 @@ void x13_4way_hash( void *state, const void *input )
// 12 Hamsi parallel 4way 32 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

View File

@@ -1,7 +1,4 @@
#include "x13sm3-gate.h"
#if defined(X13SM3_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -20,6 +17,281 @@
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#if defined(X13BCD_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
cubehashParam cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
sm3_8way_ctx_t sm3;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
} x13bcd_8way_ctx_holder;
x13bcd_8way_ctx_holder x13bcd_8way_ctx __attribute__ ((aligned (64)));
static __thread blake512_8way_context x13bcd_8way_ctx_mid;
void init_x13bcd_8way_ctx()
{
blake512_8way_init( &x13bcd_8way_ctx.blake );
bmw512_8way_init( &x13bcd_8way_ctx.bmw );
init_groestl( &x13bcd_8way_ctx.groestl, 64 );
skein512_8way_init( &x13bcd_8way_ctx.skein );
jh512_8way_init( &x13bcd_8way_ctx.jh );
keccak512_8way_init( &x13bcd_8way_ctx.keccak );
cubehashInit( &x13bcd_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13bcd_8way_ctx.shavite );
simd_4way_init( &x13bcd_8way_ctx.simd, 512 );
init_echo( &x13bcd_8way_ctx.echo, 512 );
sm3_8way_init( &x13bcd_8way_ctx.sm3 );
hamsi512_8way_init( &x13bcd_8way_ctx.hamsi );
sph_fugue512_init( &x13bcd_8way_ctx.fugue );
};
void x13bcd_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x13bcd_8way_ctx_holder ctx;
memcpy( &ctx, &x13bcd_8way_ctx, sizeof(x13bcd_8way_ctx) );
// Blake
memcpy( &ctx.blake, &x13bcd_8way_ctx_mid, sizeof(x13bcd_8way_ctx_mid) );
blake512_8way_update( &ctx.blake, input + (64<<3), 16 );
blake512_8way_close( &ctx.blake, vhash );
// Bmw
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// Serial
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
// Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
// Parallel 8way
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
// Skein
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// JH
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
// Keccak
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
// SM3 parallel 32 bit
rintrlv_8x64_8x32( vhashA, vhash, 512 );
memset( vhash, 0, sizeof vhash );
sm3_8way_update( &ctx.sm3, vhashA, 64 );
sm3_8way_close( &ctx.sm3, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*) hash4, 64 );
memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*) hash5, 64 );
memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*) hash6, 64 );
memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*) hash7, 64 );
// Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
// Simd
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
// Hamsi parallel 8way
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
// Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, state );
memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, state+32 );
memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, state+64 );
memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, state+96 );
memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, state+128 );
memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, state+160 );
memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, state+192 );
memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, state+224 );
}
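The function above follows the usual cpuminer-opt staging pattern: stages with an n-way SIMD implementation run on interleaved lanes, while AES-NI and sph stages de-interleave to per-lane buffers and back. Schematically (a sketch of the pattern using calls from the code above; h0..h7 and out0 are shorthand, not real identifiers):

/* Widen for parallel stages, narrow for single-lane stages. */
intrlv_8x64_512( vhash, h0, h1, h2, h3, h4, h5, h6, h7 );  /* 8 lanes in  */
skein512_8way_update( &ctx.skein, vhash, 64 );             /* parallel    */
skein512_8way_close( &ctx.skein, vhash );
dintrlv_8x64_512( h0, h1, h2, h3, h4, h5, h6, h7, vhash ); /* lanes out   */
sph_fugue512( &ctx.fugue, h0, 64 );                        /* one lane at */
sph_fugue512_close( &ctx.fugue, out0 );                    /* a time      */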
int scanhash_x13bcd_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
mm512_bswap32_intrlv80_8x64( vdata, pdata );
blake512_8way_init( &x13bcd_8way_ctx_mid );
blake512_8way_update( &x13bcd_8way_ctx_mid, vdata, 64 );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x13bcd_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
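Note the midstate optimization: the first 64 of the 80 header bytes are constant for a given job, so x13bcd_8way_ctx_mid absorbs them once per scanhash call and every nonce attempt resumes from that state, hashing only the final 16 bytes. The same idea in scalar form (a sketch; blake512_ctx and the init/update/close names are hypothetical stand-ins for any streaming hash API):

/* The nonce sits at header byte 76, i.e. offset 12 of the 16-byte tail. */
blake512_ctx mid, c;
blake512_init( &mid );
blake512_update( &mid, header, 64 );      /* constant prefix, once per job */
for ( uint32_t n = first_nonce; n < last_nonce; n++ )
{
   memcpy( tail + 12, &n, sizeof n );     /* place the nonce               */
   c = mid;                               /* restart from the midstate     */
   blake512_update( &c, tail, 16 );
   blake512_close( &c, hash );
}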
#elif defined(X13BCD_4WAY)
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
@@ -68,11 +340,11 @@ void x13bcd_4way_hash( void *state, const void *input )
// Blake
memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) );
blake512_4way( &ctx.blake, input + (64<<2), 16 );
blake512_4way_update( &ctx.blake, input + (64<<2), 16 );
blake512_4way_close( &ctx.blake, vhash );
// Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -91,15 +363,15 @@ void x13bcd_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -118,7 +390,7 @@ void x13bcd_4way_hash( void *state, const void *input )
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_update( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
@@ -171,20 +443,23 @@ void x13bcd_4way_hash( void *state, const void *input )
// Hamsi parallel 4x32x2
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue,
sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
@@ -203,44 +478,33 @@ int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
mm256_bswap32_intrlv80_4x64( vdata, pdata );
blake512_4way_init( &x13bcd_ctx_mid );
blake512_4way( &x13bcd_ctx_mid, vdata, 64 );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x13bcd_4way_hash( hash, vdata );
pdata[19] = n;
x13bcd_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
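The simplified share test relies on the hash being stored as eight little-endian 32-bit words with word 7 most significant: comparing it against Htarg (= ptarget[7]) is a cheap pre-filter, and fulltest() then confirms with a full 256-bit comparison, so a false positive here only costs one extra check. As a sketch:

/* Cheap high-word pre-filter; fulltest() does the exact comparison. */
static inline int likely_share( const uint32_t *hash32, uint32_t Htarg )
{
   return hash32[7] <= Htarg;   /* word 7 = most significant 32 bits */
}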

View File

@@ -71,13 +71,11 @@ void x13sm3_4way_hash( void *state, const void *input )
// Blake
memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) );
blake512_4way( &ctx.blake, input + (64<<2), 16 );
// blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input + (64<<2), 16 );
blake512_4way_close( &ctx.blake, vhash );
// Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -96,15 +94,15 @@ void x13sm3_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
@@ -180,13 +178,13 @@ void x13sm3_4way_hash( void *state, const void *input )
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_update( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
// Hamsi parallel 4x32x2
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

View File

@@ -17,7 +17,11 @@ bool register_x13sm3_algo( algo_gate_t* gate )
bool register_x13bcd_algo( algo_gate_t* gate )
{
#if defined (X13SM3_4WAY)
#if defined (X13BCD_8WAY)
init_x13bcd_8way_ctx();
gate->scanhash = (void*)&scanhash_x13bcd_8way;
gate->hash = (void*)&x13bcd_8way_hash;
#elif defined (X13BCD_4WAY)
init_x13bcd_4way_ctx();
gate->scanhash = (void*)&scanhash_x13bcd_4way;
gate->hash = (void*)&x13bcd_4way_hash;
@@ -26,7 +30,7 @@ bool register_x13bcd_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x13bcd;
gate->hash = (void*)&x13bcd_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};
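The registration follows the standard gate pattern: the widest SIMD variant compiled in wins, and optimizations advertises every feature set some variant can exploit. Schematically (hypothetical example names, mirroring the code above):

bool register_example_algo( algo_gate_t* gate )
{
#if defined(EXAMPLE_8WAY)            /* AVX512 build     */
   gate->scanhash = (void*)&scanhash_example_8way;
   gate->hash     = (void*)&example_8way_hash;
#elif defined(EXAMPLE_4WAY)          /* AVX2 + AES build */
   gate->scanhash = (void*)&scanhash_example_4way;
   gate->hash     = (void*)&example_4way_hash;
#else                                /* reference build  */
   gate->scanhash = (void*)&scanhash_example;
   gate->hash     = (void*)&example_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
   return true;
}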

View File

@@ -5,13 +5,11 @@
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X13SM3_4WAY
#define X13SM3_4WAY 1
#endif
bool register_x13sm3_algo( algo_gate_t* gate );
bool register_x13bcd_algo( algo_gate_t* gate );
#if defined(X13SM3_4WAY)
void x13sm3_4way_hash( void *state, const void *input );
@@ -19,18 +17,39 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13sm3_4way_ctx();
void x13bcd_4way_hash( void *state, const void *input );
int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13bcd_4way_ctx();
#endif
#else
void x13sm3_hash( void *state, const void *input );
int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13sm3_ctx();
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X13BCD_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X13BCD_4WAY 1
#endif
bool register_x13bcd_algo( algo_gate_t* gate );
#if defined(X13BCD_8WAY)
void x13bcd_8way_hash( void *state, const void *input );
int scanhash_x13bcd_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13bcd_8way_ctx();
#elif defined(X13BCD_4WAY)
void x13bcd_4way_hash( void *state, const void *input );
int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13bcd_4way_ctx();
#else
void x13bcd_hash( void *state, const void *input );
int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
@@ -38,3 +57,4 @@ void init_x13bcd_ctx();
#endif
#endif

View File

@@ -34,14 +34,14 @@ void polytimos_4way_hash( void *output, const void *input )
poly_4way_context_overlay ctx;
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// Need to convert from 64 bit interleaved to 32 bit interleaved.
uint32_t vhash32[16*4];
rintrlv_4x64_4x32( vhash32, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash32, 64 );
shabal512_4way_update( &ctx.shabal, vhash32, 64 );
shabal512_4way_close( &ctx.shabal, vhash32 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
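The conversion is needed because the 4-way Skein code interleaves one 64-bit word per lane while 4-way Shabal interleaves one 32-bit word per lane. A scalar reference for rintrlv_4x64_4x32 (a sketch assuming little-endian words; the real routine uses SIMD shuffles rather than a loop):

#include <stdint.h>
#include <stddef.h>

/* Move 32-bit word w of lane l from the 4x64 layout (one 64-bit word
   per lane in turn) to the 4x32 layout (one 32-bit word per lane in
   turn). bits is the per-lane data size, e.g. 512 for a 64-byte hash. */
static void rintrlv_4x64_4x32_ref( uint32_t *dst, const uint32_t *src,
                                   size_t bits )
{
   for ( size_t w = 0; w < bits/32; w++ )
      for ( size_t l = 0; l < 4; l++ )
         dst[ w*4 + l ] = src[ (w/2)*8 + l*2 + (w & 1) ];
}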

View File

@@ -38,7 +38,7 @@ void veltor_4way_hash( void *output, const void *input )
veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -55,7 +55,7 @@ void veltor_4way_hash( void *output, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

View File

@@ -325,11 +325,11 @@ void x14_4way_hash( void *state, const void *input )
memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -348,15 +348,15 @@ void x14_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
@@ -420,7 +420,7 @@ void x14_4way_hash( void *state, const void *input )
// 12 Hamsi parallel 4way 32 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

View File

@@ -374,11 +374,11 @@ void x15_4way_hash( void *state, const void *input )
memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -397,15 +397,15 @@ void x15_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
@@ -469,7 +469,7 @@ void x15_4way_hash( void *state, const void *input )
// 12 Hamsi parallel 4way 32 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

View File

@@ -463,11 +463,11 @@ void x16r_4way_hash( void* output, const void* input )
case BLAKE:
blake512_4way_init( &ctx.blake );
if ( i == 0 )
blake512_4way( &ctx.blake, input, size );
blake512_4way_update( &ctx.blake, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way( &ctx.blake, vhash, size );
blake512_4way_update( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -475,11 +475,11 @@ void x16r_4way_hash( void* output, const void* input )
case BMW:
bmw512_4way_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way( &ctx.bmw, input, size );
bmw512_4way_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way( &ctx.bmw, vhash, size );
bmw512_4way_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -501,11 +501,11 @@ void x16r_4way_hash( void* output, const void* input )
case SKEIN:
skein512_4way_init( &ctx.skein );
if ( i == 0 )
skein512_4way( &ctx.skein, input, size );
skein512_4way_update( &ctx.skein, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way( &ctx.skein, vhash, size );
skein512_4way_update( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -513,11 +513,11 @@ void x16r_4way_hash( void* output, const void* input )
case JH:
jh512_4way_init( &ctx.jh );
if ( i == 0 )
jh512_4way( &ctx.jh, input, size );
jh512_4way_update( &ctx.jh, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way( &ctx.jh, vhash, size );
jh512_4way_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -525,11 +525,11 @@ void x16r_4way_hash( void* output, const void* input )
case KECCAK:
keccak512_4way_init( &ctx.keccak );
if ( i == 0 )
keccak512_4way( &ctx.keccak, input, size );
keccak512_4way_update( &ctx.keccak, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way( &ctx.keccak, vhash, size );
keccak512_4way_update( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -599,7 +599,7 @@ void x16r_4way_hash( void* output, const void* input )
case HAMSI:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
@@ -620,7 +620,7 @@ void x16r_4way_hash( void* output, const void* input )
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
break;
@@ -641,7 +641,7 @@ void x16r_4way_hash( void* output, const void* input )
case SHA_512:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_update( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;

View File

@@ -52,10 +52,10 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined (X16RV2_8WAY)
gate->scanhash = (void*)&scanhash_x16rv2_8way;
gate->hash = (void*)&x16rv2_8way_hash;
#elif defined (X16R_4WAY)
#elif defined (X16RV2_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#else
@@ -205,10 +205,10 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16rt_8way_hash;
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
@@ -222,10 +222,10 @@ bool register_x16rt_algo( algo_gate_t* gate )
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16rt_8way_hash;
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
@@ -258,16 +258,23 @@ bool register_hex_algo( algo_gate_t* gate )
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X21S_8WAY)
gate->scanhash = (void*)&scanhash_x21s_8way;
gate->hash = (void*)&x21s_8way_hash;
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
#elif defined (X21S_4WAY)
gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#else
gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;

View File

@@ -12,6 +12,24 @@
#define X16R_4WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16RV2_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RV2_4WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16RT_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RT_4WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X21S_4WAY 1
#endif
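All of the new 8-way gates test the same four AVX-512 subsets; requiring F, VL, DQ and BW together pins the 8-way paths to Skylake-X-class cores rather than any CPU that merely reports AVX512F. The repeated block could be factored into a single feature macro, e.g. (hypothetical name, a possible cleanup rather than anything in the commit):

/* One shared feature test instead of one copy per algo. */
#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define CPUMINER_AVX512 1
#endif

#if defined(CPUMINER_AVX512)
  #define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
  #define X21S_4WAY 1
#endif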
enum x16r_Algo {
BLAKE = 0,
BMW,
@@ -46,18 +64,13 @@ bool register_x16rt_algo( algo_gate_t* gate );
bool register_hex_algo( algo_gate_t* gate );
bool register_x21s_algo( algo_gate_t* gate );
// x16r, x16s
#if defined(X16R_8WAY)
void x16r_8way_hash( void *state, const void *input );
int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rv2_8way_hash( void *state, const void *input );
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
@@ -65,31 +78,65 @@ void x16r_4way_hash( void *state, const void *input );
int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rv2_4way_hash( void *state, const void *input );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x16r_hash( void *state, const void *input );
int scanhash_x16r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
// x16Rv2
#if defined(X16RV2_8WAY)
void x16rv2_8way_hash( void *state, const void *input );
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RV2_4WAY)
void x16rv2_4way_hash( void *state, const void *input );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x16rv2_hash( void *state, const void *input );
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
// x16rt, veil
#if defined(X16RT_8WAY)
void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RT_4WAY)
void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x16rt_hash( void *state, const void *input );
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(X16R_4WAY)
// x21s
#if defined(X21S_8WAY)
void x21s_8way_hash( void *state, const void *input );
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_8way_thread_init();
#elif defined(X21S_4WAY)
void x21s_4way_hash( void *state, const void *input );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,

View File

@@ -24,7 +24,7 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
union _x16rt_8way_context_overlay
{
@@ -407,7 +407,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
union _x16rt_4way_context_overlay
{
@@ -458,11 +458,11 @@ void x16rt_4way_hash( void* output, const void* input )
case BLAKE:
blake512_4way_init( &ctx.blake );
if ( i == 0 )
blake512_4way( &ctx.blake, input, size );
blake512_4way_update( &ctx.blake, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way( &ctx.blake, vhash, size );
blake512_4way_update( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -470,11 +470,11 @@ void x16rt_4way_hash( void* output, const void* input )
case BMW:
bmw512_4way_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way( &ctx.bmw, input, size );
bmw512_4way_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way( &ctx.bmw, vhash, size );
bmw512_4way_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -496,11 +496,11 @@ void x16rt_4way_hash( void* output, const void* input )
case SKEIN:
skein512_4way_init( &ctx.skein );
if ( i == 0 )
skein512_4way( &ctx.skein, input, size );
skein512_4way_update( &ctx.skein, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way( &ctx.skein, vhash, size );
skein512_4way_update( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -508,11 +508,11 @@ void x16rt_4way_hash( void* output, const void* input )
case JH:
jh512_4way_init( &ctx.jh );
if ( i == 0 )
jh512_4way( &ctx.jh, input, size );
jh512_4way_update( &ctx.jh, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way( &ctx.jh, vhash, size );
jh512_4way_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -520,11 +520,11 @@ void x16rt_4way_hash( void* output, const void* input )
case KECCAK:
keccak512_4way_init( &ctx.keccak );
if ( i == 0 )
keccak512_4way( &ctx.keccak, input, size );
keccak512_4way_update( &ctx.keccak, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way( &ctx.keccak, vhash, size );
keccak512_4way_update( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -594,7 +594,7 @@ void x16rt_4way_hash( void* output, const void* input )
case HAMSI:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
@@ -615,7 +615,7 @@ void x16rt_4way_hash( void* output, const void* input )
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
@@ -636,7 +636,7 @@ void x16rt_4way_hash( void* output, const void* input )
case SHA_512:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_update( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;

View File

@@ -31,7 +31,7 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16R_8WAY)
#if defined (X16RV2_8WAY)
union _x16rv2_8way_context_overlay
{
@@ -497,10 +497,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_4WAY)
#elif defined (X16RV2_4WAY)
union _x16rv2_4way_context_overlay
{
@@ -556,11 +553,11 @@ void x16rv2_4way_hash( void* output, const void* input )
case BLAKE:
blake512_4way_init( &ctx.blake );
if ( i == 0 )
blake512_4way( &ctx.blake, input, size );
blake512_4way_update( &ctx.blake, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way( &ctx.blake, vhash, size );
blake512_4way_update( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -568,11 +565,11 @@ void x16rv2_4way_hash( void* output, const void* input )
case BMW:
bmw512_4way_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way( &ctx.bmw, input, size );
bmw512_4way_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way( &ctx.bmw, vhash, size );
bmw512_4way_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -594,11 +591,11 @@ void x16rv2_4way_hash( void* output, const void* input )
case SKEIN:
skein512_4way_init( &ctx.skein );
if ( i == 0 )
skein512_4way( &ctx.skein, input, size );
skein512_4way_update( &ctx.skein, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way( &ctx.skein, vhash, size );
skein512_4way_update( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -606,11 +603,11 @@ void x16rv2_4way_hash( void* output, const void* input )
case JH:
jh512_4way_init( &ctx.jh );
if ( i == 0 )
jh512_4way( &ctx.jh, input, size );
jh512_4way_update( &ctx.jh, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way( &ctx.jh, vhash, size );
jh512_4way_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -634,7 +631,7 @@ void x16rv2_4way_hash( void* output, const void* input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
@@ -724,7 +721,7 @@ void x16rv2_4way_hash( void* output, const void* input )
case HAMSI:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
@@ -745,7 +742,7 @@ void x16rv2_4way_hash( void* output, const void* input )
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
@@ -782,7 +779,7 @@ void x16rv2_4way_hash( void* output, const void* input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;

View File

@@ -1,13 +1,10 @@
/**
* x16r algo implementation
* x21s algo implementation
*
* Implementation by tpruvot@github Jan 2018
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -21,6 +18,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -38,6 +36,483 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X21S_8WAY)
static __thread uint64_t* x21s_8way_matrix;
union _x21s_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sha256_8way_context sha256;
} __attribute__ ((aligned (64)));
typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;
void x21s_8way_hash( void* output, const void* input )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
x21s_8way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
void *in4 = (void*) hash4;
void *in5 = (void*) hash5;
void *in6 = (void*) hash6;
void *in7 = (void*) hash7;
int size = 80;
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
if ( i == 0 )
blake512_8way_update( &ctx.blake, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_update( &ctx.blake, vhash, size );
}
blake512_8way_close( &ctx.blake, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)in4, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)in5, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)in6, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)in7, size<<3 );
break;
case SKEIN:
skein512_8way_init( &ctx.skein );
if ( i == 0 )
skein512_8way_update( &ctx.skein, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case JH:
jh512_8way_init( &ctx.jh );
if ( i == 0 )
jh512_8way_update( &ctx.jh, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
keccak512_8way_init( &ctx.keccak );
if ( i == 0 )
keccak512_8way_update( &ctx.keccak, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
keccak512_8way_update( &ctx.keccak, vhash, size );
}
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case LUFFA:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case CUBEHASH:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in4, size );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in5, size );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in6, size );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in7, size );
sph_shavite512_close( &ctx.shavite, hash7 );
break;
case SIMD:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
(const BitSequence*)in4, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
(const BitSequence*)in5, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
(const BitSequence*)in6, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
(const BitSequence*)in7, size<<3 );
break;
case HAMSI:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in4, size );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in5, size );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in6, size );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in7, size );
sph_fugue512_close( &ctx.fugue, hash7 );
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in4, size );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in5, size );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in6, size );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in7, size );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
break;
case SHA_512:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, size );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
}
size = 64;
}
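// x21s appends five fixed stages to the 16-round chain:
// Haval-256, Tiger, Lyra2REV2, GOST-512 and SHA-256.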
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash2, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash3, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash4, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash5, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash6, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash7, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash7 );
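// Lyra2REV2 is vectorized only 2 lanes wide here, so the 8 lanes are
// processed as four interleaved pairs.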
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash2, 64 );
sph_gost512_close( &ctx.gost, (void*) hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash3, 64 );
sph_gost512_close( &ctx.gost, (void*) hash3 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash4, 64 );
sph_gost512_close( &ctx.gost, (void*) hash4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash5, 64 );
sph_gost512_close( &ctx.gost, (void*) hash5 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash6, 64 );
sph_gost512_close( &ctx.gost, (void*) hash6 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash7, 64 );
sph_gost512_close( &ctx.gost, (void*) hash7 );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhash, 64 );
sha256_8way_close( &ctx.sha256, output );
}
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<3];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
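// The algorithm order is keyed off the previous block hash, so it only
// changes with a new block; recompute it only when ntime moves.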
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
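// Blend 8 consecutive big-endian nonces into the interleaved header, one
// per 64-bit lane, avoiding a full re-interleave each iteration.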
x21s_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( ( n < last_nonce ) && !(*restart) );
*hashes_done = n - first_nonce;
return 0;
}
bool x21s_8way_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows
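// Allocate twice the single-lane matrix: LYRA2REV2_2WAY interleaves two
// lanes into one working buffer.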
x21s_8way_matrix = _mm_malloc( 2 * size, 64 );
return x21s_8way_matrix;
}
#elif defined (X21S_4WAY)
static __thread uint64_t* x21s_4way_matrix;
union _x21s_4way_context_overlay
@@ -103,11 +578,11 @@ void x21s_4way_hash( void* output, const void* input )
case BLAKE:
blake512_4way_init( &ctx.blake );
if ( i == 0 )
blake512_4way( &ctx.blake, input, size );
blake512_4way_update( &ctx.blake, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way( &ctx.blake, vhash, size );
blake512_4way_update( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -115,11 +590,11 @@ void x21s_4way_hash( void* output, const void* input )
case BMW:
bmw512_4way_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way( &ctx.bmw, input, size );
bmw512_4way_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way( &ctx.bmw, vhash, size );
bmw512_4way_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -141,11 +616,11 @@ void x21s_4way_hash( void* output, const void* input )
case SKEIN:
skein512_4way_init( &ctx.skein );
if ( i == 0 )
skein512_4way( &ctx.skein, input, size );
skein512_4way_update( &ctx.skein, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way( &ctx.skein, vhash, size );
skein512_4way_update( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -153,11 +628,11 @@ void x21s_4way_hash( void* output, const void* input )
case JH:
jh512_4way_init( &ctx.jh );
if ( i == 0 )
jh512_4way( &ctx.jh, input, size );
jh512_4way_update( &ctx.jh, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way( &ctx.jh, vhash, size );
jh512_4way_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -165,11 +640,11 @@ void x21s_4way_hash( void* output, const void* input )
case KECCAK:
keccak512_4way_init( &ctx.keccak );
if ( i == 0 )
keccak512_4way( &ctx.keccak, input, size );
keccak512_4way_update( &ctx.keccak, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way( &ctx.keccak, vhash, size );
keccak512_4way_update( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -239,7 +714,7 @@ void x21s_4way_hash( void* output, const void* input )
case HAMSI:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
@@ -260,7 +735,7 @@ void x21s_4way_hash( void* output, const void* input )
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
@@ -281,7 +756,7 @@ void x21s_4way_hash( void* output, const void* input )
case SHA_512:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_update( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
@@ -292,7 +767,7 @@ void x21s_4way_hash( void* output, const void* input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhash, 64 );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -356,7 +831,7 @@ void x21s_4way_hash( void* output, const void* input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way_init( &ctx.sha256 );
sha256_4way( &ctx.sha256, vhash, 64 );
sha256_4way_update( &ctx.sha256, vhash, 64 );
sha256_4way_close( &ctx.sha256, vhash );
dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 );

File diff suppressed because it is too large

View File

@@ -2,8 +2,10 @@
bool register_sonoa_algo( algo_gate_t* gate )
{
#if defined (SONOA_4WAY)
// init_sonoa_4way_ctx();
#if defined (SONOA_8WAY)
gate->scanhash = (void*)&scanhash_sonoa_8way;
gate->hash = (void*)&sonoa_8way_hash;
#elif defined (SONOA_4WAY)
gate->scanhash = (void*)&scanhash_sonoa_4way;
gate->hash = (void*)&sonoa_4way_hash;
#else
@@ -11,7 +13,7 @@ bool register_sonoa_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sonoa;
gate->hash = (void*)&sonoa_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,29 +4,33 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define SONOA_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SONOA_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define SONOA_4WAY 1
#endif
bool register_sonoa_algo( algo_gate_t* gate );
#if defined(SONOA_4WAY)
#if defined(SONOA_8WAY)
void sonoa_8way_hash( void *state, const void *input );
int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SONOA_4WAY)
void sonoa_4way_hash( void *state, const void *input );
int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void init_sonoa_4way_ctx();
#endif
#else
void sonoa_hash( void *state, const void *input );
int scanhash_sonoa( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_ctx();
#endif
#endif

View File

@@ -1,7 +1,4 @@
#include "x17-gate.h"
#if defined(X17_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,6 +11,7 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -24,6 +22,309 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#if defined(X17_8WAY)
union _x17_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
void x17_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[8*8] __attribute__ ((aligned (64)));
uint64_t vhash1[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x17_8way_context_overlay ctx;
// 1 Blake parallel 8 way 64 bit
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// Serialize
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 3 Groestl
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
// Parallelize
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
// 4 Skein parallel 8 way 64 bit
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// 5 JH
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
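// Luffa and Cubehash run 4-way over 128-bit lanes: convert the 8x64
// interleaved state into two 4x128 halves without a full deinterleave.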
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
// 7 Luffa
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
// 8 Cubehash
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
// 9 Shavite
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
// 10 Simd
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
// 12 Hamsi parallel 8 way 64 bit
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue serial
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
// 14 Shabal parallel 8 way 32 bit
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash4, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash5, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash6, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash7, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
// 16 SHA512 parallel 64 bit
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
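// Haval is a 32-bit algorithm: switch the lane width from 64 to 32 bits.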
rintrlv_8x64_8x32( vhash0, vhash, 512 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash0, 64 );
haval256_5_8way_close( &ctx.haval, state );
}
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
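// Byte-swap the 80-byte header once and interleave it across 8 lanes;
// only the nonce word is rewritten inside the scan loop.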
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[ lane ] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X17_4WAY)
union _x17_4way_context_overlay
{
blake512_4way_context blake;
@@ -59,12 +360,12 @@ void x17_4way_hash( void *state, const void *input )
// 1 Blake parallel 4 way 64 bit
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serialize
@@ -85,17 +386,17 @@ void x17_4way_hash( void *state, const void *input )
// 4 Skein parallel 4 way 64 bit
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// 7 Luffa parallel 2 way 128 bit
@@ -127,7 +428,6 @@ void x17_4way_hash( void *state, const void *input )
dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -146,7 +446,7 @@ void x17_4way_hash( void *state, const void *input )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -169,7 +469,7 @@ void x17_4way_hash( void *state, const void *input )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -192,14 +492,14 @@ void x17_4way_hash( void *state, const void *input )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_update( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}

View File

@@ -2,14 +2,17 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_4WAY)
#if defined (X17_8WAY)
gate->scanhash = (void*)&scanhash_x17_8way;
gate->hash = (void*)&x17_8way_hash;
#elif defined (X17_4WAY)
gate->scanhash = (void*)&scanhash_x17_4way;
gate->hash = (void*)&x17_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,13 +4,20 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X17_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X17_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X17_4WAY 1
#endif
bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_4WAY)
#if defined(X17_8WAY)
void x17_8way_hash( void *state, const void *input );
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X17_4WAY)
void x17_4way_hash( void *state, const void *input );
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,

View File

@@ -1,7 +1,4 @@
#include "xevan-gate.h"
#if defined(XEVAN_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -15,6 +12,7 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -25,6 +23,515 @@
#include "algo/sha/sha-hash-4way.h"
#include "algo/haval/haval-hash-4way.h"
#if defined(XEVAN_8WAY)
union _xevan_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
} __attribute__ ((aligned (64)));
typedef union _xevan_8way_context_overlay xevan_8way_context_overlay;
void xevan_8way_hash( void *output, const void *input )
{
uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t hash4[16] __attribute__ ((aligned (64)));
uint64_t hash5[16] __attribute__ ((aligned (64)));
uint64_t hash6[16] __attribute__ ((aligned (64)));
uint64_t hash7[16] __attribute__ ((aligned (64)));
const int dataLen = 128;
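// Xevan works on a 128-byte block: every 64-byte digest is zero-padded
// to dataLen before the next algorithm, and the 17-step chain runs twice.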
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
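// Zero-extend the 64-byte Blake digest to the 128-byte working size.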
memset( &vhash[8<<3], 0, 64<<3 );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, dataLen );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, dataLen );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, dataLen );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, dataLen );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, dataLen );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, dataLen );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, dataLen );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, dataLen );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, dataLen );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, dataLen );
sph_fugue512_close( &ctx.fugue, hash7 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, vhashA );
rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 );
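// Second pass: zero-extend the 32-byte Haval result to 128 bytes and run
// the full chain again.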
memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, vhash, dataLen );
blake512_8way_close(&ctx.blake, vhash);
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, dataLen );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, dataLen );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, dataLen );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, dataLen );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, dataLen );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, dataLen );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, dataLen );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, dataLen );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, dataLen );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, dataLen );
sph_fugue512_close( &ctx.fugue, hash7 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, output );
}
int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
xevan_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[ lane ] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(XEVAN_4WAY)
union _xevan_4way_context_overlay
{
blake512_4way_context blake;
@@ -62,12 +569,12 @@ void xevan_4way_hash( void *output, const void *input )
// parallel 4 way
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close(&ctx.blake, vhash);
memset( &vhash[8<<2], 0, 64<<2 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -90,15 +597,15 @@ void xevan_4way_hash( void *output, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_update( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_update( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_update( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -142,7 +649,7 @@ void xevan_4way_hash( void *output, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -164,7 +671,7 @@ void xevan_4way_hash( void *output, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_update( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -186,13 +693,13 @@ void xevan_4way_hash( void *output, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, dataLen );
sha512_4way_update( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, dataLen );
haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, vhashA );
rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 );
@@ -200,11 +707,11 @@ void xevan_4way_hash( void *output, const void *input )
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, dataLen );
blake512_4way_update( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -225,15 +732,15 @@ void xevan_4way_hash( void *output, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_update( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_update( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_update( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -277,7 +784,7 @@ void xevan_4way_hash( void *output, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -298,7 +805,7 @@ void xevan_4way_hash( void *output, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_update( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -319,13 +826,13 @@ void xevan_4way_hash( void *output, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, dataLen );
sha512_4way_update( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, dataLen );
haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, output );
}

View File

@@ -2,8 +2,10 @@
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
// init_xevan_4way_ctx();
#if defined (XEVAN_8WAY)
gate->scanhash = (void*)&scanhash_xevan_8way;
gate->hash = (void*)&xevan_8way_hash;
#elif defined (XEVAN_4WAY)
gate->scanhash = (void*)&scanhash_xevan_4way;
gate->hash = (void*)&xevan_4way_hash;
#else

View File

@@ -4,13 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define XEVAN_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define XEVAN_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define XEVAN_4WAY 1
#endif
bool register_xevan_algo( algo_gate_t* gate );
#if defined(XEVAN_4WAY)
#if defined(XEVAN_8WAY)
void xevan_8way_hash( void *state, const void *input );
int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(XEVAN_4WAY)
void xevan_4way_hash( void *state, const void *input );
@@ -19,7 +27,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
//void init_xevan_4way_ctx();
#endif
#else
void xevan_hash( void *state, const void *input );
@@ -30,3 +38,4 @@ void init_xevan_ctx();
#endif
#endif
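The 8-way path is gated on four separate AVX512 subfeature macros because the vector code relies on VL, DQ and BW instructions, not just the AVX512F foundation. A self-contained sketch of how this compile-time dispatch resolves; MYALGO_* is a hypothetical stand-in for the XEVAN_* macros:

#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define MYALGO_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
  #define MYALGO_4WAY 1
#endif

#include <stdio.h>

int main( void )
{
#if defined(MYALGO_8WAY)
   puts( "8-way AVX512 path selected" );
#elif defined(MYALGO_4WAY)
   puts( "4-way AVX2+AES path selected" );
#else
   puts( "scalar reference path selected" );
#endif
   return 0;
}

Exactly one width is defined at build time, so the register_*_algo functions can select the matching scanhash/hash pair with plain #if blocks and never need a runtime CPU check.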

View File

@@ -1,7 +1,4 @@
#include "x22i-gate.h"
#if defined(X22I_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -12,6 +9,7 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -25,6 +23,424 @@
#include "algo/gost/sph_gost.h"
#include "algo/swifftx/swifftx.h"
#if defined(X22I_8WAY)
union _x22i_8way_ctx_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
hashState_echo echo;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sha256_8way_context sha256;
};
typedef union _x22i_8way_ctx_overlay x22i_8way_ctx_overlay;
void x22i_8way_hash( void *output, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8*4] __attribute__ ((aligned (64)));
uint64_t hash1[8*4] __attribute__ ((aligned (64)));
uint64_t hash2[8*4] __attribute__ ((aligned (64)));
uint64_t hash3[8*4] __attribute__ ((aligned (64)));
uint64_t hash4[8*4] __attribute__ ((aligned (64)));
uint64_t hash5[8*4] __attribute__ ((aligned (64)));
uint64_t hash6[8*4] __attribute__ ((aligned (64)));
uint64_t hash7[8*4] __attribute__ ((aligned (64)));
// unsigned char hash[64 * 4] __attribute__((aligned(64))) = {0};
unsigned char hashA0[64] __attribute__((aligned(64))) = {0};
unsigned char hashA1[64] __attribute__((aligned(32))) = {0};
unsigned char hashA2[64] __attribute__((aligned(32))) = {0};
unsigned char hashA3[64] __attribute__((aligned(32))) = {0};
unsigned char hashA4[64] __attribute__((aligned(64))) = {0};
unsigned char hashA5[64] __attribute__((aligned(32))) = {0};
unsigned char hashA6[64] __attribute__((aligned(32))) = {0};
unsigned char hashA7[64] __attribute__((aligned(32))) = {0};
x22i_8way_ctx_overlay ctx;
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)hash3, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)hash4, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)hash5, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)hash6, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash0,
(const BitSequence*)hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash1,
(const BitSequence*)hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash2,
(const BitSequence*)hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash3,
(const BitSequence*)hash3, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash4,
(const BitSequence*)hash4, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash5,
(const BitSequence*)hash5, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash6,
(const BitSequence*)hash6, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash7,
(const BitSequence*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8],
&hash4[8], &hash5[8], &hash6[8], &hash7[8], vhash );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash0[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash0[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash1[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash1[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash2[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash2[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash3[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash3[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash4[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash4[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash5[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash5[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash6[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash6[16] );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, &hash7[8], 64 );
sph_whirlpool_close( &ctx.whirlpool, &hash7[16] );
intrlv_8x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16],
&hash4[16], &hash5[16], &hash6[16], &hash7[16] );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24],
&hash4[24], &hash5[24], &hash6[24], &hash7[24], vhash );
ComputeSingleSWIFFTX((unsigned char*)hash0, (unsigned char*)hashA0);
ComputeSingleSWIFFTX((unsigned char*)hash1, (unsigned char*)hashA1);
ComputeSingleSWIFFTX((unsigned char*)hash2, (unsigned char*)hashA2);
ComputeSingleSWIFFTX((unsigned char*)hash3, (unsigned char*)hashA3);
ComputeSingleSWIFFTX((unsigned char*)hash4, (unsigned char*)hashA4);
ComputeSingleSWIFFTX((unsigned char*)hash5, (unsigned char*)hashA5);
ComputeSingleSWIFFTX((unsigned char*)hash6, (unsigned char*)hashA6);
ComputeSingleSWIFFTX((unsigned char*)hash7, (unsigned char*)hashA7);
intrlv_8x32_512( vhashA, hashA0, hashA1, hashA2, hashA3,
hashA4, hashA5, hashA6, hashA7 );
memset( vhash, 0, 64*8 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
memset( hashA0, 0, 64 );
memset( hashA1, 0, 64 );
memset( hashA2, 0, 64 );
memset( hashA3, 0, 64 );
memset( hashA4, 0, 64 );
memset( hashA5, 0, 64 );
memset( hashA6, 0, 64 );
memset( hashA7, 0, 64 );
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash0, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA0);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash1, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA1);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash2, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA2);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash3, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA3);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash4, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA4);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash5, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA5);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash6, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA6);
sph_tiger_init(&ctx.tiger);
sph_tiger (&ctx.tiger, (const void*) hash7, 64);
sph_tiger_close(&ctx.tiger, (void*) hashA7);
memset( hash0, 0, 64 );
memset( hash1, 0, 64 );
memset( hash2, 0, 64 );
memset( hash3, 0, 64 );
memset( hash4, 0, 64 );
memset( hash5, 0, 64 );
memset( hash6, 0, 64 );
memset( hash7, 0, 64 );
intrlv_2x256( vhash, hashA0, hashA1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hashA2, hashA3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hashA4, hashA5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hashA6, hashA7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash2, 64 );
sph_gost512_close( &ctx.gost, (void*) hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash3, 64 );
sph_gost512_close( &ctx.gost, (void*) hash3 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash4, 64 );
sph_gost512_close( &ctx.gost, (void*) hash4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash5, 64 );
sph_gost512_close( &ctx.gost, (void*) hash5 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash6, 64 );
sph_gost512_close( &ctx.gost, (void*) hash6 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash7, 64 );
sph_gost512_close( &ctx.gost, (void*) hash7 );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhash, 64 );
sha256_8way_close( &ctx.sha256, output );
}
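Worth noting in the 8-way x22i above: the later stages append their digests deeper into the same per-lane buffer rather than overwriting it. Shabal's output is de-interleaved to &hashN[8], Whirlpool's to &hashN[16], SHA512's to &hashN[24] (64-bit word offsets), and ComputeSingleSWIFFTX then reads each lane buffer from the start. Going only by that call pattern, SWIFFTX appears to consume the 256-byte concatenation of the four preceding 64-byte digests (Fugue, Shabal, Whirlpool, SHA-512). A hedged layout sketch, with hypothetical names:

#include <stdint.h>
#include <string.h>

typedef struct
{
   uint64_t stage[4][8];   /* 4 consecutive 64-byte digests = 256 bytes */
} x22i_lane_sketch;

/* Append one stage's 64-byte digest at its slot, as the offsets suggest. */
static void append_digest( x22i_lane_sketch *lane, int slot,
                           const void *digest )
{
   memcpy( lane->stage[ slot ], digest, 64 );
}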
int scanhash_x22i_8way( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 8;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x08ff;
InitializeSWIFFTX();
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x22i_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if unlikely( ( hash7[ lane ] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
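The scan loop above pays for a full lane extraction only after the cheap filter on hash7[lane], the high word of each lane's digest. A sketch of what extr_lane_8x32 appears to do given the 8-way 32-bit interleave (the _sketch suffix marks it as an illustration, not the library call):

#include <stdint.h>

/* Gather one lane's digest from an 8-way interleaved buffer:
   32-bit word i of lane k sits at src[ 8*i + k ]. */
static void extr_lane_8x32_sketch( uint32_t *dst, const uint32_t *src,
                                   int lane, int bits )
{
   for ( int i = 0; i < ( bits >> 5 ); i++ )
      dst[ i ] = src[ 8*i + lane ];
}

With bits = 256 this copies the eight words of one candidate hash into lane_hash so fulltest can compare it against the full target.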
#elif defined(X22I_4WAY)
union _x22i_4way_ctx_overlay
{
blake512_4way_context blake;
@@ -59,8 +475,6 @@ void x22i_4way_hash( void *output, const void *input )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
// unsigned char hash[64 * 4] __attribute__((aligned(64))) = {0};
unsigned char hashA0[64] __attribute__((aligned(64))) = {0};
unsigned char hashA1[64] __attribute__((aligned(32))) = {0};
unsigned char hashA2[64] __attribute__((aligned(32))) = {0};
@@ -68,13 +482,12 @@ void x22i_4way_hash( void *output, const void *input )
x22i_ctx_overlay ctx;
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
init_groestl( &ctx.groestl, 64 );
@@ -93,15 +506,15 @@ void x22i_4way_hash( void *output, const void *input )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -142,13 +555,11 @@ void x22i_4way_hash( void *output, const void *input )
update_final_echo ( &ctx.echo, (BitSequence*)hash3,
(const BitSequence*)hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
sph_fugue512_init( &ctx.fugue );
@@ -167,9 +578,8 @@ void x22i_4way_hash( void *output, const void *input )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8], vhash );
sph_whirlpool_init( &ctx.whirlpool );
@@ -188,12 +598,10 @@ void x22i_4way_hash( void *output, const void *input )
intrlv_4x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash );
// InitializeSWIFFTX();
ComputeSingleSWIFFTX((unsigned char*)hash0, (unsigned char*)hashA0);
ComputeSingleSWIFFTX((unsigned char*)hash1, (unsigned char*)hashA1);
ComputeSingleSWIFFTX((unsigned char*)hash2, (unsigned char*)hashA2);
@@ -204,9 +612,8 @@ void x22i_4way_hash( void *output, const void *input )
memset( vhash, 0, 64*4 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, 64 );
haval256_5_4way_update( &ctx.haval, vhashA, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
memset( hashA0, 0, 64 );
@@ -257,10 +664,8 @@ void x22i_4way_hash( void *output, const void *input )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
sha256_4way_init( &ctx.sha256 );
sha256_4way( &ctx.sha256, vhash, 64 );
sha256_4way_update( &ctx.sha256, vhash, 64 );
sha256_4way_close( &ctx.sha256, output );
// memcpy(output, hash, 32);
}
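Luffa, CubeHash and SIMD run in this build as 2-way 128-bit implementations, so rintrlv_4x64_2x128 re-shuffles one 4-lane 64-bit-interleaved buffer into two 2-lane 128-bit-interleaved buffers without a round trip through scalar lane buffers. A plausible reading of that conversion, sketched under the layout assumptions already described (not the library code):

#include <stdint.h>

static void rintrlv_4x64_2x128_sketch( uint64_t *dstA, uint64_t *dstB,
                                       const uint64_t *src, int bits )
{
   const int words = bits >> 6;              /* 64-bit words per lane */
   for ( int i = 0; i < words; i += 2 )
   {
      dstA[ 2*i     ] = src[ 4*i         ];  /* lane 0, word i   */
      dstA[ 2*i + 1 ] = src[ 4*(i+1)     ];  /* lane 0, word i+1 */
      dstA[ 2*i + 2 ] = src[ 4*i     + 1 ];  /* lane 1, word i   */
      dstA[ 2*i + 3 ] = src[ 4*(i+1) + 1 ];  /* lane 1, word i+1 */
      dstB[ 2*i     ] = src[ 4*i     + 2 ];  /* lane 2, word i   */
      dstB[ 2*i + 1 ] = src[ 4*(i+1) + 2 ];  /* lane 2, word i+1 */
      dstB[ 2*i + 2 ] = src[ 4*i     + 3 ];  /* lane 3, word i   */
      dstB[ 2*i + 3 ] = src[ 4*(i+1) + 3 ];  /* lane 3, word i+1 */
   }
}

Pairing lanes 0/1 into dstA and 2/3 into dstB keeps each 128-bit vector element holding two consecutive words of one lane, which is the shape the 2-way Luffa/Cube/SIMD updates consume.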

View File

@@ -1,28 +1,44 @@
#include "x22i-gate.h"
// Ryzen has poor AVX2 performance so use SHA over AVX2.
// Intel has AVX512 so use AVX512 over SHA.
// When Ryzen AVX2 improves use AVX2 over SHA.
bool register_x22i_algo( algo_gate_t* gate )
{
#if defined (X22I_4WAY)
#if defined (X22I_8WAY)
gate->scanhash = (void*)&scanhash_x22i_8way;
gate->hash = (void*)&x22i_8way_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
#elif defined (X22I_4WAY)
gate->scanhash = (void*)&scanhash_x22i_4way;
gate->hash = (void*)&x22i_4way_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#else
gate->scanhash = (void*)&scanhash_x22i;
gate->hash = (void*)&x22i_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
return true;
};
bool register_x25x_algo( algo_gate_t* gate )
{
#if defined (X22I_4WAY)
#if defined (X25X_8WAY)
gate->scanhash = (void*)&scanhash_x25x_8way;
gate->hash = (void*)&x25x_8way_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
#elif defined (X25X_4WAY)
gate->scanhash = (void*)&scanhash_x25x_4way;
gate->hash = (void*)&x25x_4way_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#else
gate->scanhash = (void*)&scanhash_x25x;
gate->hash = (void*)&x25x_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
return true;
};
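The comments at the top of this file record a policy, not a limitation: when SHA extensions and wide vectors are both available, whichever is faster for the CPU family wins. That is why the 8-way branch advertises AVX512 but not SHA, while the 4-way and scalar branches keep SHA_OPT; expect these flag sets to be retuned as Ryzen's AVX2 throughput improves, as the comment anticipates.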

View File

@@ -6,30 +6,60 @@
#include <stdint.h>
#include <unistd.h>
#if defined(__AVX2__) && defined(__AES__)
#define X22I_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X22I_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X22I_4WAY 1
#endif
bool register_x22i__algo( algo_gate_t* gate );
bool register_x22i_algo( algo_gate_t* gate );
#if defined(X22I_4WAY)
#if defined(X22I_8WAY)
void x22i_8way_hash( void *state, const void *input );
int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X22I_4WAY)
void x22i_4way_hash( void *state, const void *input );
int scanhash_x22i_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x25x_4way_hash( void *state, const void *input );
int scanhash_x25x_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#else
void x22i_hash( void *state, const void *input );
int scanhash_x22i( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X25X_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X25X_4WAY 1
#endif
bool register_x25i_algo( algo_gate_t* gate );
#if defined(X25X_8WAY)
void x25x_8way_hash( void *state, const void *input );
int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X25X_4WAY)
void x25x_4way_hash( void *state, const void *input );
int scanhash_x25x_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x25x_hash( void *state, const void *input );
int scanhash_x25x( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif // X22I_GATE_H__

File diff suppressed because it is too large.

View File

@@ -4,7 +4,7 @@
# during development. However the information contained may provide compilation
# tips to users.
rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen
rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen > /dev/null
make distclean || echo clean
rm -f config.status

build-avx2.sh: new executable file, 27 lines
View File

@@ -0,0 +1,27 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4
strip -s cpuminer

clean-all.sh: new executable file, 10 lines
View File

@@ -0,0 +1,10 @@
#!/bin/bash
#
# make clean and rm all the targeted executables.
rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen > /dev/null
rm cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-aes-avx.exe cpuminer-aes-sse42.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-sse2.exe cpuminer-zen.exe > /dev/null
make distclean

Some files were not shown because too many files have changed in this diff.