Jay D Dee
2021-10-19 22:35:36 -04:00
parent 47cc5dcff5
commit 1a234cbe53
18 changed files with 474 additions and 189 deletions

View File

@@ -337,42 +337,42 @@ do{ \
XC2 = XOR( XC2, TC ); \
\
TA = ADD32( XA2, XA1 ); \
XA1 = ROL_1X32( XA1 ); \
TB = ADD32( XB2, XB1 ); \
TC = ADD32( XC2, XC1 ); \
TA = ROL32( TA, 13 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \
XC1 = ROL_1X32( XC1 ); \
TA = ROL32( TA, 13 ); \
XA3 = XOR( XA3, TA ); \
XC1 = ROL_1X32( XC1 ); \
TB = ROL32( TB, 13 ); \
XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 13 ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB3, XB2 ); \
TC = ADD32( XC3, XC2 ); \
TA = ROL32( TA, 18 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XC2 = SWAP_64( XC2 ); \
XA0 = XOR( XA0, TA ); \
TB = ROL32( TB, 18 ); \
XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
TC = ROL32( TC, 18 ); \
XC0 = XOR( XC0, TC ); \
\
TA = ADD32( XA0, XA1 ); \
XA3 = ROR_1X32( XA3 ); \
TB = ADD32( XB0, XB1 ); \
TC = ADD32( XC0, XC1 ); \
TA = ROL32( TA, 7 ); \
XA3 = ROR_1X32( XA3 ); \
XB3 = ROR_1X32( XB3 ); \
XA3 = XOR( XA3, TA ); \
TB = ROL32( TB, 7 ); \
XB3 = ROR_1X32( XB3 ); \
XC3 = ROR_1X32( XC3 ); \
XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 7 ); \
XC3 = ROR_1X32( XC3 ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA0 ); \
@@ -399,24 +399,24 @@ do{ \
XC1 = XOR( XC1, TC ); \
\
TA = ADD32( XA1, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB1, XB2 ); \
XB2 = SWAP_64( XB2 ); \
TA = ROL32( TA, 18); \
TC = ADD32( XC1, XC2 ); \
XA2 = SWAP_64( XA2 ); \
XC2 = SWAP_64( XC2 ); \
TB = ROL32( TB, 18); \
XA0 = XOR( XA0, TA ); \
XB2 = SWAP_64( XB2 ); \
XA1 = ROR_1X32( XA1 ); \
TC = ROL32( TC, 18); \
XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
XA1 = ROR_1X32( XA1 ); \
XB1 = ROR_1X32( XB1 ); \
XC0 = XOR( XC0, TC ); \
XC1 = ROR_1X32( XC1 ); \
} while (0);
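// The reshuffle above interleaves the independent lane permutes
// (ROL_1X32, SWAP_64, ROR_1X32) between the dependent ADD32 -> ROL32 ->
// XOR chains of the three buffers, giving the CPU independent work to
// issue while each chain's result is still in flight.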
// slow rol, an attempt to optimize non-avx512 bit rotations
// slow rot, an attempt to optimize non-avx512 bit rotations
// Contains target specific instructions, only for use with 128 bit vectors
#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \
do{ \
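A minimal sketch (not taken from this commit) of the kind of fallback the SLOROT path implies: without AVX512's vprold instruction, a 32-bit lane rotate has to be emulated with two shifts and an OR, e.g. with SSE2 intrinsics:

#include <immintrin.h>

// Emulated 32-bit lane rotate-left for pre-AVX512 targets,
// e.g. rol32_sse2( v, 13 ); best with a compile-time constant count.
static inline __m128i rol32_sse2( __m128i x, const int c )
{
   return _mm_or_si128( _mm_slli_epi32( x, c ),
                        _mm_srli_epi32( x, 32 - c ) );
}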

View File

@@ -28,7 +28,6 @@
*/
#include "algo-gate-api.h"
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
@@ -55,11 +54,25 @@ static const uint32_t sha256_initial_state[8] =
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
static int scrypt_throughput = 0;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SCRYPT_THROUGHPUT 16
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#else
#define SCRYPT_THROUGHPUT 4
#endif
// static int scrypt_throughput = 0;
static int scratchbuf_size = 0;
static __thread char *scratchbuf = NULL;
static __thread uint32_t *scratchbuf = NULL;
// Change this to a constant to be used directly as the input state arg;
// vectors still need an init function.
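A minimal sketch of that idea (names from this file, scalar case only; the vector paths would still broadcast the IV per lane):

// The constant IV can be passed straight to the transform, no init call:
uint32_t midstate[8];
sha256_transform_le( midstate, data, sha256_initial_state );
// Vector lanes still need a broadcast init, e.g.:
// initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );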
@@ -709,15 +722,11 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512
//#if defined(USE_ASM) && defined(__x86_64__)
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
//int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
//#if defined(USE_AVX2)
#if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
@@ -727,40 +736,39 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
//#define scrypt_best_throughput() 1
#endif
#include "scrypt-core-4way.h"
static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id )
/*
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{
uint32_t tstate[8], ostate[8];
uint32_t X[32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, V, N ); // working
scrypt_core_simd128( X, scratchbuf, N ); // working
// scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N);
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true;
}
*/
#if defined(__AVX2__)
#if ( SCRYPT_THROUGHPUT == 8 )
static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 8*8 ];
uint32_t _ALIGN(128) ostate[ 8*8 ];
uint32_t _ALIGN(128) W[ 8*32 ];
uint32_t _ALIGN(128) X[ 8*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60,
input+80, input+100, input+120, input+140, 640 );
@@ -774,11 +782,11 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, V, N );
scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
}
else
{
@@ -786,13 +794,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
intrlv_2x128( W+128, X+128, X+160, 1024 );
intrlv_2x128( W+192, X+192, X+224, 1024 );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)scratchbuf, N );
dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
dintrlv_2x128( X+128, X+160, W+128, 1024 );
@@ -928,16 +936,15 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if ( SCRYPT_THROUGHPUT == 16 )
static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 16*8 ];
uint32_t _ALIGN(128) ostate[ 16*8 ];
uint32_t _ALIGN(128) W[ 16*32 ];
uint32_t _ALIGN(128) X[ 16*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60,
input+ 80, input+100, input+120, input+140,
@@ -956,17 +963,17 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, V, N );
scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+256, V, N );
scrypt_core_simd128_3buf( X+256, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+352, V, N );
scrypt_core_simd128_3buf( X+352, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
scrypt_core_simd128_2buf( X+448, scratchbuf, N );
}
else
{
@@ -974,13 +981,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 );
intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)scratchbuf, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 );
@@ -1236,15 +1243,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
#endif // AVX512
#if defined(__SHA__)
#if 0
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 2*8 ];
uint32_t _ALIGN(128) ostate[ 2*8 ];
uint32_t _ALIGN(128) W[ 2*32 ];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 );
@@ -1254,7 +1259,7 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
input, input+20, W, W+32 );
scrypt_core_simd128_2buf( W, V, N );
scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
@@ -1264,12 +1269,11 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
}
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[4 * 8];
uint32_t _ALIGN(128) ostate[4 * 8];
uint32_t _ALIGN(128) W[4 * 32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 );
@@ -1300,9 +1304,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
*/
// working, double-buffered linear SIMD
scrypt_core_simd128_2buf( W, V, N );
scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( W+64, V, N );
scrypt_core_simd128_2buf( W+64, scratchbuf, N );
/*
scrypt_core_simd128_3buf( W, V, N );
@@ -1323,17 +1327,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1;
}
#endif
#else
#ifdef HAVE_SHA256_4WAY
#if ( SCRYPT_THROUGHPUT == 4 )
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 4*8 ];
uint32_t _ALIGN(128) ostate[ 4*8 ];
uint32_t _ALIGN(128) W[ 4*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
for ( int i = 0; i < 8; i++ )
@@ -1346,13 +1348,13 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
{
uint32_t _ALIGN(128) X[ 4*32 ];
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
scrypt_core_simd128_2buf( X, V, N );
scrypt_core_simd128_2buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+64, V, N );
scrypt_core_simd128_2buf( X+64, scratchbuf, N );
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
}
else
scrypt_core_4way( (__m128i*)W, (__m128i*)V, N );
scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N );
@@ -1398,65 +1400,73 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
return 1;
}
#endif /* HAVE_SHA256_4WAY */
#endif // SCRYPT_THROUGHPUT == 4
#endif // SHA
//#endif // SHA
extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[ 8*SCRYPT_THROUGHPUT ];
uint32_t _ALIGN(64) data[ 20*SCRYPT_THROUGHPUT ];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
int thr_id = mythr->id;
int throughput = scrypt_throughput;
int i;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( i = 0; i < throughput; i++ )
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
memcpy( data + i * 20, pdata, 80 );
sha256_transform_le( midstate, data, sha256_initial_state );
do {
bool rc = true;
for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n;
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) data[ i*20 + 19 ] = ++n;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
if ( throughput == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
else
#endif
#if defined(__AVX2__)
if ( throughput == 8 )
rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
else
#endif
if ( throughput == 4 ) // slower on Ryzen than 8way
#if defined(__SHA__)
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if ( SCRYPT_THROUGHPUT == 16 )
// if ( SCRYPT_THROUGHPUT == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, opt_param_n,
thr_id );
// else
//#endif
//#if defined(__AVX2__)
#elif ( SCRYPT_THROUGHPUT == 8 )
// if ( SCRYPT_THROUGHPUT == 8 )
rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
thr_id );
// else
//#endif
#elif ( SCRYPT_THROUGHPUT == 4 )
// if ( SCRYPT_THROUGHPUT == 4 ) // slower on Ryzen than 8way
//#if defined(__SHA__)
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
// thr_id );
//#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
thr_id );
#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
#error "Invalid SCRYPT_THROUGHPUT"
#endif
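// On an AVX2-only build, for example, SCRYPT_THROUGHPUT == 8 and the
// ladder above reduces to the single scrypt_N_1_1_256_8way call; the
// other paths compile out entirely.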
/*
#if defined(__SHA__)
else
if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
if ( SCRYPT_THROUGHPUT == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
thr_id );
#endif
else // should never get here
rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
rc = scrypt_N_1_1_256( data, hash, midstate, opt_param_n, thr_id );
*/
// test the hash
if ( rc )
for ( i = 0; i < throughput; i++ )
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
{
if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) )
{
@@ -1468,7 +1478,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
}
} while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) );
} while ( likely( ( n < ( max_nonce - SCRYPT_THROUGHPUT ) ) && !(*restart) ) );
*hashes_done = n - pdata[19];
pdata[19] = n;
@@ -1489,7 +1499,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
//#if defined(__SHA__)
// gate->optimizations = SSE2_OPT | SHA_OPT;
//#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT;
//#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
@@ -1497,8 +1507,11 @@ bool register_scrypt_algo( algo_gate_t* gate )
opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO,"Scrypt parameters: N= %d, R= 1", opt_param_n );
// scrypt_throughput can be defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
scrypt_throughput = 16;
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
@@ -1511,13 +1524,13 @@ bool register_scrypt_algo( algo_gate_t* gate )
*/
#elif defined(__AVX2__)
scrypt_throughput = 8;
// scrypt_throughput = 8;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
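// e.g. the default N = 1024 on AVX2: 1024 * 2 * 128 = 262144 bytes
// (256 KiB) per thread; N = 0x8000 needs 0x8000 * 3 * 128 = 12 MiB.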
#else
scrypt_throughput = 4;
// scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
@@ -1533,7 +1546,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
format_number_si( &d_size, d_units );
applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
scrypt_throughput, t_size, t_units, d_size, d_units );
SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );
return true;
};

View File

@@ -84,6 +84,11 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

View File

@@ -8,7 +8,7 @@
* any later version. See COPYING for more details.
*/
#include "algo-gate-api.h"
#include "sha256d-4way.h"
#include <string.h>
#include <inttypes.h>
@@ -181,6 +181,8 @@ static const uint32_t sha256d_hash1[16] = {
};
// This performs the entire hash all over again. Why? Because the main
// function (sha256d_ms) only does 56 rounds.
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
uint32_t S[16];
@@ -492,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_4way( struct work *work,
static inline int scanhash_sha256d_4way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -553,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way( struct work *work,
static inline int scanhash_sha256d_8way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -609,7 +611,7 @@ static inline int scanhash_sha256d_8way( struct work *work,
#endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d( struct work *work,
int scanhash_sha256d_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -625,11 +627,11 @@ int scanhash_sha256d( struct work *work,
#ifdef HAVE_SHA256_8WAY
if (sha256_use_8way())
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr );
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
#endif
#ifdef HAVE_SHA256_4WAY
if (sha256_use_4way())
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr );
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
#endif
memcpy(data, pdata + 16, 64);
@@ -690,9 +692,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256d;
// gate->hash = (void*)&sha256d;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#else
gate->scanhash = (void*)&scanhash_sha256d_pooler;
#endif
// gate->hash = (void*)&sha256d;
return true;
};

View File

@@ -548,6 +548,136 @@ void sha256_8way_init( sha256_8way_context *sc )
sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
}
// Aggressive prehashing, LE byte order
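// Only the first 3 rounds can be hoisted: round t (t < 16) consumes W[t],
// and the nonce lands in W[3] of this block, so rounds 0-2 are the only
// nonce-independent rounds and can be done once per job instead of once
// per nonce.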
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in + 1 );
C = _mm256_load_si256( state_in + 2 );
D = _mm256_load_si256( state_in + 3 );
E = _mm256_load_si256( state_in + 4 );
F = _mm256_load_si256( state_in + 5 );
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
_mm256_store_si256( state_mid , A );
_mm256_store_si256( state_mid + 1, B );
_mm256_store_si256( state_mid + 2, C );
_mm256_store_si256( state_mid + 3, D );
_mm256_store_si256( state_mid + 4, E );
_mm256_store_si256( state_mid + 5, F );
_mm256_store_si256( state_mid + 6, G );
_mm256_store_si256( state_mid + 7, H );
}
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid )
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16];
memcpy_256( W, data, 16 );
A = _mm256_load_si256( state_mid );
B = _mm256_load_si256( state_mid + 1 );
C = _mm256_load_si256( state_mid + 2 );
D = _mm256_load_si256( state_mid + 3 );
E = _mm256_load_si256( state_mid + 4 );
F = _mm256_load_si256( state_mid + 5 );
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
#endif
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
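// Message expansion: SHA2x_MEXP( t-2, t-7, t-15, t-16 ), indices mod 16,
// computes the standard
// W[t] = ssg1(W[t-2]) + W[t-7] + ssg0(W[t-15]) + W[t-16]
// over the rolling 16-entry window.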
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
A = _mm256_add_epi32( A, _mm256_load_si256( state_in ) );
B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) );
C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) );
D = _mm256_add_epi32( D, _mm256_load_si256( state_in + 3 ) );
E = _mm256_add_epi32( E, _mm256_load_si256( state_in + 4 ) );
F = _mm256_add_epi32( F, _mm256_load_si256( state_in + 5 ) );
G = _mm256_add_epi32( G, _mm256_load_si256( state_in + 6 ) );
H = _mm256_add_epi32( H, _mm256_load_si256( state_in + 7 ) );
_mm256_store_si256( state_out , A );
_mm256_store_si256( state_out + 1, B );
_mm256_store_si256( state_out + 2, C );
_mm256_store_si256( state_out + 3, D );
_mm256_store_si256( state_out + 4, E );
_mm256_store_si256( state_out + 5, F );
_mm256_store_si256( state_out + 6, G );
_mm256_store_si256( state_out + 7, H );
}
// Need to handle odd byte lengths for yespower.
// Assume only the last update is odd.

View File

@@ -53,4 +53,8 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
#define sha256_transform_be sph_sha256_transform_be
#endif
// The SHA extensions can't do only 3 rounds (sha256rnds2 does rounds in
// pairs), so fall back to the generic version.
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif

View File

@@ -1,4 +1,4 @@
#include "sha256t-gate.h"
#include "sha256d-4way.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -13,7 +13,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
@@ -46,11 +46,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform_le( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -59,7 +58,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
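// 80-byte header: the padding block's length word is 80*8 = 640 bits.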
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
@@ -99,7 +98,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -116,7 +116,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -130,8 +130,10 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform_le( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -140,7 +142,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate );
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
@@ -253,3 +255,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
#endif
/*
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};
*/

algo/sha/sha256d-4way.h (new file, 48 lines)
View File

@@ -0,0 +1,48 @@
#ifndef __SHA256D_4WAY_H__
#define __SHA256D_4WAY_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
/*
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#define SHA256D_4WAY 1
*/
#endif
bool register_sha256d_algo( algo_gate_t* gate );
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
/*
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
/*
#if defined(__SHA__)
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
#endif

View File

@@ -13,7 +13,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
@@ -31,7 +31,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );
vdata[i] = m512_const1_32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -46,11 +46,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform_le( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -59,7 +58,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
@@ -104,7 +103,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -121,7 +121,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -135,9 +135,11 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform_le( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
@@ -145,7 +147,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate );
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );

View File

@@ -702,6 +702,36 @@ memcpy( state_out, state_in, 32 );
}
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
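// The MAJ() macro is assumed to set X_xor_Y as a side effect; the bare
// "Y_xor_Z = X_xor_Y;" assignments below feed that value into the next
// round's MAJ.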
memcpy( state_out, state_in, 32 );
t1 = state_out[7] + BSG2_1( state_out[4] )
+ CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
t2 = BSG2_0( state_out[0] )
+ MAJ( state_out[0], state_out[1], state_out[2] );
Y_xor_Z = X_xor_Y;
state_out[3] += t1;
state_out[7] = t1 + t2;
t1 = state_out[6] + BSG2_1( state_out[3] )
+ CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
t2 = BSG2_0( state_out[7] )
+ MAJ( state_out[7], state_out[0], state_out[1] );
Y_xor_Z = X_xor_Y;
state_out[2] += t1;
state_out[6] = t1 + t2;
t1 = state_out[5] + BSG2_1( state_out[2] )
+ CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
t2 = BSG2_0( state_out[6] )
+ MAJ( state_out[6], state_out[7], state_out[0] );
state_out[1] += t1;
state_out[5] = t1 + t2;
}
/* see sph_sha2.h */
void
sph_sha224_init(void *cc)

View File

@@ -215,6 +215,9 @@ void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if SPH_64