This commit is contained in:
Jay D Dee
2021-10-19 22:35:36 -04:00
parent 47cc5dcff5
commit 1a234cbe53
18 changed files with 474 additions and 189 deletions

View File

@@ -171,6 +171,7 @@ cpuminer_SOURCES = \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \

View File

@@ -65,6 +65,20 @@ If not what makes it happen or not happen?
Change Log
----------
v3.18.2
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
AVX512 for sha256d.
SSE42 and AVX may now be displayed as mining features at startup.
This is hard coded for each algo and is currently only implemented for
scrypt, the only algo with significant performance differences between
those feature levels.
Fixed an issue where a high hashrate algo could cause excessive invalid hash
rate log reports when starting up in benchmark mode.
v3.18.1
More speed for scrypt:

View File

@@ -337,42 +337,42 @@ do{ \
XC2 = XOR( XC2, TC ); \
\
TA = ADD32( XA2, XA1 ); \
XA1 = ROL_1X32( XA1 ); \
TB = ADD32( XB2, XB1 ); \
TC = ADD32( XC2, XC1 ); \
TA = ROL32( TA, 13 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \
XC1 = ROL_1X32( XC1 ); \
TA = ROL32( TA, 13 ); \
XA3 = XOR( XA3, TA ); \
XC1 = ROL_1X32( XC1 ); \
TB = ROL32( TB, 13 ); \
XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 13 ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB3, XB2 ); \
TC = ADD32( XC3, XC2 ); \
TA = ROL32( TA, 18 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XC2 = SWAP_64( XC2 ); \
XA0 = XOR( XA0, TA ); \
TB = ROL32( TB, 18 ); \
XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
TC = ROL32( TC, 18 ); \
XC0 = XOR( XC0, TC ); \
\
TA = ADD32( XA0, XA1 ); \
XA3 = ROR_1X32( XA3 ); \
TB = ADD32( XB0, XB1 ); \
TC = ADD32( XC0, XC1 ); \
TA = ROL32( TA, 7 ); \
XA3 = ROR_1X32( XA3 ); \
XB3 = ROR_1X32( XB3 ); \
XA3 = XOR( XA3, TA ); \
TB = ROL32( TB, 7 ); \
XB3 = ROR_1X32( XB3 ); \
XC3 = ROR_1X32( XC3 ); \
XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 7 ); \
XC3 = ROR_1X32( XC3 ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA0 ); \
@@ -399,24 +399,24 @@ do{ \
XC1 = XOR( XC1, TC ); \
\
TA = ADD32( XA1, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB1, XB2 ); \
XB2 = SWAP_64( XB2 ); \
TA = ROL32( TA, 18); \
TC = ADD32( XC1, XC2 ); \
XA2 = SWAP_64( XA2 ); \
XC2 = SWAP_64( XC2 ); \
TB = ROL32( TB, 18); \
XA0 = XOR( XA0, TA ); \
XB2 = SWAP_64( XB2 ); \
XA1 = ROR_1X32( XA1 ); \
TC = ROL32( TC, 18); \
XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
XA1 = ROR_1X32( XA1 ); \
XB1 = ROR_1X32( XB1 ); \
XC0 = XOR( XC0, TC ); \
XC1 = ROR_1X32( XC1 ); \
} while (0);
// slow rol, an attempt to optimze non-avx512 bit rotations
// slow rot, an attempt to optimize non-avx512 bit rotations
// Contains target specific instructions, only for use with 128 bit vectors
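// e.g. (generic sketch, not necessarily the exact macro used here): on
// SSE2/AVX2 a 32-bit vector rotate is emulated with two shifts and an OR,
//   ROL32(x,c) -> _mm_or_si128( _mm_slli_epi32( x, c ), _mm_srli_epi32( x, 32-c ) )
// whereas AVX512VL has a single-instruction rotate (_mm_rol_epi32 / vprold).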
#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \
do{ \

View File

@@ -28,7 +28,6 @@
*/
#include "algo-gate-api.h"
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
@@ -55,11 +54,25 @@ static const uint32_t sha256_initial_state[8] =
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
static int scrypt_throughput = 0;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SCRYPT_THROUGHPUT 16
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#else
#define SCRYPT_THROUGHPUT 4
#endif
// static int scrypt_throughput = 0;
static int scratchbuf_size = 0;
static __thread char *scratchbuf = NULL;
static __thread uint32_t *scratchbuf = NULL;
// Change this to a constant so it can be used directly as the input state arg;
// vectors still need an init function.
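For reference, a minimal standalone sketch of the compile-time throughput idea (illustrative names, not the miner's own code): resolving the lane count with the preprocessor lets per-thread buffers be sized statically instead of via a worst-case SCRYPT_MAX_WAYS.

#include <stdio.h>
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LANES 16
#elif defined(__AVX2__)
  #define LANES 8
#else
  #define LANES 4
#endif

static uint32_t data[ 20 * LANES ];   // one 80-byte block header per lane
static uint32_t hash[  8 * LANES ];   // one 32-byte hash per lane

int main( void )
{
   printf( "lanes %d, data %zu bytes, hash %zu bytes\n",
           LANES, sizeof data, sizeof hash );
   return 0;
}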
@@ -709,15 +722,11 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512
//#if defined(USE_ASM) && defined(__x86_64__)
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
//int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
//#if defined(USE_AVX2)
#if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
@@ -727,40 +736,39 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
//#define scrypt_best_throughput() 1
#endif
#include "scrypt-core-4way.h"
static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id )
/*
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{
uint32_t tstate[8], ostate[8];
uint32_t X[32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, V, N ); // working
scrypt_core_simd128( X, scratchbuf, N ); // working
// scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N);
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true;
}
*/
#if defined(__AVX2__)
#if ( SCRYPT_THROUGHPUT == 8 )
static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 8*8 ];
uint32_t _ALIGN(128) ostate[ 8*8 ];
uint32_t _ALIGN(128) W[ 8*32 ];
uint32_t _ALIGN(128) X[ 8*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60,
input+80, input+100, input+120, input+140, 640 );
@@ -774,11 +782,11 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, V, N );
scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
}
else
{
@@ -786,13 +794,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
intrlv_2x128( W+128, X+128, X+160, 1024 );
intrlv_2x128( W+192, X+192, X+224, 1024 );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)scratchbuf, N );
dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
dintrlv_2x128( X+128, X+160, W+128, 1024 );
@@ -928,16 +936,15 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if ( SCRYPT_THROUGHPUT == 16 )
static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 16*8 ];
uint32_t _ALIGN(128) ostate[ 16*8 ];
uint32_t _ALIGN(128) W[ 16*32 ];
uint32_t _ALIGN(128) X[ 16*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60,
input+ 80, input+100, input+120, input+140,
@@ -956,17 +963,17 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, V, N );
scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+256, V, N );
scrypt_core_simd128_3buf( X+256, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+352, V, N );
scrypt_core_simd128_3buf( X+352, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
scrypt_core_simd128_2buf( X+448, scratchbuf, N );
}
else
{
@@ -974,13 +981,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 );
intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)scratchbuf, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 );
@@ -1236,15 +1243,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
#endif // AVX512
#if defined(__SHA__)
#if 0
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 2*8 ];
uint32_t _ALIGN(128) ostate[ 2*8 ];
uint32_t _ALIGN(128) W[ 2*32 ];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 );
@@ -1254,7 +1259,7 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
input, input+20, W, W+32 );
scrypt_core_simd128_2buf( W, V, N );
scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
@@ -1264,12 +1269,11 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
}
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[4 * 8];
uint32_t _ALIGN(128) ostate[4 * 8];
uint32_t _ALIGN(128) W[4 * 32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 );
@@ -1300,9 +1304,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
*/
// working, double buffered linear simd
scrypt_core_simd128_2buf( W, V, N );
scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( W+64, V, N );
scrypt_core_simd128_2buf( W+64, scratchbuf, N );
/*
scrypt_core_simd128_3buf( W, V, N );
@@ -1323,17 +1327,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1;
}
#endif
#else
#ifdef HAVE_SHA256_4WAY
#if ( SCRYPT_THROUGHPUT == 4 )
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 4*8 ];
uint32_t _ALIGN(128) ostate[ 4*8 ];
uint32_t _ALIGN(128) W[ 4*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
for ( int i = 0; i < 8; i++ )
@@ -1346,13 +1348,13 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
{
uint32_t _ALIGN(128) X[ 4*32 ];
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
scrypt_core_simd128_2buf( X, V, N );
scrypt_core_simd128_2buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+64, V, N );
scrypt_core_simd128_2buf( X+64, scratchbuf, N );
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
}
else
scrypt_core_4way( (__m128i*)W, (__m128i*)V, N );
scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N );
@@ -1398,65 +1400,73 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
return 1;
}
#endif /* HAVE_SHA256_4WAY */
#endif // SCRYPT_THROUGHPUT == 4
#endif // SHA
//#endif // SHA
extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[ 8*SCRYPT_THROUGHPUT ];
uint32_t _ALIGN(64) data[ 20*SCRYPT_THROUGHPUT ];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
int thr_id = mythr->id;
int throughput = scrypt_throughput;
int i;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( i = 0; i < throughput; i++ )
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
memcpy( data + i * 20, pdata, 80 );
sha256_transform_le( midstate, data, sha256_initial_state );
do {
bool rc = true;
for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n;
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) data[ i*20 + 19 ] = ++n;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
if ( throughput == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
else
#endif
#if defined(__AVX2__)
if ( throughput == 8 )
rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
else
#endif
if ( throughput == 4 ) // slower on Ryzen than 8way
#if defined(__SHA__)
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if ( SCRYPT_THROUGHPUT == 16 )
// if ( SCRYPT_THROUGHPUT == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, opt_param_n,
thr_id );
// else
//#endif
//#if defined(__AVX2__)
#elif ( SCRYPT_THROUGHPUT == 8 )
// if ( SCRYPT_THROUGHPUT == 8 )
rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
thr_id );
// else
//#endif
#elif ( SCRYPT_THROUGHPUT == 4 )
// if ( SCRYPT_THROUGHPUT == 4 ) // slower on Ryzen than 8way
//#if defined(__SHA__)
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
// thr_id );
//#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
thr_id );
#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
#error "Invalid SCRYPT_THROUGHPUT"
#endif
/*
#if defined(__SHA__)
else
if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
if ( SCRYPT_THROUGHPUT == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
thr_id );
#endif
else // should never get here
rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
rc = scrypt_N_1_1_256( data, hash, midstate, opt_param_n, thr_id );
*/
// test the hash
if ( rc )
for ( i = 0; i < throughput; i++ )
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
{
if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) )
{
@@ -1468,7 +1478,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
}
} while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) );
} while ( likely( ( n < ( max_nonce - SCRYPT_THROUGHPUT ) ) && !(*restart) ) );
*hashes_done = n - pdata[19];
pdata[19] = n;
@@ -1489,7 +1499,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
//#if defined(__SHA__)
// gate->optimizations = SSE2_OPT | SHA_OPT;
//#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT;
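// SSE42 and AVX are listed here so they can be reported as mining features
// at startup; scrypt's SIMD128 kernels benefit from them (see the change
// log entry above).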
//#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
@@ -1497,8 +1507,11 @@ bool register_scrypt_algo( algo_gate_t* gate )
opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n );
// scrypt_throughput can be defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
scrypt_throughput = 16;
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
@@ -1511,13 +1524,13 @@ bool register_scrypt_algo( algo_gate_t* gate )
*/
#elif defined(__AVX2__)
scrypt_throughput = 8;
// scrypt_throughput = 8;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else
scrypt_throughput = 4;
// scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
@@ -1533,7 +1546,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
format_number_si( &d_size, d_units );
applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
scrypt_throughput, t_size, t_units, d_size, d_units );
SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );
return true;
};
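A worked example of the scratch buffer sizing above, under stated assumptions (default N = 1024 and an AVX2 build taking the low-N, 2-way branch):

#include <stdio.h>

int main( void )
{
   const int opt_param_n     = 1024;                  // default N
   const int scratchbuf_size = opt_param_n * 2 * 128; // 262144 bytes = 256 KiB per thread
   printf( "scratch buffer per thread: %d bytes\n", scratchbuf_size );
   return 0;
}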

View File

@@ -84,6 +84,11 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

View File

@@ -8,7 +8,7 @@
* any later version. See COPYING for more details.
*/
#include "algo-gate-api.h"
#include "sha256d-4way.h"
#include <string.h>
#include <inttypes.h>
@@ -181,6 +181,8 @@ static const uint32_t sha256d_hash1[16] = {
};
// this performs the entire hash all over again, why?
// because the main function only does 56 rounds.
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
uint32_t S[16];
@@ -492,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_4way( struct work *work,
static inline int scanhash_sha256d_4way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -553,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way( struct work *work,
static inline int scanhash_sha256d_8way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -609,7 +611,7 @@ static inline int scanhash_sha256d_8way( struct work *work,
#endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d( struct work *work,
int scanhash_sha256d_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -625,11 +627,11 @@ int scanhash_sha256d( struct work *work,
#ifdef HAVE_SHA256_8WAY
if (sha256_use_8way())
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr );
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
#endif
#ifdef HAVE_SHA256_4WAY
if (sha256_use_4way())
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr );
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
#endif
memcpy(data, pdata + 16, 64);
@@ -690,9 +692,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256d;
// gate->hash = (void*)&sha256d;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#else
gate->scanhash = (void*)&scanhash_sha256d_pooler;
#endif
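// Only the 16-way (AVX512) path uses the new sha256d-4way scanhash; other
// builds keep the pooler-derived scanhash for now (see sha256d-4way.h).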
// gate->hash = (void*)&sha256d;
return true;
};

View File

@@ -548,6 +548,136 @@ void sha256_8way_init( sha256_8way_context *sc )
sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
}
// Aggressive prehashing, LE byte order
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in + 1 );
C = _mm256_load_si256( state_in + 2 );
D = _mm256_load_si256( state_in + 3 );
E = _mm256_load_si256( state_in + 4 );
F = _mm256_load_si256( state_in + 5 );
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
_mm256_store_si256( state_mid , A );
_mm256_store_si256( state_mid + 1, B );
_mm256_store_si256( state_mid + 2, C );
_mm256_store_si256( state_mid + 3, D );
_mm256_store_si256( state_mid + 4, E );
_mm256_store_si256( state_mid + 5, F );
_mm256_store_si256( state_mid + 6, G );
_mm256_store_si256( state_mid + 7, H );
}
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid )
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16];
memcpy_256( W, data, 16 );
A = _mm256_load_si256( state_mid );
B = _mm256_load_si256( state_mid + 1 );
C = _mm256_load_si256( state_mid + 2 );
D = _mm256_load_si256( state_mid + 3 );
E = _mm256_load_si256( state_mid + 4 );
F = _mm256_load_si256( state_mid + 5 );
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
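// Without AVX512VL's ternary-logic MAJ, re-seed the carried Y_xor_Z term:
// after the three prehashed rounds the working variables have rotated, so
// round 3's b and c operands are G and H.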
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
#endif
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
A = _mm256_add_epi32( A, _mm256_load_si256( state_in ) );
B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) );
C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) );
D = _mm256_add_epi32( D, _mm256_load_si256( state_in + 3 ) );
E = _mm256_add_epi32( E, _mm256_load_si256( state_in + 4 ) );
F = _mm256_add_epi32( F, _mm256_load_si256( state_in + 5 ) );
G = _mm256_add_epi32( G, _mm256_load_si256( state_in + 6 ) );
H = _mm256_add_epi32( H, _mm256_load_si256( state_in + 7 ) );
_mm256_store_si256( state_out , A );
_mm256_store_si256( state_out + 1, B );
_mm256_store_si256( state_out + 2, C );
_mm256_store_si256( state_out + 3, D );
_mm256_store_si256( state_out + 4, E );
_mm256_store_si256( state_out + 5, F );
_mm256_store_si256( state_out + 6, G );
_mm256_store_si256( state_out + 7, H );
}
// Need to handle an odd byte length for yespower.
// Assume only the last update is odd.

View File

@@ -53,4 +53,8 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
#define sha256_transform_be sph_sha256_transform_be
#endif
// The SHA-NI instructions can't do only 3 rounds, so the prehash uses the scalar version
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif

View File

@@ -1,4 +1,4 @@
#include "sha256t-gate.h"
#include "sha256d-4way.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -13,7 +13,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
@@ -46,11 +46,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform_le( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
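// Words 0-2 of this second block are constant while scanning; only the
// nonce (word 3) changes per attempt, so rounds 0-2 are done once here and
// sha256_16way_final_rounds() resumes from round 3 inside the loop.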
do
{
@@ -59,7 +58,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
@@ -99,7 +98,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -130,8 +130,10 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform_le( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -140,7 +142,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate );
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
@@ -253,3 +255,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
#endif
/*
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};
*/

algo/sha/sha256d-4way.h Normal file
View File

@@ -0,0 +1,48 @@
#ifndef __SHA256D_4WAY_H__
#define __SHA256D_4WAY_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
/*
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#define SHA256D_4WAY 1
*/
#endif
bool register_sha256d_algo( algo_gate_t* gate );
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
/*
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
/*
#if defined(__SHA__)
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
#endif

View File

@@ -13,7 +13,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
@@ -46,11 +46,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform_le( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -59,7 +58,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
@@ -104,7 +103,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -135,8 +135,10 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform_le( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -145,7 +147,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate );
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );

View File

@@ -702,6 +702,36 @@ memcpy( state_out, state_in, 32 );
}
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
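// Rounds 0-2 read only data[0..2]; the nonce lives in a later word of the
// block, so this 3-round midstate can be computed once per work unit and
// reused for every nonce.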
uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
memcpy( state_out, state_in, 32 );
t1 = state_out[7] + BSG2_1( state_out[4] )
+ CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
t2 = BSG2_0( state_out[0] )
+ MAJ( state_out[0], state_out[1], state_out[2] );
Y_xor_Z = X_xor_Y;
state_out[3] += t1;
state_out[7] = t1 + t2;
t1 = state_out[6] + BSG2_1( state_out[3] )
+ CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
t2 = BSG2_0( state_out[7] )
+ MAJ( state_out[7], state_out[0], state_out[1] );
Y_xor_Z = X_xor_Y;
state_out[2] += t1;
state_out[6] = t1 + t2;
t1 = state_out[5] + BSG2_1( state_out[2] )
+ CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
t2 = BSG2_0( state_out[6] )
+ MAJ( state_out[6], state_out[7], state_out[0] );
state_out[1] += t1;
state_out[5] = t1 + t2;
}
/* see sph_sha2.h */
void
sph_sha224_init(void *cc)

View File

@@ -215,6 +215,9 @@ void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if SPH_64

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.18.1'
PACKAGE_STRING='cpuminer-opt 3.18.1'
PACKAGE_VERSION='3.18.2'
PACKAGE_STRING='cpuminer-opt 3.18.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.18.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.18.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.18.1
cpuminer-opt configure 3.18.2
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.18.1, which was
It was created by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.18.1'
VERSION='3.18.2'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.18.1, which was
This file was extended by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.18.1
cpuminer-opt config.status 3.18.2
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.18.1])
AC_INIT([cpuminer-opt], [3.18.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1112,19 +1112,17 @@ void report_summary_log( bool force )
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, (double)submitted_share_count*60. /
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) );
submit_rate, safe_div( (double)submitted_share_count*60.,
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
if ( accepted_share_count < submitted_share_count )
{
double lost_ghrate = uptime.tv_sec == 0 ? 0.
: target_diff
* (double)(submitted_share_count - accepted_share_count )
/ (double)uptime.tv_sec;
double lost_shrate = share_time == 0. ? 0.
: target_diff * (double)(submits - accepts ) / share_time;
double lost_ghrate = safe_div( target_diff
* (double)(submitted_share_count - accepted_share_count ),
(double)uptime.tv_sec, 0. );
double lost_shrate = safe_div( target_diff * (double)(submits - accepts ), share_time, 0. );
char lshr_units[4] = {0};
char lghr_units[4] = {0};
scale_hash_for_display( &lost_shrate, lshr_units );
@@ -2495,6 +2493,8 @@ static void *miner_thread( void *userdata )
timeval_subtract( &uptime, &total_hashes_time, &session_start );
double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. );
if ( hashrate > 0. )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
@@ -2508,6 +2508,7 @@ static void *miner_thread( void *userdata )
hi_freq / 1e6 );
#endif
}
}
} // benchmark
// conditional mining
@@ -2900,6 +2901,7 @@ static bool cpu_capability( bool display_only )
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
@@ -2907,6 +2909,8 @@ static bool cpu_capability( bool display_only )
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes;
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_sha;
@@ -2976,6 +2980,8 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha ) printf( " SHA" );
if ( !display_only )
{
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
@@ -2989,6 +2995,7 @@ static bool cpu_capability( bool display_only )
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
}
}
printf("\n");
if ( display_only ) return true;
@@ -3022,6 +3029,8 @@ static bool cpu_capability( bool display_only )
// Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
@@ -3038,6 +3047,8 @@ static bool cpu_capability( bool display_only )
{
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
else if ( use_sse42 ) printf( " SSE42" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );

miner.h
View File

@@ -868,9 +868,9 @@ Options:\n\
yespowerr16 Yenten (YTN)\n\
yespower-b2b generic yespower + blake2b\n\
zr5 Ziftr\n\
-N, --param-n N parameter for scrypt based algos\n\
-R, --param-r R parameter for scrypt based algos\n\
-K, --param-key Key (pers) parameter for algos that use it\n\
-N, --param-n=N N parameter for scrypt based algos\n\
-R, --param-r=N R parameter for scrypt based algos\n\
-K, --param-key=STRING Key (pers) parameter for algos that use it\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
-u, --user=USERNAME username for mining server\n\
@@ -886,8 +886,8 @@ Options:\n\
-s, --scantime=N upper bound on time spent scanning current work when\n\
long polling is unavailable, in seconds (default: 5)\n\
--randomize Randomize scan range start to reduce duplicates\n\
-f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\
-f, --diff-factor=N Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\
--hash-meter Display thread hash rates\n\
--coinbase-addr=ADDR payout address for solo mining\n\
--coinbase-sig=TEXT data to insert in the coinbase when possible\n\
@@ -895,9 +895,9 @@ Options:\n\
--no-getwork disable getwork support\n\
--no-gbt disable getblocktemplate support\n\
--no-stratum disable X-Stratum support\n\
--no-extranonce disable Stratum extranonce support\n\
--no-extranonce disable Stratum extranonce subscribe\n\
--no-redirect ignore requests to change the URL of the mining server\n\
-q, --quiet disable per-thread hashmeter output\n\
-q, --quiet reduce log verbosity\n\
--no-color disable colored output\n\
-D, --debug enable debug output\n\
-P, --protocol-dump verbose dump of protocol-level activities\n"
@@ -916,9 +916,9 @@ Options:\n\
--max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
--max-diff=N Only mine if net difficulty is less than specified value\n\
-c, --config=FILE load a JSON-format configuration file\n\
--data-file path and name of data file\n\
--data-file=FILE path and name of data file\n\
--verify enable additional time consuming start up tests\n\
-V, --version display version information and exit\n\
-V, --version display version and CPU information and exit\n\
-h, --help display this help text and exit\n\
";

View File

@@ -2,22 +2,21 @@
#define SIMD_INT_H__ 1
// Endian byte swap
#define bswap_64( a ) __builtin_bswap64( a )
#define bswap_32( a ) __builtin_bswap32( a )
#define bswap_64 __builtin_bswap64
#define bswap_32 __builtin_bswap32
// Bit rotation
#define rol64 __rolq
#define ror64 __rorq
#define rol32 __rold
#define ror32 __rord
// Safe division, integer or floating point. For floating point it's as
// safe as 0. is precisely zero.
// Returns safe_result if division by zero.
// safe as a floating point 0 comparing exactly equal to zero.
// Returns safe_result (typically zero) if the divisor is zero.
#define safe_div( dividend, divisor, safe_result ) \
( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )
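// Example: hash rate reporting divides by elapsed seconds, which can be
// zero right after startup; safe_div() then returns the fallback instead
// of inf/NaN:
//   double hrate = safe_div( (double)total_hashes, elapsed_sec, 0. );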
// Aliases with familiar names for built in bit rotate instructions
#define rol64( a, n ) _lrotl( a, n )
#define ror64( a, n ) _lrotr( a, n )
#define rol32( a, n ) _rotl( a, n )
#define ror32( a, n ) _rotr( a, n )
#define rol16( a, n ) _rotwl( a, n )
#define ror16( a, n ) _rotwr( a, n )
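// Example: rol32( 0x80000001u, 1 ) == 0x00000003 (bit 31 wraps around to bit 0).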
///////////////////////////////////////
//