Compare commits

...

1 Commit

Author SHA1 Message Date
Jay D Dee
1a234cbe53 v3.18.2 2021-10-19 22:35:36 -04:00
18 changed files with 474 additions and 189 deletions

View File

@@ -171,6 +171,7 @@ cpuminer_SOURCES = \
algo/sha/hmac-sha256-hash-4way.c \ algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \ algo/sha/sha256d.c \
algo/sha/sha2.c \ algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \ algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \ algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \ algo/sha/sha256t.c \

View File

@@ -65,6 +65,20 @@ If not what makes it happen or not happen?
Change Log Change Log
---------- ----------
v3.18.2
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
AVX512 for sha256d.
SSE42 and AVX may now be displayed as mining features at startup.
This is hard coded for each algo, and is only implemented for scrypt
at this time as it is the only algo with significant performance differences
with those features.
Fixed an issue where a high hashrate algo could cause excessive invalid hash
rate log reports when starting up in benchmark mode.
v3.18.1 v3.18.1
More speed for scrypt: More speed for scrypt:

View File

@@ -337,42 +337,42 @@ do{ \
XC2 = XOR( XC2, TC ); \ XC2 = XOR( XC2, TC ); \
\ \
TA = ADD32( XA2, XA1 ); \ TA = ADD32( XA2, XA1 ); \
XA1 = ROL_1X32( XA1 ); \
TB = ADD32( XB2, XB1 ); \ TB = ADD32( XB2, XB1 ); \
TC = ADD32( XC2, XC1 ); \ TC = ADD32( XC2, XC1 ); \
TA = ROL32( TA, 13 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \ XB1 = ROL_1X32( XB1 ); \
XC1 = ROL_1X32( XC1 ); \ TA = ROL32( TA, 13 ); \
XA3 = XOR( XA3, TA ); \ XA3 = XOR( XA3, TA ); \
XC1 = ROL_1X32( XC1 ); \
TB = ROL32( TB, 13 ); \ TB = ROL32( TB, 13 ); \
XB3 = XOR( XB3, TB ); \ XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 13 ); \ TC = ROL32( TC, 13 ); \
XC3 = XOR( XC3, TC ); \ XC3 = XOR( XC3, TC ); \
\ \
TA = ADD32( XA3, XA2 ); \ TA = ADD32( XA3, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB3, XB2 ); \ TB = ADD32( XB3, XB2 ); \
TC = ADD32( XC3, XC2 ); \ TC = ADD32( XC3, XC2 ); \
TA = ROL32( TA, 18 ); \ TA = ROL32( TA, 18 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \ XB2 = SWAP_64( XB2 ); \
XC2 = SWAP_64( XC2 ); \
XA0 = XOR( XA0, TA ); \ XA0 = XOR( XA0, TA ); \
TB = ROL32( TB, 18 ); \ TB = ROL32( TB, 18 ); \
XB0 = XOR( XB0, TB ); \ XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
TC = ROL32( TC, 18 ); \ TC = ROL32( TC, 18 ); \
XC0 = XOR( XC0, TC ); \ XC0 = XOR( XC0, TC ); \
\ \
TA = ADD32( XA0, XA1 ); \ TA = ADD32( XA0, XA1 ); \
XA3 = ROR_1X32( XA3 ); \
TB = ADD32( XB0, XB1 ); \ TB = ADD32( XB0, XB1 ); \
TC = ADD32( XC0, XC1 ); \ TC = ADD32( XC0, XC1 ); \
TA = ROL32( TA, 7 ); \ TA = ROL32( TA, 7 ); \
XA3 = ROR_1X32( XA3 ); \ XB3 = ROR_1X32( XB3 ); \
XA3 = XOR( XA3, TA ); \ XA3 = XOR( XA3, TA ); \
TB = ROL32( TB, 7 ); \ TB = ROL32( TB, 7 ); \
XB3 = ROR_1X32( XB3 ); \ XC3 = ROR_1X32( XC3 ); \
XB3 = XOR( XB3, TB ); \ XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 7 ); \ TC = ROL32( TC, 7 ); \
XC3 = ROR_1X32( XC3 ); \
XC3 = XOR( XC3, TC ); \ XC3 = XOR( XC3, TC ); \
\ \
TA = ADD32( XA3, XA0 ); \ TA = ADD32( XA3, XA0 ); \
@@ -399,24 +399,24 @@ do{ \
XC1 = XOR( XC1, TC ); \ XC1 = XOR( XC1, TC ); \
\ \
TA = ADD32( XA1, XA2 ); \ TA = ADD32( XA1, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB1, XB2 ); \ TB = ADD32( XB1, XB2 ); \
XB2 = SWAP_64( XB2 ); \
TA = ROL32( TA, 18); \ TA = ROL32( TA, 18); \
TC = ADD32( XC1, XC2 ); \ TC = ADD32( XC1, XC2 ); \
XA2 = SWAP_64( XA2 ); \ XC2 = SWAP_64( XC2 ); \
TB = ROL32( TB, 18); \ TB = ROL32( TB, 18); \
XA0 = XOR( XA0, TA ); \ XA0 = XOR( XA0, TA ); \
XB2 = SWAP_64( XB2 ); \ XA1 = ROR_1X32( XA1 ); \
TC = ROL32( TC, 18); \ TC = ROL32( TC, 18); \
XB0 = XOR( XB0, TB ); \ XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
XA1 = ROR_1X32( XA1 ); \
XB1 = ROR_1X32( XB1 ); \ XB1 = ROR_1X32( XB1 ); \
XC0 = XOR( XC0, TC ); \ XC0 = XOR( XC0, TC ); \
XC1 = ROR_1X32( XC1 ); \ XC1 = ROR_1X32( XC1 ); \
} while (0); } while (0);
// slow rol, an attempt to optimze non-avx512 bit rotations // slow rot, an attempt to optimze non-avx512 bit rotations
// Contains target specific instructions, only for use with 128 bit vectors // Contains target specific instructions, only for use with 128 bit vectors
#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \ #define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \
do{ \ do{ \

View File

@@ -28,7 +28,6 @@
*/ */
#include "algo-gate-api.h" #include "algo-gate-api.h"
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <inttypes.h> #include <inttypes.h>
@@ -55,11 +54,25 @@ static const uint32_t sha256_initial_state[8] =
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
}; };
static int scrypt_throughput = 0; #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SCRYPT_THROUGHPUT 16
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#else
#define SCRYPT_THROUGHPUT 4
#endif
// static int scrypt_throughput = 0;
static int scratchbuf_size = 0; static int scratchbuf_size = 0;
static __thread char *scratchbuf = NULL; static __thread uint32_t *scratchbuf = NULL;
// change this to a constant to be used directly as input state arg // change this to a constant to be used directly as input state arg
// vectors still need an init function. // vectors still need an init function.
@@ -709,15 +722,11 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512 #endif // AVX512
//#if defined(USE_ASM) && defined(__x86_64__)
#define SCRYPT_MAX_WAYS 12 #define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1 #define HAVE_SCRYPT_3WAY 1
//int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V, int N); void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
//#if defined(USE_AVX2)
#if defined(__AVX2__) #if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS #undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24 #define SCRYPT_MAX_WAYS 24
@@ -727,40 +736,39 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#ifndef SCRYPT_MAX_WAYS #ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1 #define SCRYPT_MAX_WAYS 1
//#define scrypt_best_throughput() 1
#endif #endif
#include "scrypt-core-4way.h" #include "scrypt-core-4way.h"
static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output, /*
uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id ) static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{ {
uint32_t tstate[8], ostate[8]; uint32_t tstate[8], ostate[8];
uint32_t X[32]; uint32_t X[32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy(tstate, midstate, 32); memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate); HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X); PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, V, N ); // woring scrypt_core_simd128( X, scratchbuf, N ); // woring
// scrypt_core_1way( X, V, N ); // working // scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N); // scrypt_core(X, V, N);
PBKDF2_SHA256_128_32(tstate, ostate, X, output); PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true; return true;
} }
*/
#if defined(__AVX2__) #if ( SCRYPT_THROUGHPUT == 8 )
static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[ 8*8 ]; uint32_t _ALIGN(128) tstate[ 8*8 ];
uint32_t _ALIGN(128) ostate[ 8*8 ]; uint32_t _ALIGN(128) ostate[ 8*8 ];
uint32_t _ALIGN(128) W[ 8*32 ]; uint32_t _ALIGN(128) W[ 8*32 ];
uint32_t _ALIGN(128) X[ 8*32 ]; uint32_t _ALIGN(128) X[ 8*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60,
input+80, input+100, input+120, input+140, 640 ); input+80, input+100, input+120, input+140, 640 );
@@ -774,11 +782,11 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
{ {
scrypt_core_simd128_3buf( X, V, N ); scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N ); scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N ); scrypt_core_simd128_2buf( X+192, scratchbuf, N );
} }
else else
{ {
@@ -786,13 +794,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
intrlv_2x128( W+128, X+128, X+160, 1024 ); intrlv_2x128( W+128, X+128, X+160, 1024 );
intrlv_2x128( W+192, X+192, X+224, 1024 ); intrlv_2x128( W+192, X+192, X+224, 1024 );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)scratchbuf, N );
dintrlv_2x128( X, X+ 32, W, 1024 ); dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
dintrlv_2x128( X+128, X+160, W+128, 1024 ); dintrlv_2x128( X+128, X+160, W+128, 1024 );
@@ -928,16 +936,15 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
#endif // AVX2 #endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if ( SCRYPT_THROUGHPUT == 16 )
static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[ 16*8 ]; uint32_t _ALIGN(128) tstate[ 16*8 ];
uint32_t _ALIGN(128) ostate[ 16*8 ]; uint32_t _ALIGN(128) ostate[ 16*8 ];
uint32_t _ALIGN(128) W[ 16*32 ]; uint32_t _ALIGN(128) W[ 16*32 ];
uint32_t _ALIGN(128) X[ 16*32 ]; uint32_t _ALIGN(128) X[ 16*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60, intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60,
input+ 80, input+100, input+120, input+140, input+ 80, input+100, input+120, input+140,
@@ -956,17 +963,17 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
{ {
scrypt_core_simd128_3buf( X, V, N ); scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N ); scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N ); scrypt_core_simd128_2buf( X+192, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+256, V, N ); scrypt_core_simd128_3buf( X+256, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+352, V, N ); scrypt_core_simd128_3buf( X+352, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N ); scrypt_core_simd128_2buf( X+448, scratchbuf, N );
} }
else else
{ {
@@ -974,13 +981,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 ); intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 );
intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 ); intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N ); scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N ); scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)scratchbuf, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 ); dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 );
@@ -1236,15 +1243,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
#endif // AVX512 #endif // AVX512
#if defined(__SHA__) #if 0
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[ 2*8 ]; uint32_t _ALIGN(128) tstate[ 2*8 ];
uint32_t _ALIGN(128) ostate[ 2*8 ]; uint32_t _ALIGN(128) ostate[ 2*8 ];
uint32_t _ALIGN(128) W[ 2*32 ]; uint32_t _ALIGN(128) W[ 2*32 ];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 ); memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 ); memcpy( tstate+ 8, midstate, 32 );
@@ -1254,7 +1259,7 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
input, input+20, W, W+32 ); input, input+20, W, W+32 );
scrypt_core_simd128_2buf( W, V, N ); scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32, PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
@@ -1264,12 +1269,11 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
} }
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[4 * 8]; uint32_t _ALIGN(128) tstate[4 * 8];
uint32_t _ALIGN(128) ostate[4 * 8]; uint32_t _ALIGN(128) ostate[4 * 8];
uint32_t _ALIGN(128) W[4 * 32]; uint32_t _ALIGN(128) W[4 * 32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 ); memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 ); memcpy( tstate+ 8, midstate, 32 );
@@ -1300,9 +1304,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
*/ */
// working, double buffered linear simd // working, double buffered linear simd
scrypt_core_simd128_2buf( W, V, N ); scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( W+64, V, N ); scrypt_core_simd128_2buf( W+64, scratchbuf, N );
/* /*
scrypt_core_simd128_3buf( W, V, N ); scrypt_core_simd128_3buf( W, V, N );
@@ -1323,17 +1327,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1; return 1;
} }
#endif
#else #if ( SCRYPT_THROUGHPUT == 4 )
#ifdef HAVE_SHA256_4WAY
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[ 4*8 ]; uint32_t _ALIGN(128) tstate[ 4*8 ];
uint32_t _ALIGN(128) ostate[ 4*8 ]; uint32_t _ALIGN(128) ostate[ 4*8 ];
uint32_t _ALIGN(128) W[ 4*32 ]; uint32_t _ALIGN(128) W[ 4*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
for ( int i = 0; i < 8; i++ ) for ( int i = 0; i < 8; i++ )
@@ -1346,13 +1348,13 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
{ {
uint32_t _ALIGN(128) X[ 4*32 ]; uint32_t _ALIGN(128) X[ 4*32 ];
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
scrypt_core_simd128_2buf( X, V, N ); scrypt_core_simd128_2buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+64, V, N ); scrypt_core_simd128_2buf( X+64, scratchbuf, N );
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
} }
else else
scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N );
@@ -1398,65 +1400,73 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
return 1; return 1;
} }
#endif /* HAVE_SHA256_4WAY */ #endif // SCRYPT_THROUGHPUT == 4
#endif // SHA //#endif // SHA
extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ) uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t _ALIGN(64) hash[ 8*SCRYPT_THROUGHPUT ];
uint32_t _ALIGN(64) data[ 20*SCRYPT_THROUGHPUT ];
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8]; uint32_t midstate[8];
uint32_t n = pdata[19] - 1; uint32_t n = pdata[19] - 1;
int thr_id = mythr->id; int thr_id = mythr->id;
int throughput = scrypt_throughput;
int i; int i;
volatile uint8_t *restart = &(work_restart[thr_id].restart); volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( i = 0; i < throughput; i++ ) for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
memcpy( data + i * 20, pdata, 80 ); memcpy( data + i * 20, pdata, 80 );
sha256_transform_le( midstate, data, sha256_initial_state ); sha256_transform_le( midstate, data, sha256_initial_state );
do { do {
bool rc = true; bool rc = true;
for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) data[ i*20 + 19 ] = ++n;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) //#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
if ( throughput == 16 ) #if ( SCRYPT_THROUGHPUT == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf, // if ( SCRYPT_THROUGHPUT == 16 )
opt_param_n, thr_id ); rc = scrypt_N_1_1_256_16way( data, hash, midstate, opt_param_n,
else thr_id );
#endif // else
#if defined(__AVX2__) //#endif
if ( throughput == 8 ) //#if defined(__AVX2__)
rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf, #elif ( SCRYPT_THROUGHPUT == 8 )
opt_param_n, thr_id ); // if ( SCRYPT_THROUGHPUT == 8 )
else rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
#endif thr_id );
if ( throughput == 4 ) // slower on Ryzen than 8way // else
#if defined(__SHA__) //#endif
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf, #elif ( SCRYPT_THROUGHPUT == 4 )
opt_param_n, thr_id ); // if ( SCRYPT_THROUGHPUT == 4 ) // slower on Ryzen than 8way
//#if defined(__SHA__)
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
// thr_id );
//#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
thr_id );
#else #else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id ); #error "Invalid SCRYPT_THROUGHPUT"
#endif #endif
/*
#if defined(__SHA__) #if defined(__SHA__)
else else
if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way if ( SCRYPT_THROUGHPUT == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf, rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
opt_param_n, thr_id ); thr_id );
#endif #endif
else // should never get here else // should never get here
rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf, rc = scrypt_N_1_1_256( data, hash, midstate, opt_param_n, thr_id );
opt_param_n, thr_id ); */
// test the hash // test the hash
if ( rc ) if ( rc )
for ( i = 0; i < throughput; i++ ) for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
{ {
if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) ) if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) )
{ {
@@ -1468,7 +1478,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
} }
} while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); } while ( likely( ( n < ( max_nonce - SCRYPT_THROUGHPUT ) ) && !(*restart) ) );
*hashes_done = n - pdata[19]; *hashes_done = n - pdata[19];
pdata[19] = n; pdata[19] = n;
@@ -1489,7 +1499,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
//#if defined(__SHA__) //#if defined(__SHA__)
// gate->optimizations = SSE2_OPT | SHA_OPT; // gate->optimizations = SSE2_OPT | SHA_OPT;
//#else //#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT;
//#endif //#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt; gate->scanhash = (void*)&scanhash_scrypt;
@@ -1497,8 +1507,11 @@ bool register_scrypt_algo( algo_gate_t* gate )
opt_param_n = opt_param_n ? opt_param_n : 1024; opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n ); applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n );
// scrypt_throughput can be defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
scrypt_throughput = 16; // scrypt_throughput = 16;
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else else
@@ -1511,13 +1524,13 @@ bool register_scrypt_algo( algo_gate_t* gate )
*/ */
#elif defined(__AVX2__) #elif defined(__AVX2__)
scrypt_throughput = 8; // scrypt_throughput = 8;
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else #else
scrypt_throughput = 4; // scrypt_throughput = 4;
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else else
@@ -1533,7 +1546,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
format_number_si( &d_size, d_units ); format_number_si( &d_size, d_units );
applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n", applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
scrypt_throughput, t_size, t_units, d_size, d_units ); SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );
return true; return true;
}; };

View File

@@ -84,6 +84,11 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in ); const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid );
#endif // AVX2 #endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

View File

@@ -8,7 +8,7 @@
* any later version. See COPYING for more details. * any later version. See COPYING for more details.
*/ */
#include "algo-gate-api.h" #include "sha256d-4way.h"
#include <string.h> #include <string.h>
#include <inttypes.h> #include <inttypes.h>
@@ -181,6 +181,8 @@ static const uint32_t sha256d_hash1[16] = {
}; };
// this performs the entire hash all over again, why? // this performs the entire hash all over again, why?
// because main function only does 56 rounds.
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{ {
uint32_t S[16]; uint32_t S[16];
@@ -492,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
void sha256d_ms_4way(uint32_t *hash, uint32_t *data, void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash); const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_4way( struct work *work, static inline int scanhash_sha256d_4way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
@@ -553,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
void sha256d_ms_8way(uint32_t *hash, uint32_t *data, void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash); const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way( struct work *work, static inline int scanhash_sha256d_8way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
@@ -609,7 +611,7 @@ static inline int scanhash_sha256d_8way( struct work *work,
#endif /* HAVE_SHA256_8WAY */ #endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d( struct work *work, int scanhash_sha256d_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
@@ -625,11 +627,11 @@ int scanhash_sha256d( struct work *work,
#ifdef HAVE_SHA256_8WAY #ifdef HAVE_SHA256_8WAY
if (sha256_use_8way()) if (sha256_use_8way())
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr ); return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
#endif #endif
#ifdef HAVE_SHA256_4WAY #ifdef HAVE_SHA256_4WAY
if (sha256_use_4way()) if (sha256_use_4way())
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr ); return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
#endif #endif
memcpy(data, pdata + 16, 64); memcpy(data, pdata + 16, 64);
@@ -690,9 +692,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
bool register_sha256d_algo( algo_gate_t* gate ) bool register_sha256d_algo( algo_gate_t* gate )
{ {
gate->optimizations = SSE2_OPT | AVX2_OPT; gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_sha256d; #if defined(SHA256D_16WAY)
// gate->hash = (void*)&sha256d; gate->scanhash = (void*)&scanhash_sha256d_16way;
#else
gate->scanhash = (void*)&scanhash_sha256d_pooler;
#endif
// gate->hash = (void*)&sha256d;
return true; return true;
}; };

View File

@@ -548,6 +548,136 @@ void sha256_8way_init( sha256_8way_context *sc )
sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
} }
// Aggresive prehashing, LE byte order
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in + 1 );
C = _mm256_load_si256( state_in + 2 );
D = _mm256_load_si256( state_in + 3 );
E = _mm256_load_si256( state_in + 4 );
F = _mm256_load_si256( state_in + 5 );
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
_mm256_store_si256( state_mid , A );
_mm256_store_si256( state_mid + 1, B );
_mm256_store_si256( state_mid + 2, C );
_mm256_store_si256( state_mid + 3, D );
_mm256_store_si256( state_mid + 4, E );
_mm256_store_si256( state_mid + 5, F );
_mm256_store_si256( state_mid + 6, G );
_mm256_store_si256( state_mid + 7, H );
}
// Completes a SHA-256 transform whose first 3 rounds were already done by
// sha256_8way_prehash_3rounds(): runs rounds 3..63 and adds the original
// chaining value (feed-forward), producing the full transform result.
//   state_out : out, final 8-word state (one transform's output)
//   data      : the 16 message words of the block, LE order
//   state_in  : the chaining value BEFORE this block (for the feed-forward)
//   state_mid : working variables after 3 rounds, from the prehash
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid )
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16];
// Local copy of the message schedule; expanded in place below.
memcpy_256( W, data, 16 );
// Resume from the 3-round midstate rather than state_in.
A = _mm256_load_si256( state_mid );
B = _mm256_load_si256( state_mid + 1 );
C = _mm256_load_si256( state_mid + 2 );
D = _mm256_load_si256( state_mid + 3 );
E = _mm256_load_si256( state_mid + 4 );
F = _mm256_load_si256( state_mid + 5 );
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
// Rounds 0..2 were already performed by the prehash; kept for reference.
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
#if !defined(__AVX512VL__)
// Re-seed the cached xor for round 3, whose MAJ operands are (F,G,H):
// presumably Y_xor_Z must hold G^H here -- confirm against the macro.
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
#endif
// Remaining rounds 3..15 use the raw message words directly.
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
// Rounds 16..63 in batches of 16: expand the message schedule in place
// (SHA2x_MEXP, defined elsewhere) then run 16 more rounds per iteration.
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
// Feed-forward: add the pre-block chaining value (state_in, not the
// 3-round midstate) to the working variables, per the SHA-256 spec.
A = _mm256_add_epi32( A, _mm256_load_si256( state_in ) );
B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) );
C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) );
D = _mm256_add_epi32( D, _mm256_load_si256( state_in + 3 ) );
E = _mm256_add_epi32( E, _mm256_load_si256( state_in + 4 ) );
F = _mm256_add_epi32( F, _mm256_load_si256( state_in + 5 ) );
G = _mm256_add_epi32( G, _mm256_load_si256( state_in + 6 ) );
H = _mm256_add_epi32( H, _mm256_load_si256( state_in + 7 ) );
_mm256_store_si256( state_out , A );
_mm256_store_si256( state_out + 1, B );
_mm256_store_si256( state_out + 2, C );
_mm256_store_si256( state_out + 3, D );
_mm256_store_si256( state_out + 4, E );
_mm256_store_si256( state_out + 5, F );
_mm256_store_si256( state_out + 6, G );
_mm256_store_si256( state_out + 7, H );
}
// need to handle odd byte length for yespower. // need to handle odd byte length for yespower.
// Assume only last update is odd. // Assume only last update is odd.

View File

@@ -53,4 +53,8 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
#define sha256_transform_be sph_sha256_transform_be #define sha256_transform_be sph_sha256_transform_be
#endif #endif
// SHA can't do only 3 rounds
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif #endif

View File

@@ -1,4 +1,4 @@
#include "sha256t-gate.h" #include "sha256d-4way.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
@@ -13,7 +13,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64))); __m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32))); __m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32))); __m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32))); __m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32))); __m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32))); __m512i vdata[20] __attribute__ ((aligned (32)));
@@ -46,11 +46,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data sha256_16way_transform_le( midstate1, vdata, initstate );
sha256_16way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block // Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do do
{ {
@@ -59,7 +58,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte; block[ 4] = last_byte;
memset_zero_512( block + 5, 10 ); memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1. // 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 ); memcpy_512( block, hash32, 8 );
@@ -99,7 +98,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64))); __m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32))); __m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32))); __m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -116,7 +116,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 ); const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ ) for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] ); vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -130,8 +130,10 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data sha256_8way_transform_le( midstate1, vdata, initstate );
sha256_8way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do do
{ {
@@ -140,7 +142,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte; block[ 4] = last_byte;
memset_zero_256( block + 5, 10 ); memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate ); sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1. // 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 ); memcpy_256( block, hash32, 8 );
@@ -253,3 +255,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
#endif #endif
/*
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};
*/

48
algo/sha/sha256d-4way.h Normal file
View File

@@ -0,0 +1,48 @@
#ifndef __SHA256D_4WAY_H__
#define __SHA256D_4WAY_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
/*
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#define SHA256D_4WAY 1
*/
#endif
bool register_sha256d_algo( algo_gate_t* gate );
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
/*
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
/*
#if defined(__SHA__)
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
#endif

View File

@@ -13,7 +13,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64))); __m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32))); __m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32))); __m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32))); __m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32))); __m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32))); __m512i vdata[20] __attribute__ ((aligned (32)));
@@ -31,7 +31,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const __m512i sixteen = m512_const1_32( 16 ); const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ ) for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] ); vdata[i] = m512_const1_32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -46,11 +46,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data sha256_16way_transform_le( midstate1, vdata, initstate );
sha256_16way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block // Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do do
{ {
@@ -59,7 +58,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte; block[ 4] = last_byte;
memset_zero_512( block + 5, 10 ); memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1. // 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 ); memcpy_512( block, hash32, 8 );
@@ -104,7 +103,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64))); __m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32))); __m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32))); __m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -121,7 +121,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 ); const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ ) for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] ); vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -135,8 +135,10 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data sha256_8way_transform_le( midstate1, vdata, initstate );
sha256_8way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do do
{ {
@@ -145,7 +147,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte; block[ 4] = last_byte;
memset_zero_256( block + 5, 10 ); memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate ); sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1. // 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 ); memcpy_256( block, hash32, 8 );

View File

@@ -702,6 +702,36 @@ memcpy( state_out, state_in, 32 );
} }
// Scalar counterpart of the vectorized prehash: performs only the first 3
// SHA-256 rounds (round constants K[0..2]) so the partial state can be
// cached and reused when only message words from index 3 onward change.
//   state_out : out, full 8-word state with A..H advanced by 3 rounds
//   data      : message words; only data[0..2] are consumed
//   state_in  : input chaining value (copied to state_out first)
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
// Y_xor_Z caches B^C for the MAJ macro.  X_xor_Y is presumably assigned
// inside MAJ as a side effect (common cpuminer-opt idiom) -- otherwise the
// `Y_xor_Z = X_xor_Y` below would read an uninitialized value.
// NOTE(review): MAJ's definition is not visible here; confirm.
uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
memcpy( state_out, state_in, 32 );
// Round 0: K[0] = 0x428A2F98.
t1 = state_out[7] + BSG2_1( state_out[4] )
+ CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
t2 = BSG2_0( state_out[0] )
+ MAJ( state_out[0], state_out[1], state_out[2] );
Y_xor_Z = X_xor_Y;
state_out[3] += t1;
state_out[7] = t1 + t2;
// Round 1: K[1] = 0x71374491 (working variables rotated by one).
t1 = state_out[6] + BSG2_1( state_out[3] )
+ CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
t2 = BSG2_0( state_out[7] )
+ MAJ( state_out[7], state_out[0], state_out[1] );
Y_xor_Z = X_xor_Y;
state_out[2] += t1;
state_out[6] = t1 + t2;
// Round 2: K[2] = 0xB5C0FBCF.
t1 = state_out[5] + BSG2_1( state_out[2] )
+ CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
t2 = BSG2_0( state_out[6] )
+ MAJ( state_out[6], state_out[7], state_out[0] );
state_out[1] += t1;
state_out[5] = t1 + t2;
}
/* see sph_sha2.h */ /* see sph_sha2.h */
void void
sph_sha224_init(void *cc) sph_sha224_init(void *cc)

View File

@@ -215,6 +215,9 @@ void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in ); const uint32_t *state_in );
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if SPH_64 #if SPH_64

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.1. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.18.1' PACKAGE_VERSION='3.18.2'
PACKAGE_STRING='cpuminer-opt 3.18.1' PACKAGE_STRING='cpuminer-opt 3.18.2'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.18.1 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.18.1:";; short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.18.1 cpuminer-opt configure 3.18.2
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.18.1, which was It was created by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.18.1' VERSION='3.18.2'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.18.1, which was This file was extended by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.18.1 cpuminer-opt config.status 3.18.2
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.18.1]) AC_INIT([cpuminer-opt], [3.18.2])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -1112,19 +1112,17 @@ void report_summary_log( bool force )
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url ); applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str ); applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min", applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, (double)submitted_share_count*60. / submit_rate, safe_div( (double)submitted_share_count*60.,
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) ); ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)", applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units ); shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
if ( accepted_share_count < submitted_share_count ) if ( accepted_share_count < submitted_share_count )
{ {
double lost_ghrate = uptime.tv_sec == 0 ? 0. double lost_ghrate = safe_div( target_diff
: target_diff * (double)(submitted_share_count - accepted_share_count ),
* (double)(submitted_share_count - accepted_share_count ) (double)uptime.tv_sec, 0. );
/ (double)uptime.tv_sec; double lost_shrate = safe_div( target_diff * (double)(submits - accepts ), share_time, 0. );
double lost_shrate = share_time == 0. ? 0.
: target_diff * (double)(submits - accepts ) / share_time;
char lshr_units[4] = {0}; char lshr_units[4] = {0};
char lghr_units[4] = {0}; char lghr_units[4] = {0};
scale_hash_for_display( &lost_shrate, lshr_units ); scale_hash_for_display( &lost_shrate, lshr_units );
@@ -2495,18 +2493,21 @@ static void *miner_thread( void *userdata )
timeval_subtract( &uptime, &total_hashes_time, &session_start ); timeval_subtract( &uptime, &total_hashes_time, &session_start );
double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. ); double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. );
scale_hash_for_display( &hashrate, hr_units ); if ( hashrate > 0. )
sprintf( hr, "%.2f", hashrate ); {
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32)) #if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
#else #else
float lo_freq = 0., hi_freq = 0.; float lo_freq = 0., hi_freq = 0.;
linux_cpu_hilo_freq( &lo_freq, &hi_freq ); linux_cpu_hilo_freq( &lo_freq, &hi_freq );
applog( LOG_NOTICE, applog( LOG_NOTICE,
"Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz", "Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz",
hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6, hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6,
hi_freq / 1e6 ); hi_freq / 1e6 );
#endif #endif
}
} }
} // benchmark } // benchmark
@@ -2900,6 +2901,7 @@ static bool cpu_capability( bool display_only )
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features ); bool algo_has_sha = set_incl( SHA_OPT, algo_features );
@@ -2907,6 +2909,8 @@ static bool cpu_capability( bool display_only )
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes; bool use_aes;
bool use_sse2; bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2; bool use_avx2;
bool use_avx512; bool use_avx512;
bool use_sha; bool use_sha;
@@ -2976,18 +2980,21 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_aes ) printf( " AES" ); else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha ) printf( " SHA" ); if ( sw_has_sha ) printf( " SHA" );
printf("\nAlgo features:"); if ( !display_only )
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{ {
if ( algo_has_avx512 ) printf( " AVX512" ); printf("\nAlgo features:");
else if ( algo_has_avx2 ) printf( " AVX2 " ); if ( algo_features == EMPTY_SET ) printf( " None" );
else if ( algo_has_sse42 ) printf( " SSE4.2" ); else
else if ( algo_has_sse2 ) printf( " SSE2 " ); {
if ( algo_has_vaes || if ( algo_has_avx512 ) printf( " AVX512" );
algo_has_vaes256 ) printf( " VAES" ); else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_aes ) printf( " AES" ); else if ( algo_has_sse42 ) printf( " SSE4.2" );
if ( algo_has_sha ) printf( " SHA" ); else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ||
algo_has_vaes256 ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
}
} }
printf("\n"); printf("\n");
@@ -3022,6 +3029,8 @@ static bool cpu_capability( bool display_only )
// Determine mining options // Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2; use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
@@ -3038,6 +3047,8 @@ static bool cpu_capability( bool display_only )
{ {
if ( use_avx512 ) printf( " AVX512" ); if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" ); else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
else if ( use_sse42 ) printf( " SSE42" );
else if ( use_sse2 ) printf( " SSE2" ); else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" ); if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" ); else if ( use_aes ) printf( " AES" );

18
miner.h
View File

@@ -868,9 +868,9 @@ Options:\n\
yespowerr16 Yenten (YTN)\n\ yespowerr16 Yenten (YTN)\n\
yespower-b2b generic yespower + blake2b\n\ yespower-b2b generic yespower + blake2b\n\
zr5 Ziftr\n\ zr5 Ziftr\n\
-N, --param-n N parameter for scrypt based algos\n\ -N, --param-n=N N parameter for scrypt based algos\n\
-R, --param-r R parameter for scrypt based algos\n\ -R, --param-r=N R parameter for scrypt based algos\n\
-K, --param-key Key (pers) parameter for algos that use it\n\ -K, --param-key=STRING Key (pers) parameter for algos that use it\n\
-o, --url=URL URL of mining server\n\ -o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\
-u, --user=USERNAME username for mining server\n\ -u, --user=USERNAME username for mining server\n\
@@ -886,8 +886,8 @@ Options:\n\
-s, --scantime=N upper bound on time spent scanning current work when\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\
long polling is unavailable, in seconds (default: 5)\n\ long polling is unavailable, in seconds (default: 5)\n\
--randomize Randomize scan range start to reduce duplicates\n\ --randomize Randomize scan range start to reduce duplicates\n\
-f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\ -f, --diff-factor=N Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\ -m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\
--hash-meter Display thread hash rates\n\ --hash-meter Display thread hash rates\n\
--coinbase-addr=ADDR payout address for solo mining\n\ --coinbase-addr=ADDR payout address for solo mining\n\
--coinbase-sig=TEXT data to insert in the coinbase when possible\n\ --coinbase-sig=TEXT data to insert in the coinbase when possible\n\
@@ -895,9 +895,9 @@ Options:\n\
--no-getwork disable getwork support\n\ --no-getwork disable getwork support\n\
--no-gbt disable getblocktemplate support\n\ --no-gbt disable getblocktemplate support\n\
--no-stratum disable X-Stratum support\n\ --no-stratum disable X-Stratum support\n\
--no-extranonce disable Stratum extranonce support\n\ --no-extranonce disable Stratum extranonce subscribe\n\
--no-redirect ignore requests to change the URL of the mining server\n\ --no-redirect ignore requests to change the URL of the mining server\n\
-q, --quiet disable per-thread hashmeter output\n\ -q, --quiet reduce log verbosity\n\
--no-color disable colored output\n\ --no-color disable colored output\n\
-D, --debug enable debug output\n\ -D, --debug enable debug output\n\
-P, --protocol-dump verbose dump of protocol-level activities\n" -P, --protocol-dump verbose dump of protocol-level activities\n"
@@ -916,9 +916,9 @@ Options:\n\
--max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
--max-diff=N Only mine if net difficulty is less than specified value\n\ --max-diff=N Only mine if net difficulty is less than specified value\n\
-c, --config=FILE load a JSON-format configuration file\n\ -c, --config=FILE load a JSON-format configuration file\n\
--data-file path and name of data file\n\ --data-file=FILE path and name of data file\n\
--verify enable additional time consuming start up tests\n\ --verify enable additional time consuming start up tests\n\
-V, --version display version information and exit\n\ -V, --version display version and CPU information and exit\n\
-h, --help display this help text and exit\n\ -h, --help display this help text and exit\n\
"; ";

View File

@@ -2,22 +2,21 @@
#define SIMD_INT_H__ 1 #define SIMD_INT_H__ 1
// Endian byte swap // Endian byte swap
#define bswap_64( a ) __builtin_bswap64( a ) #define bswap_64 __builtin_bswap64
#define bswap_32( a ) __builtin_bswap32( a ) #define bswap_32 __builtin_bswap32
// Bit rotation
#define rol64 __rolq
#define ror64 __rorq
#define rol32 __rold
#define ror32 __rord
// Safe division, integer or floating point. For floating point it's as // Safe division, integer or floating point. For floating point it's as
// safe as 0. is precisely zero. // safe as 0 is precisely zero.
// Returns safe_result if division by zero. // Returns safe_result if division by zero, typically zero.
#define safe_div( dividend, divisor, safe_result ) \ #define safe_div( dividend, divisor, safe_result ) \
( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) ) ( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )
// Aliases with familiar names for built in bit rotate instructions
#define rol64( a, n ) _lrotl( a, n )
#define ror64( a, n ) _lrotr( a, n )
#define rol32( a, n ) _rotl( a, n )
#define ror32( a, n ) _rotr( a, n )
#define rol16( a, n ) _rotwl( a, n )
#define ror16( a, n ) _rotwr( a, n )
/////////////////////////////////////// ///////////////////////////////////////
// //