Compare commits

...

2 Commits

Author      SHA1        Message  Date
Jay D Dee   1a234cbe53  v3.18.2  2021-10-19 22:35:36 -04:00
Jay D Dee   47cc5dcff5  v3.18.1  2021-10-10 22:50:19 -04:00
24 changed files with 2490 additions and 2975 deletions


@@ -32,14 +32,26 @@ but different package names.
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
support depending on your CPU and compiler version:
openssl 1.1.0e or higher.
"-march=native" is always the best choice
znver1 and znver2 should be recognized on most recent versions of GCC and
znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
In the meantime here are some suggestions for compiling for new CPUs:
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
"-march=native" is usually the best choice, used by build.sh.
"-msha" adds SHA to other tuning options.
"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recognized.
"-march=cascadelake -msha" or
"-march=cometlake -mavx512 -msha" can be used for Rocket Lake.
Features can also be added individually:
"-msha" adds support for HW accelerated sha256.
"-mavx512" adds support for 512 bit vectors
"-mvaes" add support for parallel AES
Additional instructions for static compilalation can be found here:
https://lxadm.com/Static_compilation_of_cpuminer
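For example (a hypothetical invocation, assuming the usual autotools flow
that build.sh wraps), the flags above are passed through CFLAGS at
configure time:

$ ./configure CFLAGS="-O3 -march=znver2 -msha" --with-curl
$ make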


@@ -171,6 +171,7 @@ cpuminer_SOURCES = \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \


@@ -65,10 +65,38 @@ If not what makes it happen or not happen?
Change Log
----------
v3.18.2
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
AVX512 for sha256d.
SSE42 and AVX may now be displayed as mining features at startup.
This is hard coded for each algo, and is only implemented for scrypt
at this time as it is the only algo where those features make a
significant performance difference.
Fixed an issue where a high hashrate algo could cause excessive invalid hash
rate log reports when starting up in benchmark mode.
v3.18.1
More speed for scrypt:
- additional scryptn2 optimizations for all CPU architectures,
- AVX2 is now used by default on CPUs with SHA but not AVX512,
- scrypt:1024 performance lost in v3.18.0 is restored,
- AVX512 & AVX2 improvements to scrypt:1024.
Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
Issue #337: fixed a problem that could display negative stats values in the
first summary report if the report was forced prematurely due to a stratum
diff change. The stats will still be invalid but should display zeros.
v3.18.0
Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
- AVX512 & SHA support for SHA256, AVX512 has priority,
- AVX512 & SHA support for sha256, AVX512 has priority,
- up to 50% increase in hashrate,
- memory requirements reduced 30-60% depending on CPU architecture,
- memory usage displayed at startup,

File diff suppressed because it is too large


@@ -28,7 +28,6 @@
*/
#include "algo-gate-api.h"
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
@@ -55,11 +54,25 @@ static const uint32_t sha256_initial_state[8] =
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
static int scrypt_throughput = 0;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SCRYPT_THROUGHPUT 16
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#else
#define SCRYPT_THROUGHPUT 4
#endif
// static int scrypt_throughput = 0;
static int scratchbuf_size = 0;
static __thread char *scratchbuf = NULL;
static __thread uint32_t *scratchbuf = NULL;
// change this to a constant to be used directly as input state arg
// vectors still need an init function.
@@ -146,6 +159,119 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
output[i] = bswap_32( ostate[i] );
}
#if defined(__SHA__)
static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
const uint32_t *key1, uint32_t *tstate0, uint32_t *tstate1,
uint32_t *ostate0, uint32_t *ostate1 )
{
uint32_t ihash0[8], ihash1[8], pad0[16], pad1[16];
int i;
memcpy( pad0, key0 + 16, 16 );
memcpy( pad0 + 4, keypad, 48 );
memcpy( pad1, key1 + 16, 16 );
memcpy( pad1 + 4, keypad, 48 );
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
tstate0, tstate1 );
memcpy( ihash0, tstate0, 32 );
memcpy( ihash1, tstate1, 32 );
for ( i = 0; i < 8; i++ )
{
pad0[i] = ihash0[i] ^ 0x5c5c5c5c;
pad1[i] = ihash1[i] ^ 0x5c5c5c5c;
}
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c;
sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1,
sha256_initial_state, sha256_initial_state );
for ( i = 0; i < 8; i++ )
{
pad0[i] = ihash0[i] ^ 0x36363636;
pad1[i] = ihash1[i] ^ 0x36363636;
}
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636;
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
sha256_initial_state, sha256_initial_state );
}
static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0,
const uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1,
const uint32_t *salt0, const uint32_t *salt1, uint32_t *output0,
uint32_t *output1 )
{
uint32_t istate0[8], istate1[8], ostateb0[8], ostateb1[8];
uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16];
int i, j;
sha256_ni2way_transform_le( istate0, istate1, salt0, salt1,
tstate0, tstate1 );
memcpy( ibuf0, salt0 + 16, 16 );
memcpy( ibuf0 + 5, innerpad, 44 );
memcpy( obuf0 + 8, outerpad, 32 );
memcpy( ibuf1, salt1 + 16, 16 );
memcpy( ibuf1 + 5, innerpad, 44 );
memcpy( obuf1 + 8, outerpad, 32 );
for ( i = 0; i < 4; i++ )
{
memcpy( obuf0, istate0, 32 );
memcpy( obuf1, istate1, 32 );
ibuf0[4] = ibuf1[4] = i + 1;
sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1,
obuf0, obuf1 );
sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1,
ostate0, ostate1 );
for ( j = 0; j < 8; j++ )
{
output0[ 8*i + j ] = bswap_32( ostateb0[j] );
output1[ 8*i + j ] = bswap_32( ostateb1[j] );
}
}
}
static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1,
const uint32_t *salt0, const uint32_t *salt1,
uint32_t *output0, uint32_t *output1 )
{
uint32_t buf0[16], buf1[16];
int i;
sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1,
tstate0, tstate1 );
sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16,
tstate0, tstate1 );
sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk,
tstate0, tstate1 );
memcpy( buf0, tstate0, 32 );
memcpy( buf0 + 8, outerpad, 32 );
memcpy( buf1, tstate1, 32 );
memcpy( buf1 + 8, outerpad, 32 );
sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1,
ostate0, ostate1 );
for ( i = 0; i < 8; i++ )
{
output0[i] = bswap_32( ostate0[i] );
output1[i] = bswap_32( ostate1[i] );
}
}
#endif
#ifdef HAVE_SHA256_4WAY
static const uint32_t keypad_4way[4 * 12] = {
@@ -596,15 +722,11 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512
//#if defined(USE_ASM) && defined(__x86_64__)
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
//int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
//#if defined(USE_AVX2)
#if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
@@ -614,40 +736,39 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
//#define scrypt_best_throughput() 1
#endif
#include "scrypt-core-4way.h"
static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id )
/*
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{
uint32_t tstate[8], ostate[8];
uint32_t X[32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, V, N ); // working
scrypt_core_simd128( X, scratchbuf, N ); // working
// scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N);
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true;
}
*/
#if defined(__AVX2__)
#if ( SCRYPT_THROUGHPUT == 8 )
static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[8 * 8];
uint32_t _ALIGN(128) ostate[8 * 8];
uint32_t _ALIGN(128) W[8 * 32];
uint32_t _ALIGN(128) X[8 * 32];
uint32_t *V = (uint32_t*)scratchpad;
uint32_t _ALIGN(128) tstate[ 8*8 ];
uint32_t _ALIGN(128) ostate[ 8*8 ];
uint32_t _ALIGN(128) W[ 8*32 ];
uint32_t _ALIGN(128) X[ 8*32 ];
intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60,
input+80, input+100, input+120, input+140, 640 );
@@ -658,53 +779,45 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W );
dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 );
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
}
else
{
intrlv_2x128( W, X, X+ 32, 1024 );
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
intrlv_2x128( W+128, X+128, X+160, 1024 );
intrlv_2x128( W+192, X+192, X+224, 1024 );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)scratchbuf, N );
dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
dintrlv_2x128( X+128, X+160, W+128, 1024 );
dintrlv_2x128( X+192, X+224, W+192, 1024 );
}
// SCRYPT CORE
// AVX512
/*
// AVX512 16 way working
intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224,
X+256, X+256+32, X+256+64, X+256+96, X+256+128,
X+256+160, X+256+192, X+256+224, 1024 );
scrypt_core_16way( (__m512i*)W , (__m512i*)V, N );
dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224,
X+256, X+256+32, X+256+64, X+256+96, X+256+128,
X+256+160, X+256+192, X+256+224, W, 1024 );
*/
/*
// AVX512 working
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
*/
/*
// AVX512, not working, very slow
intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
*/
// AVX2
/*
// AVX2
// disable de/interleave for testing.
scrypt_core_8way( (__m256i*)W , (__m256i*)V, N );
*/
// scrypt_core_8way( (__m256i*)W , (__m256i*)V, N );
/*
// AVX2 working
@@ -714,23 +827,18 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
intrlv_2x128( W+192, X+192, X+224, 1024 );
// working
// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N );
// scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
// scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
// working
scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
// working
// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
@@ -745,18 +853,10 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
intrlv_2x32( W+128, X+128, X+160, 1024 );
intrlv_2x32( W+192, X+192, X+224, 1024 );
// working, deprecated, not up to date
// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N );
// deprecated, not up to date
// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N );
// working
// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
@@ -813,19 +913,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
*/
/**************
scrypt_core_simd128_3buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
/*
// SSE2 working
scrypt_core_simd128_4buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4buf( X+128, V, N );
*/
*************/
if ( work_restart[thrid].restart ) return 0;
@@ -842,16 +936,15 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if ( SCRYPT_THROUGHPUT == 16 )
static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 16*8 ];
uint32_t _ALIGN(128) ostate[ 16*8 ];
uint32_t _ALIGN(128) W[ 16*32 ];
uint32_t _ALIGN(128) X[ 16*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60,
input+ 80, input+100, input+120, input+140,
@@ -868,6 +961,39 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
W, 1024 );
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+256, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+352, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, scratchbuf, N );
}
else
{
intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 );
intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)scratchbuf, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 );
dintrlv_4x128( X+384, X+416, X+448, X+480, W+384, 1024 );
}
// SCRYPT CORE
@@ -888,23 +1014,40 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
// AVX512 working
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 );
dintrlv_4x32( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 );
*/
/*
// AVX512, not working, very slow
// AVX512, working
intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x128( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
intrlv_4x128( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256+128), (__m512i*)V, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x128( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 );
dintrlv_4x128( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 );
*/
// AVX2
/*
@@ -919,16 +1062,19 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
intrlv_2x128( W+128, X+128, X+160, 1024 );
intrlv_2x128( W+192, X+192, X+224, 1024 );
// working
// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
intrlv_2x128( W+256, X+256, X+256+ 32, 1024 );
intrlv_2x128( W+256+ 64, X+256+ 64, X+256+ 96, 1024 );
intrlv_2x128( W+256+128, X+256+128, X+256+160, 1024 );
intrlv_2x128( W+256+192, X+256+192, X+256+224, 1024 );
// working
scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128_2buf( (__m256i*)(W+256), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128_2buf( (__m256i*)(W+256+128), (__m256i*)V, N );
// working
// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
@@ -938,11 +1084,23 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+256), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+256+ 64), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+256+128), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+256+192), (__m256i*)V, N );
dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
dintrlv_2x128( X+128, X+160, W+128, 1024 );
dintrlv_2x128( X+192, X+224, W+192, 1024 );
dintrlv_2x128( X+256, X+256+ 32, W+256, 1024 );
dintrlv_2x128( X+256+ 64, X+256+ 96, W+256+ 64, 1024 );
dintrlv_2x128( X+256+128, X+256+160, W+256+128, 1024 );
dintrlv_2x128( X+256+192, X+256+224, W+256+192, 1024 );
*/
/*
@@ -952,18 +1110,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_2x32( W+128, X+128, X+160, 1024 );
intrlv_2x32( W+192, X+192, X+224, 1024 );
// working, deprecated, not up to date
// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N );
// deprecated, not up to date
// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N );
// working
// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
@@ -1043,7 +1196,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
*/
/***************
scrypt_core_simd128_3buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
@@ -1055,17 +1208,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
scrypt_core_simd128_3buf( X+352, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
/*
// SSE2 working
scrypt_core_simd128_4buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4buf( X+128, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4buf( X+256, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4buf( X+384, V, N );
*/
********************/
/*
scrypt_core_3way( X, V, N );
if ( work_restart[thrid].restart ) return 0;
@@ -1100,15 +1243,37 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
#endif // AVX512
#if defined(__SHA__)
#if 0
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 2*8 ];
uint32_t _ALIGN(128) ostate[ 2*8 ];
uint32_t _ALIGN(128) W[ 2*32 ];
memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 );
HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8,
ostate, ostate+8 );
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
input, input+20, W, W+32 );
scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
output, output+8 );
return 1;
}
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[4 * 8];
uint32_t _ALIGN(128) ostate[4 * 8];
uint32_t _ALIGN(128) W[4 * 32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 );
@@ -1139,9 +1304,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
*/
// working, double buffered linear simd
scrypt_core_simd128_2buf( W, V, N );
scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( W+64, V, N );
scrypt_core_simd128_2buf( W+64, scratchbuf, N );
/*
scrypt_core_simd128_3buf( W, V, N );
@@ -1149,8 +1314,6 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
scrypt_core_simd128( W+96, V, N );
*/
// working
// scrypt_core_simd128_4buf( W, V, N );
if ( work_restart[thrid].restart ) return 0;
@@ -1164,18 +1327,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1;
}
#endif
#else
#ifdef HAVE_SHA256_4WAY
#if ( SCRYPT_THROUGHPUT == 4 )
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[4 * 8];
uint32_t _ALIGN(128) ostate[4 * 8];
uint32_t _ALIGN(128) W[4 * 32];
uint32_t _ALIGN(128) X[4 * 32];
uint32_t *V = (uint32_t*)scratchpad;
uint32_t _ALIGN(128) tstate[ 4*8 ];
uint32_t _ALIGN(128) ostate[ 4*8 ];
uint32_t _ALIGN(128) W[ 4*32 ];
intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
for ( int i = 0; i < 8; i++ )
@@ -1184,7 +1344,21 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
HMAC_SHA256_80_init_4way(W, tstate, ostate);
PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
if ( opt_param_n > 0x4000 )
{
uint32_t _ALIGN(128) X[ 4*32 ];
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
scrypt_core_simd128_2buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+64, scratchbuf, N );
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
}
else
scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N );
// dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
////// SCRYPT_CORE
@@ -1202,35 +1376,23 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128( X+96, V, N );
*/
/*
// working, double buffered linear simd, best for n2
scrypt_core_simd128_2buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+64, V, N );
*/
/*
scrypt_core_simd128_3buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128( X+96, V, N );
*/
// working
// scrypt_core_simd128_4buf( X, V, N );
/*
// original
scrypt_core(X + 0 * 32, V, N);
scrypt_core(X + 1 * 32, V, N);
scrypt_core(X + 2 * 32, V, N);
scrypt_core(X + 3 * 32, V, N);
*/
////////////////////////////////
if ( work_restart[thrid].restart ) return 0;
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
// intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
@@ -1238,58 +1400,73 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
return 1;
}
#endif /* HAVE_SHA256_4WAY */
#endif // SCRYPT_THROUGHPUT == 4
#endif // SHA
//#endif // SHA
extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[ 8*SCRYPT_THROUGHPUT ];
uint32_t _ALIGN(64) data[ 20*SCRYPT_THROUGHPUT ];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
int thr_id = mythr->id;
int throughput = scrypt_throughput;
int i;
int i;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( i = 0; i < throughput; i++ )
memcpy( data + i * 20, pdata, 80 );
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
memcpy( data + i * 20, pdata, 80 );
sha256_transform_le( midstate, data, sha256_initial_state );
do {
do {
bool rc = true;
for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n;
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) data[ i*20 + 19 ] = ++n;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
if ( throughput == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
else
#endif
#if defined(__AVX2__)
if ( throughput == 8 )
rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
else
#endif
if ( throughput == 4 )
#if defined(__SHA__)
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if ( SCRYPT_THROUGHPUT == 16 )
// if ( SCRYPT_THROUGHPUT == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, opt_param_n,
thr_id );
// else
//#endif
//#if defined(__AVX2__)
#elif ( SCRYPT_THROUGHPUT == 8 )
// if ( SCRYPT_THROUGHPUT == 8 )
rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
thr_id );
// else
//#endif
#elif ( SCRYPT_THROUGHPUT == 4 )
// if ( SCRYPT_THROUGHPUT == 4 ) // slower on Ryzen than 8way
//#if defined(__SHA__)
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
// thr_id );
//#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
thr_id );
#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
#endif
else
rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
#error "Invalid SCRYPT_THROUGHPUT"
#endif
/*
#if defined(__SHA__)
else
if ( SCRYPT_THROUGHPUT == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
thr_id );
#endif
else // should never get here
rc = scrypt_N_1_1_256( data, hash, midstate, opt_param_n, thr_id );
*/
// test the hash
if ( rc )
for ( i = 0; i < throughput; i++ )
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
{
if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) )
{
@@ -1301,7 +1478,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
}
} while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) );
} while ( likely( ( n < ( max_nonce - SCRYPT_THROUGHPUT ) ) && !(*restart) ) );
*hashes_done = n - pdata[19];
pdata[19] = n;
@@ -1319,29 +1496,45 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__)
gate->optimizations = SSE2_OPT | SHA_OPT;
#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#endif
//#if defined(__SHA__)
// gate->optimizations = SSE2_OPT | SHA_OPT;
//#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT;
//#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
opt_target_factor = 65536.0;
opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n );
// scrypt_throughput can be defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
scrypt_throughput = 16;
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
/* SHA is slower than AVX2 on Ryzen
#elif defined(__SHA__)
scrypt_throughput = 4;
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
*/
#elif defined(__AVX2__)
scrypt_throughput = 8;
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
// scrypt_throughput = 8;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else
scrypt_throughput = 4;
// scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#endif
char t_units[4] = {0};
@@ -1353,7 +1546,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
format_number_si( &d_size, d_units );
applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
scrypt_throughput, t_size, t_units, d_size, d_units );
SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );
return true;
};
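For a sense of scale (worked arithmetic, using a hypothetical
opt_param_n = 1048576, i.e. > 0x4000): the AVX512 branch above allocates
opt_param_n * 3 * 128 = 402653184 bytes, or 384 MiB of scratch per
thread, while the SSE2 branch's 2-buf sizing needs
opt_param_n * 2 * 128 = 256 MiB per thread.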


@@ -51,7 +51,6 @@ typedef struct {
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
@@ -74,7 +73,6 @@ typedef struct {
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
@@ -86,6 +84,11 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -96,7 +99,6 @@ typedef struct {
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );


@@ -8,7 +8,7 @@
* any later version. See COPYING for more details.
*/
#include "algo-gate-api.h"
#include "sha256d-4way.h"
#include <string.h>
#include <inttypes.h>
@@ -181,6 +181,8 @@ static const uint32_t sha256d_hash1[16] = {
};
// This performs the entire hash all over again. Why?
// Because the main function only does 56 rounds.
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
uint32_t S[16];
@@ -492,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_4way( struct work *work,
static inline int scanhash_sha256d_4way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -553,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way( struct work *work,
static inline int scanhash_sha256d_8way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -609,7 +611,7 @@ static inline int scanhash_sha256d_8way( struct work *work,
#endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d( struct work *work,
int scanhash_sha256d_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -625,11 +627,11 @@ int scanhash_sha256d( struct work *work,
#ifdef HAVE_SHA256_8WAY
if (sha256_use_8way())
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr );
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
#endif
#ifdef HAVE_SHA256_4WAY
if (sha256_use_4way())
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr );
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
#endif
memcpy(data, pdata + 16, 64);
@@ -690,9 +692,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256d;
// gate->hash = (void*)&sha256d;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#else
gate->scanhash = (void*)&scanhash_sha256d_pooler;
#endif
// gate->hash = (void*)&sha256d;
return true;
};

File diff suppressed because it is too large


@@ -53,4 +53,8 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
#define sha256_transform_be sph_sha256_transform_be
#endif
// SHA can't do only 3 rounds
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif


@@ -1,4 +1,4 @@
#include "sha256t-gate.h"
#include "sha256d-4way.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -13,7 +13,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
@@ -46,11 +46,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform_le( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
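// Why only 3 rounds can be hoisted: the final 16 bytes of data are
// pdata[16..19], and only word 3 (pdata[19]) carries the nonce.
// SHA-256 round i consumes message word W[i], so rounds 0-2 depend
// only on the three constant words and can be computed once, outside
// the nonce loop, then finished per nonce by the final_rounds call.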
do
{
@@ -59,7 +58,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
@@ -99,7 +98,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -116,7 +116,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -130,8 +130,10 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform_le( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -140,7 +142,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate );
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
@@ -253,3 +255,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
#endif
/*
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};
*/

algo/sha/sha256d-4way.h (new file, 48 lines added)

@@ -0,0 +1,48 @@
#ifndef __SHA256D_4WAY_H__
#define __SHA256D_4WAY_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
/*
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#define SHA256D_4WAY 1
*/
#endif
bool register_sha256d_algo( algo_gate_t* gate );
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
/*
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
/*
#if defined(__SHA__)
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
#endif


@@ -13,7 +13,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
@@ -31,7 +31,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );
vdata[i] = m512_const1_32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -46,11 +46,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform_le( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -59,7 +58,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
@@ -104,7 +103,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -121,7 +121,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -135,9 +135,11 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform_le( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
@@ -145,7 +147,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate );
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );


@@ -702,6 +702,36 @@ memcpy( state_out, state_in, 32 );
}
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
memcpy( state_out, state_in, 32 );
t1 = state_out[7] + BSG2_1( state_out[4] )
+ CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
t2 = BSG2_0( state_out[0] )
+ MAJ( state_out[0], state_out[1], state_out[2] );
Y_xor_Z = X_xor_Y;
state_out[3] += t1;
state_out[7] = t1 + t2;
t1 = state_out[6] + BSG2_1( state_out[3] )
+ CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
t2 = BSG2_0( state_out[7] )
+ MAJ( state_out[7], state_out[0], state_out[1] );
Y_xor_Z = X_xor_Y;
state_out[2] += t1;
state_out[6] = t1 + t2;
t1 = state_out[5] + BSG2_1( state_out[2] )
+ CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
t2 = BSG2_0( state_out[6] )
+ MAJ( state_out[6], state_out[7], state_out[0] );
state_out[1] += t1;
state_out[5] = t1 + t2;
}
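// The bare Y_xor_Z = X_xor_Y assignments above rely on MAJ assigning
// X_xor_Y as a side effect. Elsewhere in this file MAJ is defined
// along these lines (assumed here; the macro is outside this diff):
//
//   #define MAJ( X, Y, Z )   ( Y ^ ( ( X_xor_Y = X ^ Y ) & Y_xor_Z ) )
//
// Each round then reuses this round's X ^ Y as the next round's
// Y ^ Z, saving one XOR per round.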
/* see sph_sha2.h */
void
sph_sha224_init(void *cc)


@@ -215,6 +215,9 @@ void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if SPH_64


@@ -70,6 +70,8 @@ extern "C"{
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = _mm256_set1_epi32( 5 ); \
const __m256i THREE = _mm256_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
#define READ_STATE8(state) do \
@@ -314,8 +316,7 @@ do { \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
_mm256_set1_epi32(5UL) ) ), \
_mm256_set1_epi32(3UL) ) ) ); \
FIVE ) ), THREE ) ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
@@ -667,7 +668,9 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
C8, C9, CA, CB, CC, CD, CE, CF; \
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
const __m128i FIVE = _mm_set1_epi32( 5 ); \
const __m128i THREE = _mm_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
#define READ_STATE(state) do \
{ \
@@ -931,8 +934,8 @@ do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
_mm_andnot_si128( xb3, xb2 ), \
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
) ), _mm_set1_epi32(3UL) ) ) ) ); \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \
) ), THREE ) ) ) ); \
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
} while (0)


@@ -18,16 +18,20 @@
#ifndef __INTTYPES_H_
#define __INTTYPES_H_
#include <stdint.h>
/* Use [u]intN_t if you need exactly N bits.
XXX - doesn't handle the -mint8 option. */
typedef signed char swift_int8_t;
typedef unsigned char swift_uint8_t;
typedef int swift_int16_t;
typedef int32_t swift_int16_t;
// typedef int swift_int16_t;
typedef unsigned int swift_uint16_t;
typedef long swift_int32_t;
typedef int32_t swift_int32_t;
// typedef long swift_int32_t;
typedef unsigned long swift_uint32_t;
typedef long long swift_int64_t;


@@ -18,6 +18,8 @@
//#include "stdbool.h"
#include <memory.h>
#include "simd-utils.h"
///////////////////////////////////////////////////////////////////////////////////////////////
// Constants and static tables portion.
///////////////////////////////////////////////////////////////////////////////////////////////
@@ -49,20 +51,20 @@
// - A: the first operand. After the operation stores the sum of the two operands.
// - B: the second operand. After the operation stores the difference between the first and the
// second operands.
#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
// Quickly reduces an integer modulo 257.
//
// Parameters:
// - A: the input.
#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
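// Why Q_REDUCE works: 256 is congruent to -1 mod 257, so
// A = 256*h + l (h = A >> 8, l = A & 0xff) is congruent to l - h.
// Worked example: A = 300 reduces to 44 - 1 = 43 = 300 mod 257.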
// Since we need to do the setup only once, this is the indicator variable:
static bool wasSetupDone = false;
// This array stores the powers of omegas that correspond to the indices, which are the input
// values. Known also as the "outer FFT twiddle factors".
swift_int16_t multipliers[N];
swift_int16_t multipliers[N] __attribute__ ((aligned (64)));
// This array stores the powers of omegas, multiplied by the corresponding values.
// We store this table to save computation time.
@@ -72,14 +74,14 @@ swift_int16_t multipliers[N];
// compression function, i is between 0 and 31, x_i is a 64-bit value.
// One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper --
// formula (2), section 3, page 6.
swift_int16_t fftTable[256 * EIGHTH_N];
swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64)));
// The A's we use in SWIFFTX shall be random elements of Z_257.
// We generated these A's from the decimal expansion of PI as follows: we converted each
// triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A
element, otherwise moved to the next triple of digits in the expansion. This guarantees that
// the A's are random, provided that PI digits are.
const swift_int16_t As[3 * M * N] =
const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) =
{141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78,
50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93,
95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105,
@@ -636,9 +638,202 @@ void InitializeSWIFFTX()
wasSetupDone = true;
}
// In the original code the F matrix is rotated so it was not arranged
// the same as all the other data. Rearranging F to match all the other
// data made vectorizing possible; the compiler probably could have
// auto-vectorized it with proper data organisation.
// Also, the custom 16 bit data types of the original code are now all
// 32 bit int32_t regardless of the type name.
//
void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
{
swift_int16_t *mult = multipliers;
#if defined(__AVX2__)
__m256i F[8] __attribute__ ((aligned (64)));
__m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output;
__m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] );
F[0] = _mm256_mullo_epi32( mul[0], *tbl );
tbl = (__m256i*)&( fftTable[ input[1] << 3 ] );
F[1] = _mm256_mullo_epi32( mul[1], *tbl );
tbl = (__m256i*)&( fftTable[ input[2] << 3 ] );
F[2] = _mm256_mullo_epi32( mul[2], *tbl );
tbl = (__m256i*)&( fftTable[ input[3] << 3 ] );
F[3] = _mm256_mullo_epi32( mul[3], *tbl );
tbl = (__m256i*)&( fftTable[ input[4] << 3 ] );
F[4] = _mm256_mullo_epi32( mul[4], *tbl );
tbl = (__m256i*)&( fftTable[ input[5] << 3 ] );
F[5] = _mm256_mullo_epi32( mul[5], *tbl );
tbl = (__m256i*)&( fftTable[ input[6] << 3 ] );
F[6] = _mm256_mullo_epi32( mul[6], *tbl );
tbl = (__m256i*)&( fftTable[ input[7] << 3 ] );
F[7] = _mm256_mullo_epi32( mul[7], *tbl );
#define ADD_SUB( a, b ) \
{ \
__m256i tmp = b; \
b = _mm256_sub_epi32( a, b ); \
a = _mm256_add_epi32( a, tmp ); \
}
ADD_SUB( F[0], F[1] );
ADD_SUB( F[2], F[3] );
ADD_SUB( F[4], F[5] );
ADD_SUB( F[6], F[7] );
F[3] = _mm256_slli_epi32( F[3], 4 );
F[7] = _mm256_slli_epi32( F[7], 4 );
ADD_SUB( F[0], F[2] );
ADD_SUB( F[1], F[3] );
ADD_SUB( F[4], F[6] );
ADD_SUB( F[5], F[7] );
F[5] = _mm256_slli_epi32( F[5], 2 );
F[6] = _mm256_slli_epi32( F[6], 4 );
F[7] = _mm256_slli_epi32( F[7], 6 );
ADD_SUB( F[0], F[4] );
ADD_SUB( F[1], F[5] );
ADD_SUB( F[2], F[6] );
ADD_SUB( F[3], F[7] );
#undef ADD_SUB
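// A note on the shifts, inferred from the structure of the code: the
// slli between butterfly layers multiply by 4, 16 and 64, i.e. powers
// of 2. Since 2^16 = 65536 = 255*257 + 1, 2 is a 16th root of unity
// mod 257, so these shifts apply the FFT twiddle factors without any
// true multiplies.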
#if defined (__AVX512VL__) && defined(__AVX512BW__)
#define Q_REDUCE( a ) \
_mm256_sub_epi32( _mm256_and_si256( a, \
_mm256_movm_epi8( 0x11111111 ) ), _mm256_srai_epi32( a, 8 ) )
#else
#define Q_REDUCE( a ) \
_mm256_sub_epi32( _mm256_and_si256( a, \
m256_const1_32( 0x000000ff ) ), _mm256_srai_epi32( a, 8 ) )
#endif
out[0] = Q_REDUCE( F[0] );
out[1] = Q_REDUCE( F[1] );
out[2] = Q_REDUCE( F[2] );
out[3] = Q_REDUCE( F[3] );
out[4] = Q_REDUCE( F[4] );
out[5] = Q_REDUCE( F[5] );
out[6] = Q_REDUCE( F[6] );
out[7] = Q_REDUCE( F[7] );
#undef Q_REDUCE
#elif defined(__SSE4_1__)
__m128i F[16] __attribute__ ((aligned (64)));
__m128i *mul = (__m128i*)multipliers;
__m128i *out = (__m128i*)output;
__m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] );
F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] );
F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[1] << 3 ] );
F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] );
F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[2] << 3 ] );
F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] );
F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[3] << 3 ] );
F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] );
F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[4] << 3 ] );
F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] );
F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[5] << 3 ] );
F[10] = _mm_mullo_epi32( mul[10], tbl[0] );
F[11] = _mm_mullo_epi32( mul[11], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[6] << 3 ] );
F[12] = _mm_mullo_epi32( mul[12], tbl[0] );
F[13] = _mm_mullo_epi32( mul[13], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[7] << 3 ] );
F[14] = _mm_mullo_epi32( mul[14], tbl[0] );
F[15] = _mm_mullo_epi32( mul[15], tbl[1] );
#define ADD_SUB( a, b ) \
{ \
__m128i tmp = b; \
b = _mm_sub_epi32( a, b ); \
a = _mm_add_epi32( a, tmp ); \
}
ADD_SUB( F[ 0], F[ 2] );
ADD_SUB( F[ 1], F[ 3] );
ADD_SUB( F[ 4], F[ 6] );
ADD_SUB( F[ 5], F[ 7] );
ADD_SUB( F[ 8], F[10] );
ADD_SUB( F[ 9], F[11] );
ADD_SUB( F[12], F[14] );
ADD_SUB( F[13], F[15] );
F[ 6] = _mm_slli_epi32( F[ 6], 4 );
F[ 7] = _mm_slli_epi32( F[ 7], 4 );
F[14] = _mm_slli_epi32( F[14], 4 );
F[15] = _mm_slli_epi32( F[15], 4 );
ADD_SUB( F[ 0], F[ 4] );
ADD_SUB( F[ 1], F[ 5] );
ADD_SUB( F[ 2], F[ 6] );
ADD_SUB( F[ 3], F[ 7] );
ADD_SUB( F[ 8], F[12] );
ADD_SUB( F[ 9], F[13] );
ADD_SUB( F[10], F[14] );
ADD_SUB( F[11], F[15] );
F[10] = _mm_slli_epi32( F[10], 2 );
F[11] = _mm_slli_epi32( F[11], 2 );
F[12] = _mm_slli_epi32( F[12], 4 );
F[13] = _mm_slli_epi32( F[13], 4 );
F[14] = _mm_slli_epi32( F[14], 6 );
F[15] = _mm_slli_epi32( F[15], 6 );
ADD_SUB( F[ 0], F[ 8] );
ADD_SUB( F[ 1], F[ 9] );
ADD_SUB( F[ 2], F[10] );
ADD_SUB( F[ 3], F[11] );
ADD_SUB( F[ 4], F[12] );
ADD_SUB( F[ 5], F[13] );
ADD_SUB( F[ 6], F[14] );
ADD_SUB( F[ 7], F[15] );
#undef ADD_SUB
#define Q_REDUCE( a ) \
_mm_sub_epi32( _mm_and_si128( a, \
m128_const1_32( 0x000000ff ) ), _mm_srai_epi32( a, 8 ) )
out[ 0] = Q_REDUCE( F[ 0] );
out[ 1] = Q_REDUCE( F[ 1] );
out[ 2] = Q_REDUCE( F[ 2] );
out[ 3] = Q_REDUCE( F[ 3] );
out[ 4] = Q_REDUCE( F[ 4] );
out[ 5] = Q_REDUCE( F[ 5] );
out[ 6] = Q_REDUCE( F[ 6] );
out[ 7] = Q_REDUCE( F[ 7] );
out[ 8] = Q_REDUCE( F[ 8] );
out[ 9] = Q_REDUCE( F[ 9] );
out[10] = Q_REDUCE( F[10] );
out[11] = Q_REDUCE( F[11] );
out[12] = Q_REDUCE( F[12] );
out[13] = Q_REDUCE( F[13] );
out[14] = Q_REDUCE( F[14] );
out[15] = Q_REDUCE( F[15] );
#undef Q_REDUCE
#else // < SSE4.1
swift_int16_t *mult = multipliers;
// First loop unrolling:
register swift_int16_t *table = &(fftTable[input[0] << 3]);
/*
swift_int32_t F[64];
@@ -666,11 +861,8 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
F60, F61, F62, F63;
// First loop unrolling:
register swift_int16_t *table = &(fftTable[input[0] << 3]);
F0 = mult[0] * table[0];
F8 = mult[1] * table[1];
F0 = mult[0] * table[0];
F8 = mult[1] * table[1];
F16 = mult[2] * table[2];
F24 = mult[3] * table[3];
F32 = mult[4] * table[4];
@@ -678,90 +870,93 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
F48 = mult[6] * table[6];
F56 = mult[7] * table[7];
mult += 8;
table = &(fftTable[input[1] << 3]);
F1 = mult[0] * table[0];
F9 = mult[1] * table[1];
F17 = mult[2] * table[2];
F25 = mult[3] * table[3];
F33 = mult[4] * table[4];
F41 = mult[5] * table[5];
F49 = mult[6] * table[6];
F57 = mult[7] * table[7];
F1 = mult[ 8] * table[0];
F9 = mult[ 9] * table[1];
F17 = mult[10] * table[2];
F25 = mult[11] * table[3];
F33 = mult[12] * table[4];
F41 = mult[13] * table[5];
F49 = mult[14] * table[6];
F57 = mult[15] * table[7];
mult += 8;
table = &(fftTable[input[2] << 3]);
F2 = mult[0] * table[0];
F10 = mult[1] * table[1];
F18 = mult[2] * table[2];
F26 = mult[3] * table[3];
F34 = mult[4] * table[4];
F42 = mult[5] * table[5];
F50 = mult[6] * table[6];
F58 = mult[7] * table[7];
F2 = mult[16] * table[0];
F10 = mult[17] * table[1];
F18 = mult[18] * table[2];
F26 = mult[19] * table[3];
F34 = mult[20] * table[4];
F42 = mult[21] * table[5];
F50 = mult[22] * table[6];
F58 = mult[23] * table[7];
mult += 8;
table = &(fftTable[input[3] << 3]);
F3 = mult[0] * table[0];
F11 = mult[1] * table[1];
F19 = mult[2] * table[2];
F27 = mult[3] * table[3];
F35 = mult[4] * table[4];
F43 = mult[5] * table[5];
F51 = mult[6] * table[6];
F59 = mult[7] * table[7];
F3 = mult[24] * table[0];
F11 = mult[25] * table[1];
F19 = mult[26] * table[2];
F27 = mult[27] * table[3];
F35 = mult[28] * table[4];
F43 = mult[29] * table[5];
F51 = mult[30] * table[6];
F59 = mult[31] * table[7];
mult += 8;
table = &(fftTable[input[4] << 3]);
F4 = mult[0] * table[0];
F12 = mult[1] * table[1];
F20 = mult[2] * table[2];
F28 = mult[3] * table[3];
F36 = mult[4] * table[4];
F44 = mult[5] * table[5];
F52 = mult[6] * table[6];
F60 = mult[7] * table[7];
F4 = mult[32] * table[0];
F12 = mult[33] * table[1];
F20 = mult[34] * table[2];
F28 = mult[35] * table[3];
F36 = mult[36] * table[4];
F44 = mult[37] * table[5];
F52 = mult[38] * table[6];
F60 = mult[39] * table[7];
mult += 8;
table = &(fftTable[input[5] << 3]);
F5 = mult[0] * table[0];
F13 = mult[1] * table[1];
F21 = mult[2] * table[2];
F29 = mult[3] * table[3];
F37 = mult[4] * table[4];
F45 = mult[5] * table[5];
F53 = mult[6] * table[6];
F61 = mult[7] * table[7];
F5 = mult[40] * table[0];
F13 = mult[41] * table[1];
F21 = mult[42] * table[2];
F29 = mult[43] * table[3];
F37 = mult[44] * table[4];
F45 = mult[45] * table[5];
F53 = mult[46] * table[6];
F61 = mult[47] * table[7];
mult += 8;
table = &(fftTable[input[6] << 3]);
F6 = mult[0] * table[0];
F14 = mult[1] * table[1];
F22 = mult[2] * table[2];
F30 = mult[3] * table[3];
F38 = mult[4] * table[4];
F46 = mult[5] * table[5];
F54 = mult[6] * table[6];
F62 = mult[7] * table[7];
F6 = mult[48] * table[0];
F14 = mult[49] * table[1];
F22 = mult[50] * table[2];
F30 = mult[51] * table[3];
F38 = mult[52] * table[4];
F46 = mult[53] * table[5];
F54 = mult[54] * table[6];
F62 = mult[55] * table[7];
mult += 8;
table = &(fftTable[input[7] << 3]);
F7 = mult[0] * table[0];
F15 = mult[1] * table[1];
F23 = mult[2] * table[2];
F31 = mult[3] * table[3];
F39 = mult[4] * table[4];
F47 = mult[5] * table[5];
F55 = mult[6] * table[6];
F63 = mult[7] * table[7];
F7 = mult[56] * table[0];
F15 = mult[57] * table[1];
F23 = mult[58] * table[2];
F31 = mult[59] * table[3];
F39 = mult[60] * table[4];
F47 = mult[61] * table[5];
F55 = mult[62] * table[6];
F63 = mult[63] * table[7];
#define ADD_SUB( a, b ) \
{ \
int temp = b; \
b = a - b; \
a = a + temp; \
}
#define Q_REDUCE( a ) \
( ( (a) & 0xff ) - ( (a) >> 8 ) )
/*
for ( int i = 0; i < 8; i++ )
@@ -800,7 +995,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
}
*/
// Second loop unrolling:
// Iteration 0:
ADD_SUB(F0, F1);
@@ -1057,6 +1251,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
output[47] = Q_REDUCE(F61);
output[55] = Q_REDUCE(F62);
output[63] = Q_REDUCE(F63);
#undef ADD_SUB
#undef Q_REDUCE
#endif // AVX2 elif SSE4.1 else
}
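In all three FFT paths each input byte selects a precomputed 8-entry row of fftTable (<< 3 is x8), ADD_SUB is the radix-2 butterfly, and the left shifts are the twiddle multiplications: 2^8 = 256 == -1 (mod 257), so 2 is a 16th root of unity in the SWIFFT field and shifting left by k multiplies by 2^k. Q_REDUCE then folds each lane back into a small signed residue using 256 == -1 (mod 257). A minimal standalone sketch (not part of the diff; an arithmetic right shift is assumed, matching _mm_srai_epi32) that checks the congruence:

#include <stdio.h>
#include <stdint.h>

static int32_t q_reduce( int32_t a ) { return ( a & 0xff ) - ( a >> 8 ); }

// Normalize a residue to [0, 257) for comparison.
static int32_t mod257( int32_t a )   { return ( ( a % 257 ) + 257 ) % 257; }

int main(void)
{
   for ( int32_t a = -(1 << 20); a < (1 << 20); a++ )
      if ( mod257( q_reduce( a ) ) != mod257( a ) )
      {
         printf( "mismatch at %d\n", a );
         return 1;
      }
   printf( "Q_REDUCE is congruent mod 257 over the test range\n" );
   return 0;
}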
// Calculates the FFT part of SWIFFT.
@@ -1086,24 +1285,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output)
// - m: the input size divided by 64.
// - output: will store the result.
// - a: the coefficients in the sum. Of size 64 * m.
void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a)
void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
const swift_int16_t *a )
{
int i, j;
swift_int32_t result[N];
swift_int32_t result[N] __attribute__ ((aligned (64)));
register swift_int16_t carry = 0;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
__m512i *res = (__m512i*)result;
for ( j = 0; j < N/16; ++j )
{
__m512i sum = _mm512_setzero_si512();
const __m512i *f = (__m512i*)input + j;
const __m512i *k = (__m512i*)a + j;
for ( i = 0; i < m; i++, f += N/16, k += N/16 )
sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) );
res[j] = sum;
}
#elif defined(__AVX2__)
__m256i *res = (__m256i*)result;
for ( j = 0; j < N/8; ++j )
{
__m256i sum = _mm256_setzero_si256();
const __m256i *f = (__m256i*)input + j;
const __m256i *k = (__m256i*)a + j;
for ( i = 0; i < m; i++, f += N/8, k += N/8 )
sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) );
res[j] = sum;
}
#elif defined(__SSE4_1__)
__m128i *res = (__m128i*)result;
for ( j = 0; j < N/4; ++j )
{
__m128i sum = _mm_setzero_si128();
const __m128i *f = (__m128i*)input + j;
const __m128i *k = (__m128i*)a + j;
for ( i = 0; i < m; i++, f += N/4, k += N/4 )
sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) );
res[j] = sum;
}
#else
for (j = 0; j < N; ++j)
{
register swift_int32_t sum = 0;
const register swift_int32_t *f = input + j;
const register swift_int16_t *k = a + j;
for (i = 0; i < m; i++, f += N,k += N)
sum += (*f) * (*k);
result[j] = sum;
}
#endif
for (j = 0; j < N; ++j)
result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE;
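This normalization guards against C's signed remainder: Q_REDUCE leaves small signed residues, so result[j] may be negative, and '%' in C takes the sign of the dividend. Adding (FIELD_SIZE << 22), a large multiple of the modulus, shifts the dividend positive before reducing. A minimal sketch (not part of the diff; FIELD_SIZE assumed to be the SWIFFT modulus 257):

#include <stdio.h>
#define FIELD_SIZE 257

int main(void)
{
   int x = -300;
   // C remainder keeps the dividend's sign: prints -43.
   printf( "%d\n", x % FIELD_SIZE );
   // Shifted positive first: prints 214, the canonical residue in [0, 257).
   printf( "%d\n", ( ( FIELD_SIZE << 22 ) + x ) % FIELD_SIZE );
   return 0;
}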
@@ -1122,8 +1363,8 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
{
int i;
// Will store the result of the FFT parts:
swift_int32_t fftOut[N * M];
unsigned char intermediate[N * 3 + 8];
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
unsigned char carry0,carry1,carry2;
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
@@ -1199,8 +1440,8 @@ void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
{
int i;
// Will store the result of the FFT parts:
swift_int32_t fftOut[N * M];
unsigned char intermediate[N * 3 + 8];
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
unsigned char carry0,carry1,carry2;
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.18.0'
PACKAGE_STRING='cpuminer-opt 3.18.0'
PACKAGE_VERSION='3.18.2'
PACKAGE_STRING='cpuminer-opt 3.18.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.18.0
cpuminer-opt configure 3.18.2
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.18.0, which was
It was created by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.18.0'
VERSION='3.18.2'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.18.0, which was
This file was extended by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.18.0
cpuminer-opt config.status 3.18.2
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.18.0])
AC_INIT([cpuminer-opt], [3.18.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1112,19 +1112,17 @@ void report_summary_log( bool force )
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, (double)submitted_share_count*60. /
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) );
submit_rate, safe_div( (double)submitted_share_count*60.,
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
if ( accepted_share_count < submitted_share_count )
{
double lost_ghrate = uptime.tv_sec == 0 ? 0.
: target_diff
* (double)(submitted_share_count - accepted_share_count )
/ (double)uptime.tv_sec;
double lost_shrate = share_time == 0. ? 0.
: target_diff * (double)(submits - accepts ) / share_time;
double lost_ghrate = safe_div( target_diff
* (double)(submitted_share_count - accepted_share_count ),
(double)uptime.tv_sec, 0. );
double lost_shrate = safe_div( target_diff * (double)(submits - accepts ),
                               share_time, 0. );
char lshr_units[4] = {0};
char lghr_units[4] = {0};
scale_hash_for_display( &lost_shrate, lshr_units );
@@ -2083,7 +2081,8 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
/ ( opt_target_factor * opt_diff_factor );
diff_to_hash( g_work->target, g_work->targetdiff );
// Increment extranonce2
// Pre-increment extranonce2 in case this is called again before a new
// job arrives.
for ( int t = 0;
t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] );
t++ );
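This loop is a byte-wise little-endian increment: ++ wraps a byte from 0xff to 0x00 exactly when a carry must propagate, and !( ++... ) keeps the loop running only in that case. A minimal sketch with a hypothetical 4-byte counter (not the miner's actual structures):

#include <stdio.h>

int main(void)
{
   unsigned char xnonce2[4] = { 0xff, 0xff, 0x01, 0x00 };  // LE value 0x0001ffff
   const int xnonce2_size = 4;
   for ( int t = 0;
         t < xnonce2_size && !( ++xnonce2[t] );
         t++ );
   // Carry ripples through the two 0xff bytes: LE value is now 0x00020000.
   printf( "%02x %02x %02x %02x\n",
           xnonce2[0], xnonce2[1], xnonce2[2], xnonce2[3] );
   return 0;
}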
@@ -2103,20 +2102,12 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
pthread_mutex_unlock( &stats_lock );
if ( !opt_quiet )
{
int mismatch = submitted_share_count
- ( accepted_share_count + stale_share_count + rejected_share_count );
if ( mismatch )
applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count );
}
if ( stratum_diff != sctx->job.diff )
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
sctx->job.diff, sctx->block_height, g_work->job_id );
else if ( last_block_height != sctx->block_height )
applog( LOG_BLUE, "New Block %d, Job %s",
sctx->block_height, g_work->job_id );
applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
else if ( g_work->job_id && new_job )
applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
@@ -2173,7 +2164,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
{
double net_hr = nd / net_ttf;
char net_hr_units[4] = {0};
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
net_hr, net_hr_units );
@@ -2182,6 +2172,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
} // hr > 0
} // !quiet
} // new diff/block
if ( new_job && !opt_quiet )
{
int mismatch = submitted_share_count - ( accepted_share_count
+ stale_share_count
+ rejected_share_count );
if ( mismatch )
applog( LOG_INFO,
CL_LBL "%d Submitted share pending, maybe stale" CL_N,
submitted_share_count );
}
}
static void *miner_thread( void *userdata )
@@ -2492,18 +2493,21 @@ static void *miner_thread( void *userdata )
timeval_subtract( &uptime, &total_hashes_time, &session_start );
double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. );
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
if ( hashrate > 0. )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
#else
float lo_freq = 0., hi_freq = 0.;
linux_cpu_hilo_freq( &lo_freq, &hi_freq );
applog( LOG_NOTICE,
float lo_freq = 0., hi_freq = 0.;
linux_cpu_hilo_freq( &lo_freq, &hi_freq );
applog( LOG_NOTICE,
"Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz",
hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6,
hi_freq / 1e6 );
#endif
}
}
} // benchmark
@@ -2897,6 +2901,7 @@ static bool cpu_capability( bool display_only )
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
@@ -2904,6 +2909,8 @@ static bool cpu_capability( bool display_only )
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes;
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_sha;
@@ -2973,18 +2980,21 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha ) printf( " SHA" );
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
if ( !display_only )
{
if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ||
algo_has_vaes256 ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ||
algo_has_vaes256 ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
}
}
printf("\n");
@@ -3019,6 +3029,8 @@ static bool cpu_capability( bool display_only )
// Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
@@ -3035,6 +3047,8 @@ static bool cpu_capability( bool display_only )
{
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
else if ( use_sse42 ) printf( " SSE42" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );
@@ -3970,6 +3984,7 @@ int main(int argc, char *argv[])
gettimeofday( &last_submit_time, NULL );
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) );
pthread_mutex_unlock( &stats_lock );
applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm",

miner.h
View File

@@ -868,9 +868,9 @@ Options:\n\
yespowerr16 Yenten (YTN)\n\
yespower-b2b generic yespower + blake2b\n\
zr5 Ziftr\n\
-N, --param-n N parameter for scrypt based algos\n\
-R, --param-r R parameter for scrypt based algos\n\
-K, --param-key Key (pers) parameter for algos that use it\n\
-N, --param-n=N N parameter for scrypt based algos\n\
-R, --param-r=N R parameter for scrypt based algos\n\
-K, --param-key=STRING Key (pers) parameter for algos that use it\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
-u, --user=USERNAME username for mining server\n\
@@ -886,8 +886,8 @@ Options:\n\
-s, --scantime=N upper bound on time spent scanning current work when\n\
long polling is unavailable, in seconds (default: 5)\n\
--randomize Randomize scan range start to reduce duplicates\n\
-f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\
-f, --diff-factor=N Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\
--hash-meter Display thread hash rates\n\
--coinbase-addr=ADDR payout address for solo mining\n\
--coinbase-sig=TEXT data to insert in the coinbase when possible\n\
@@ -895,9 +895,9 @@ Options:\n\
--no-getwork disable getwork support\n\
--no-gbt disable getblocktemplate support\n\
--no-stratum disable X-Stratum support\n\
--no-extranonce disable Stratum extranonce support\n\
--no-extranonce disable Stratum extranonce subscribe\n\
--no-redirect ignore requests to change the URL of the mining server\n\
-q, --quiet disable per-thread hashmeter output\n\
-q, --quiet reduce log verbosity\n\
--no-color disable colored output\n\
-D, --debug enable debug output\n\
-P, --protocol-dump verbose dump of protocol-level activities\n"
@@ -916,9 +916,9 @@ Options:\n\
--max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
--max-diff=N Only mine if net difficulty is less than specified value\n\
-c, --config=FILE load a JSON-format configuration file\n\
--data-file path and name of data file\n\
--data-file=FILE path and name of data file\n\
--verify enable additional time consuming start up tests\n\
-V, --version display version information and exit\n\
-V, --version display version and CPU information and exit\n\
-h, --help display this help text and exit\n\
";

View File

@@ -1,7 +1,7 @@
#if !defined(SIMD_256_H__)
#define SIMD_256_H__ 1
#if defined(__AVX2__)
//#if defined(__AVX2__)
/////////////////////////////////////////////////////////////////////
//
@@ -14,7 +14,9 @@
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
// Used instead if casting.
#if defined(__AVX__)
// Used instead of casting.
typedef union
{
__m256i m256;
@@ -23,6 +25,28 @@ typedef union
uint32_t u32[8];
} __attribute__ ((aligned (32))) m256_ovly;
//
// Pointer casting
// p = any aligned pointer
// returns p as pointer to vector type, not very useful
#define castp_m256i(p) ((__m256i*)(p))
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m256i(p) (*((__m256i*)(p)))
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
#endif
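Hoisting these helpers under __AVX__ lets AVX-only builds use them, since they only cast pointers and need no AVX2 integer instructions. A minimal usage sketch (hypothetical buffer; compile with -mavx; the buffer is assumed 32-byte aligned):

#include <immintrin.h>

#define casti_m256i(p,i) (((__m256i*)(p))[(i)])

// Zero a 64 byte, 32 byte aligned buffer two __m256i lanes at a time.
static void zero64( void *buf )
{
   casti_m256i( buf, 0 ) = _mm256_setzero_si256();
   casti_m256i( buf, 1 ) = _mm256_setzero_si256();
}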
#if defined(__AVX2__)
// Move integer to low element of vector, other elements are set to zero.
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
@@ -91,26 +115,6 @@ static inline __m256i mm256_neg1_fn()
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
//
// Pointer casting
// p = any aligned pointer
// returns p as pointer to vector type, not very useful
#define castp_m256i(p) ((__m256i*)(p))
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m256i(p) (*((__m256i*)(p)))
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
//
// Memory functions
// n = number of 256 bit (32 byte) vectors

View File

@@ -535,7 +535,6 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
// Rotate 256 bit lanes by one 64 bit element
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
// Rotate 256 bit lanes by one 32 bit element
@@ -611,9 +610,6 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
// shufl2r is 2 input ...
// Drop macros? They can easily be rebuilt using shufl2 functions
// add shuflr shufll functions performing rotate, returning first arg
// They're faster than doing both, when both are not needed.
// Shuffle concatenated { v1, v2 } right or left by 256 bits and return
// rotated v1
// visually confusing for shufl2r because of arg order. First arg is always

View File

@@ -2,22 +2,21 @@
#define SIMD_INT_H__ 1
// Endian byte swap
#define bswap_64( a ) __builtin_bswap64( a )
#define bswap_32( a ) __builtin_bswap32( a )
#define bswap_64 __builtin_bswap64
#define bswap_32 __builtin_bswap32
// Bit rotation
#define rol64 __rolq
#define ror64 __rorq
#define rol32 __rold
#define ror32 __rord
// Safe division, integer or floating point. For floating point it's as
// safe as 0. is precisely zero.
// Returns safe_result if division by zero.
// safe as 0 is precisely zero.
// Returns safe_result if division by zero, typically zero.
#define safe_div( dividend, divisor, safe_result ) \
( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )
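A minimal sketch of the degenerate case safe_div guards against, as used in the periodic-report fixes earlier in this diff:

#include <stdio.h>

#define safe_div( dividend, divisor, safe_result ) \
   ( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )

int main(void)
{
   double total_hashes = 1.5e9;
   long   uptime_sec   = 0;   // first report before any time has elapsed
   // Yields 0.00 rather than inf from a raw floating point division.
   printf( "%.2f h/s\n", safe_div( total_hashes, uptime_sec, 0. ) );
   return 0;
}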
// Aliases with familiar names for built in bit rotate instructions
#define rol64( a, n ) _lrotl( a, n )
#define ror64( a, n ) _lrotr( a, n )
#define rol32( a, n ) _rotl( a, n )
#define ror32( a, n ) _rotr( a, n )
#define rol16( a, n ) _rotwl( a, n )
#define ror16( a, n ) _rotwr( a, n )
///////////////////////////////////////
//