mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
1a234cbe53 | ||
![]() |
47cc5dcff5 |
@@ -32,14 +32,26 @@ but different package names.
|
||||
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
|
||||
|
||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
||||
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
|
||||
support depending on your CPU and compiler version:
|
||||
openssl 1.1.0e or higher.
|
||||
|
||||
"-march=native" is always the best choice
|
||||
znver1 and znver2 should be recognized on most recent version of GCC and
|
||||
znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
|
||||
In the meantime here are some suggestions to compile with new CPUs:
|
||||
|
||||
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
|
||||
"-march=native" is usually the best choice, used by build.sh.
|
||||
|
||||
"-msha" Add SHA to other tuning options
|
||||
"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recongized.
|
||||
|
||||
"-mcascadelake -msha" or
|
||||
"-mcometlake -mavx512 -msha" can be used for Rocket Lake.
|
||||
|
||||
Features can also be added individually:
|
||||
|
||||
"-msha" adds support for HW accelerated sha256.
|
||||
|
||||
"-mavx512" adds support for 512 bit vectors
|
||||
|
||||
"-mvaes" add support for parallel AES
|
||||
|
||||
Additional instructions for static compilalation can be found here:
|
||||
https://lxadm.com/Static_compilation_of_cpuminer
|
||||
|
@@ -171,6 +171,7 @@ cpuminer_SOURCES = \
|
||||
algo/sha/hmac-sha256-hash-4way.c \
|
||||
algo/sha/sha256d.c \
|
||||
algo/sha/sha2.c \
|
||||
algo/sha/sha256d-4way.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
algo/sha/sha256t.c \
|
||||
|
@@ -65,10 +65,38 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.8.2
|
||||
|
||||
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
|
||||
|
||||
AVX512 for sha256d.
|
||||
|
||||
SSE42 and AVX may now be displayed as mining features at startup.
|
||||
This is hard coded for each algo, and is only implemented for scrypt
|
||||
at this time as it is the only algo with significant performance differences
|
||||
with those features.
|
||||
|
||||
Fixed an issue where a high hashrate algo could cause excessive invalid hash
|
||||
rate log reports when starting up in benchmark mode.
|
||||
|
||||
v3.18.1
|
||||
|
||||
More speed for scrypt:
|
||||
- additional scryptn2 optimizations for all CPU architectures,
|
||||
- AVX2 is now used by default on CPUS with SHA but not AVX512,
|
||||
- scrypt:1024 performance lost in v3.18.0 is restored,
|
||||
- AVX512 & AVX2 improvements to scrypt:1024.
|
||||
|
||||
Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
|
||||
|
||||
Issue #337: fixed a problem that could display negative stats values in the
|
||||
first summary report if the report was forced prematurely due to a stratum
|
||||
diff change. The stats will still be invalid but should display zeros.
|
||||
|
||||
v3.18.0
|
||||
|
||||
Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
|
||||
- AVX512 & SHA support for SHA256, AVX512 has priority,
|
||||
- AVX512 & SHA support for sha256, AVX512 has priority,
|
||||
- up to 50% increase in hashrate,
|
||||
- memory requirements reduced 30-60% depending on CPU architecture,
|
||||
- memory usage displayed at startup,
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -28,7 +28,6 @@
|
||||
*/
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
@@ -55,11 +54,25 @@ static const uint32_t sha256_initial_state[8] =
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
static int scrypt_throughput = 0;
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define SCRYPT_THROUGHPUT 16
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
#define SCRYPT_THROUGHPUT 8
|
||||
|
||||
#else
|
||||
|
||||
#define SCRYPT_THROUGHPUT 4
|
||||
|
||||
#endif
|
||||
|
||||
// static int scrypt_throughput = 0;
|
||||
|
||||
static int scratchbuf_size = 0;
|
||||
|
||||
static __thread char *scratchbuf = NULL;
|
||||
static __thread uint32_t *scratchbuf = NULL;
|
||||
|
||||
// change this to a constant to be used directly as input state arg
|
||||
// vectors still need an init function.
|
||||
@@ -146,6 +159,119 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
|
||||
output[i] = bswap_32( ostate[i] );
|
||||
}
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
|
||||
const uint32_t *key1, uint32_t *tstate0, uint32_t *tstate1,
|
||||
uint32_t *ostate0, uint32_t *ostate1 )
|
||||
{
|
||||
uint32_t ihash0[8], ihash1[8], pad0[16], pad1[16];
|
||||
int i;
|
||||
|
||||
memcpy( pad0, key0 + 16, 16 );
|
||||
memcpy( pad0 + 4, keypad, 48 );
|
||||
memcpy( pad1, key1 + 16, 16 );
|
||||
memcpy( pad1 + 4, keypad, 48 );
|
||||
|
||||
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
|
||||
tstate0, tstate1 );
|
||||
|
||||
memcpy( ihash0, tstate0, 32 );
|
||||
memcpy( ihash1, tstate1, 32 );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
{
|
||||
pad0[i] = ihash0[i] ^ 0x5c5c5c5c;
|
||||
pad1[i] = ihash1[i] ^ 0x5c5c5c5c;
|
||||
}
|
||||
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c;
|
||||
|
||||
sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1,
|
||||
sha256_initial_state, sha256_initial_state );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
{
|
||||
pad0[i] = ihash0[i] ^ 0x36363636;
|
||||
pad1[i] = ihash1[i] ^ 0x36363636;
|
||||
}
|
||||
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636;
|
||||
|
||||
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
|
||||
sha256_initial_state, sha256_initial_state );
|
||||
}
|
||||
|
||||
static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0,
|
||||
const uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1,
|
||||
const uint32_t *salt0, const uint32_t *salt1, uint32_t *output0,
|
||||
uint32_t *output1 )
|
||||
{
|
||||
uint32_t istate0[8], istate1[8], ostateb0[8], ostateb1[8];
|
||||
uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16];
|
||||
int i, j;
|
||||
|
||||
sha256_ni2way_transform_le( istate0, istate1, salt0, salt1,
|
||||
tstate0, tstate1 );
|
||||
|
||||
memcpy( ibuf0, salt0 + 16, 16 );
|
||||
memcpy( ibuf0 + 5, innerpad, 44 );
|
||||
memcpy( obuf0 + 8, outerpad, 32 );
|
||||
memcpy( ibuf1, salt1 + 16, 16 );
|
||||
memcpy( ibuf1 + 5, innerpad, 44 );
|
||||
memcpy( obuf1 + 8, outerpad, 32 );
|
||||
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
memcpy( obuf0, istate0, 32 );
|
||||
memcpy( obuf1, istate1, 32 );
|
||||
ibuf0[4] = ibuf1[4] = i + 1;
|
||||
|
||||
sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1,
|
||||
obuf0, obuf1 );
|
||||
sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1,
|
||||
ostate0, ostate1 );
|
||||
|
||||
for ( j = 0; j < 8; j++ )
|
||||
{
|
||||
output0[ 8*i + j ] = bswap_32( ostateb0[j] );
|
||||
output1[ 8*i + j ] = bswap_32( ostateb1[j] );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
|
||||
uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1,
|
||||
const uint32_t *salt0, const uint32_t *salt1,
|
||||
uint32_t *output0, uint32_t *output1 )
|
||||
{
|
||||
uint32_t buf0[16], buf1[16];
|
||||
int i;
|
||||
|
||||
sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1,
|
||||
tstate0, tstate1 );
|
||||
sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16,
|
||||
tstate0, tstate1 );
|
||||
sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk,
|
||||
tstate0, tstate1 );
|
||||
|
||||
memcpy( buf0, tstate0, 32 );
|
||||
memcpy( buf0 + 8, outerpad, 32 );
|
||||
memcpy( buf1, tstate1, 32 );
|
||||
memcpy( buf1 + 8, outerpad, 32 );
|
||||
|
||||
sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1,
|
||||
ostate0, ostate1 );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
{
|
||||
output0[i] = bswap_32( ostate0[i] );
|
||||
output1[i] = bswap_32( ostate1[i] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SHA256_4WAY
|
||||
|
||||
static const uint32_t keypad_4way[4 * 12] = {
|
||||
@@ -596,15 +722,11 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
//#if defined(USE_ASM) && defined(__x86_64__)
|
||||
|
||||
#define SCRYPT_MAX_WAYS 12
|
||||
#define HAVE_SCRYPT_3WAY 1
|
||||
//int scrypt_best_throughput();
|
||||
void scrypt_core(uint32_t *X, uint32_t *V, int N);
|
||||
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
|
||||
|
||||
//#if defined(USE_AVX2)
|
||||
#if defined(__AVX2__)
|
||||
#undef SCRYPT_MAX_WAYS
|
||||
#define SCRYPT_MAX_WAYS 24
|
||||
@@ -614,40 +736,39 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
|
||||
|
||||
#ifndef SCRYPT_MAX_WAYS
|
||||
#define SCRYPT_MAX_WAYS 1
|
||||
//#define scrypt_best_throughput() 1
|
||||
#endif
|
||||
|
||||
#include "scrypt-core-4way.h"
|
||||
|
||||
static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output,
|
||||
uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id )
|
||||
/*
|
||||
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
|
||||
uint32_t *midstate, int N, int thr_id )
|
||||
{
|
||||
uint32_t tstate[8], ostate[8];
|
||||
uint32_t X[32];
|
||||
uint32_t *V = (uint32_t*)scratchpad;
|
||||
|
||||
memcpy(tstate, midstate, 32);
|
||||
HMAC_SHA256_80_init(input, tstate, ostate);
|
||||
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
|
||||
|
||||
scrypt_core_simd128( X, V, N ); // woring
|
||||
scrypt_core_simd128( X, scratchbuf, N ); // woring
|
||||
// scrypt_core_1way( X, V, N ); // working
|
||||
// scrypt_core(X, V, N);
|
||||
|
||||
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if ( SCRYPT_THROUGHPUT == 8 )
|
||||
|
||||
static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
|
||||
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
|
||||
uint32_t *midstate, int N, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) tstate[8 * 8];
|
||||
uint32_t _ALIGN(128) ostate[8 * 8];
|
||||
uint32_t _ALIGN(128) W[8 * 32];
|
||||
uint32_t _ALIGN(128) X[8 * 32];
|
||||
uint32_t *V = (uint32_t*)scratchpad;
|
||||
uint32_t _ALIGN(128) tstate[ 8*8 ];
|
||||
uint32_t _ALIGN(128) ostate[ 8*8 ];
|
||||
uint32_t _ALIGN(128) W[ 8*32 ];
|
||||
uint32_t _ALIGN(128) X[ 8*32 ];
|
||||
|
||||
intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60,
|
||||
input+80, input+100, input+120, input+140, 640 );
|
||||
@@ -658,53 +779,45 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
|
||||
PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W );
|
||||
|
||||
dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 );
|
||||
|
||||
if ( opt_param_n > 0x4000 )
|
||||
{
|
||||
scrypt_core_simd128_3buf( X, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
|
||||
}
|
||||
else
|
||||
{
|
||||
intrlv_2x128( W, X, X+ 32, 1024 );
|
||||
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
|
||||
intrlv_2x128( W+128, X+128, X+160, 1024 );
|
||||
intrlv_2x128( W+192, X+192, X+224, 1024 );
|
||||
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)scratchbuf, N );
|
||||
dintrlv_2x128( X, X+ 32, W, 1024 );
|
||||
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
|
||||
dintrlv_2x128( X+128, X+160, W+128, 1024 );
|
||||
dintrlv_2x128( X+192, X+224, W+192, 1024 );
|
||||
}
|
||||
|
||||
|
||||
|
||||
// SCRYPT CORE
|
||||
|
||||
|
||||
// AVX512
|
||||
|
||||
/*
|
||||
// AVX512 16 way working
|
||||
intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224,
|
||||
X+256, X+256+32, X+256+64, X+256+96, X+256+128,
|
||||
X+256+160, X+256+192, X+256+224, 1024 );
|
||||
|
||||
scrypt_core_16way( (__m512i*)W , (__m512i*)V, N );
|
||||
|
||||
dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224,
|
||||
X+256, X+256+32, X+256+64, X+256+96, X+256+128,
|
||||
X+256+160, X+256+192, X+256+224, W, 1024 );
|
||||
*/
|
||||
/*
|
||||
// AVX512 working
|
||||
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
|
||||
intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
|
||||
scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
|
||||
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
|
||||
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
|
||||
*/
|
||||
/*
|
||||
// AVX512, not working, very slow
|
||||
intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 );
|
||||
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
|
||||
scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
|
||||
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
|
||||
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
|
||||
*/
|
||||
|
||||
// AVX2
|
||||
|
||||
/*
|
||||
|
||||
// AVX2
|
||||
// disable de/interleave for testing.
|
||||
scrypt_core_8way( (__m256i*)W , (__m256i*)V, N );
|
||||
*/
|
||||
// scrypt_core_8way( (__m256i*)W , (__m256i*)V, N );
|
||||
|
||||
|
||||
/*
|
||||
// AVX2 working
|
||||
@@ -714,23 +827,18 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
|
||||
intrlv_2x128( W+192, X+192, X+224, 1024 );
|
||||
|
||||
// working
|
||||
// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N );
|
||||
// scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
|
||||
// scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
|
||||
|
||||
// working
|
||||
scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N );
|
||||
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
|
||||
|
||||
// working
|
||||
// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
|
||||
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
|
||||
|
||||
dintrlv_2x128( X, X+ 32, W, 1024 );
|
||||
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
|
||||
@@ -745,18 +853,10 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
|
||||
intrlv_2x32( W+128, X+128, X+160, 1024 );
|
||||
intrlv_2x32( W+192, X+192, X+224, 1024 );
|
||||
|
||||
// working, deprecated, not up to data
|
||||
// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N );
|
||||
|
||||
// deprecated, not up to date
|
||||
// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N );
|
||||
|
||||
// working
|
||||
// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
|
||||
scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
|
||||
|
||||
// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
@@ -813,19 +913,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+192, V, N );
|
||||
*/
|
||||
|
||||
/**************
|
||||
scrypt_core_simd128_3buf( X, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_3buf( X+ 96, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+192, V, N );
|
||||
|
||||
/*
|
||||
// SSE2 working
|
||||
scrypt_core_simd128_4buf( X, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_4buf( X+128, V, N );
|
||||
*/
|
||||
*************/
|
||||
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
@@ -842,16 +936,15 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if ( SCRYPT_THROUGHPUT == 16 )
|
||||
|
||||
static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
||||
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
|
||||
uint32_t *midstate, int N, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) tstate[ 16*8 ];
|
||||
uint32_t _ALIGN(128) ostate[ 16*8 ];
|
||||
uint32_t _ALIGN(128) W[ 16*32 ];
|
||||
uint32_t _ALIGN(128) X[ 16*32 ];
|
||||
uint32_t *V = (uint32_t*)scratchpad;
|
||||
|
||||
intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60,
|
||||
input+ 80, input+100, input+120, input+140,
|
||||
@@ -868,6 +961,39 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
||||
W, 1024 );
|
||||
|
||||
|
||||
if ( opt_param_n > 0x4000 )
|
||||
{
|
||||
scrypt_core_simd128_3buf( X, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_3buf( X+256, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_3buf( X+352, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+448, scratchbuf, N );
|
||||
}
|
||||
else
|
||||
{
|
||||
intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 );
|
||||
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
|
||||
intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 );
|
||||
intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 );
|
||||
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)scratchbuf, N );
|
||||
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
|
||||
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
|
||||
dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 );
|
||||
dintrlv_4x128( X+384, X+416, X+448, X+480, W+384, 1024 );
|
||||
}
|
||||
|
||||
// SCRYPT CORE
|
||||
|
||||
|
||||
@@ -888,23 +1014,40 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
||||
// AVX512 working
|
||||
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
|
||||
intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
|
||||
intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
|
||||
intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
|
||||
scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N );
|
||||
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
|
||||
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
|
||||
dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 );
|
||||
dintrlv_4x32( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 );
|
||||
*/
|
||||
/*
|
||||
// AVX512, not working, very slow
|
||||
// AVX512, working
|
||||
intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 );
|
||||
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
|
||||
intrlv_4x128( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
|
||||
intrlv_4x128( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
|
||||
scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_4way_simd128( (__m512i*)(W+256+128), (__m512i*)V, N );
|
||||
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
|
||||
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
|
||||
dintrlv_4x128( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 );
|
||||
dintrlv_4x128( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 );
|
||||
*/
|
||||
|
||||
|
||||
// AVX2
|
||||
|
||||
/*
|
||||
@@ -919,16 +1062,19 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
||||
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
|
||||
intrlv_2x128( W+128, X+128, X+160, 1024 );
|
||||
intrlv_2x128( W+192, X+192, X+224, 1024 );
|
||||
|
||||
// working
|
||||
// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
|
||||
intrlv_2x128( W+256, X+256, X+256+ 32, 1024 );
|
||||
intrlv_2x128( W+256+ 64, X+256+ 64, X+256+ 96, 1024 );
|
||||
intrlv_2x128( W+256+128, X+256+128, X+256+160, 1024 );
|
||||
intrlv_2x128( W+256+192, X+256+192, X+256+224, 1024 );
|
||||
|
||||
// working
|
||||
scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_2way_simd128_2buf( (__m256i*)(W+256), (__m256i*)V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_2way_simd128_2buf( (__m256i*)(W+256+128), (__m256i*)V, N );
|
||||
|
||||
// working
|
||||
// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
|
||||
@@ -938,11 +1084,23 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+256), (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+256+ 64), (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+256+128), (__m256i*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_2way_simd128( (__m256i*)(W+256+192), (__m256i*)V, N );
|
||||
|
||||
dintrlv_2x128( X, X+ 32, W, 1024 );
|
||||
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
|
||||
dintrlv_2x128( X+128, X+160, W+128, 1024 );
|
||||
dintrlv_2x128( X+192, X+224, W+192, 1024 );
|
||||
dintrlv_2x128( X+256, X+256+ 32, W+256, 1024 );
|
||||
dintrlv_2x128( X+256+ 64, X+256+ 96, W+256+ 64, 1024 );
|
||||
dintrlv_2x128( X+256+128, X+256+160, W+256+128, 1024 );
|
||||
dintrlv_2x128( X+256+192, X+256+224, W+256+192, 1024 );
|
||||
*/
|
||||
|
||||
/*
|
||||
@@ -952,18 +1110,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
||||
intrlv_2x32( W+128, X+128, X+160, 1024 );
|
||||
intrlv_2x32( W+192, X+192, X+224, 1024 );
|
||||
|
||||
// working, deprecated, not up to data
|
||||
// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N );
|
||||
|
||||
// deprecated, not up to date
|
||||
// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N );
|
||||
|
||||
// working
|
||||
// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
|
||||
// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
|
||||
|
||||
// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N );
|
||||
// if ( work_restart[thrid].restart ) return 0;
|
||||
@@ -1043,7 +1196,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+448, V, N );
|
||||
*/
|
||||
|
||||
/***************
|
||||
scrypt_core_simd128_3buf( X, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_3buf( X+ 96, V, N );
|
||||
@@ -1055,17 +1208,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
||||
scrypt_core_simd128_3buf( X+352, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+448, V, N );
|
||||
|
||||
/*
|
||||
// SSE2 working
|
||||
scrypt_core_simd128_4buf( X, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_4buf( X+128, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_4buf( X+256, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_4buf( X+384, V, N );
|
||||
*/
|
||||
********************/
|
||||
/*
|
||||
scrypt_core_3way( X, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
@@ -1100,15 +1243,37 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined(__SHA__)
|
||||
#if 0
|
||||
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
|
||||
uint32_t *midstate, int N, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) tstate[ 2*8 ];
|
||||
uint32_t _ALIGN(128) ostate[ 2*8 ];
|
||||
uint32_t _ALIGN(128) W[ 2*32 ];
|
||||
|
||||
memcpy( tstate, midstate, 32 );
|
||||
memcpy( tstate+ 8, midstate, 32 );
|
||||
|
||||
HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8,
|
||||
ostate, ostate+8 );
|
||||
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
|
||||
input, input+20, W, W+32 );
|
||||
|
||||
scrypt_core_simd128_2buf( W, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
|
||||
output, output+8 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
|
||||
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
|
||||
uint32_t *midstate, int N, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) tstate[4 * 8];
|
||||
uint32_t _ALIGN(128) ostate[4 * 8];
|
||||
uint32_t _ALIGN(128) W[4 * 32];
|
||||
uint32_t *V = (uint32_t*)scratchpad;
|
||||
|
||||
memcpy( tstate, midstate, 32 );
|
||||
memcpy( tstate+ 8, midstate, 32 );
|
||||
@@ -1139,9 +1304,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
|
||||
*/
|
||||
|
||||
// working, double buffered linear simd
|
||||
scrypt_core_simd128_2buf( W, V, N );
|
||||
scrypt_core_simd128_2buf( W, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( W+64, V, N );
|
||||
scrypt_core_simd128_2buf( W+64, scratchbuf, N );
|
||||
|
||||
/*
|
||||
scrypt_core_simd128_3buf( W, V, N );
|
||||
@@ -1149,8 +1314,6 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
|
||||
scrypt_core_simd128( W+96, V, N );
|
||||
*/
|
||||
|
||||
// working
|
||||
// scrypt_core_simd128_4buf( W, V, N );
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
@@ -1164,18 +1327,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
|
||||
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#ifdef HAVE_SHA256_4WAY
|
||||
#if ( SCRYPT_THROUGHPUT == 4 )
|
||||
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
|
||||
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
|
||||
uint32_t *midstate, int N, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) tstate[4 * 8];
|
||||
uint32_t _ALIGN(128) ostate[4 * 8];
|
||||
uint32_t _ALIGN(128) W[4 * 32];
|
||||
uint32_t _ALIGN(128) X[4 * 32];
|
||||
uint32_t *V = (uint32_t*)scratchpad;
|
||||
uint32_t _ALIGN(128) tstate[ 4*8 ];
|
||||
uint32_t _ALIGN(128) ostate[ 4*8 ];
|
||||
uint32_t _ALIGN(128) W[ 4*32 ];
|
||||
|
||||
intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
@@ -1184,7 +1344,21 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
|
||||
HMAC_SHA256_80_init_4way(W, tstate, ostate);
|
||||
PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
|
||||
|
||||
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
|
||||
if ( opt_param_n > 0x4000 )
|
||||
{
|
||||
uint32_t _ALIGN(128) X[ 4*32 ];
|
||||
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
|
||||
scrypt_core_simd128_2buf( X, scratchbuf, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+64, scratchbuf, N );
|
||||
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
|
||||
}
|
||||
else
|
||||
scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N );
|
||||
|
||||
|
||||
|
||||
// dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
|
||||
|
||||
////// SCRYPT_CORE
|
||||
|
||||
@@ -1202,35 +1376,23 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128( X+96, V, N );
|
||||
*/
|
||||
|
||||
/*
|
||||
// working, double buffered linear simd, best for n2
|
||||
scrypt_core_simd128_2buf( X, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128_2buf( X+64, V, N );
|
||||
|
||||
*/
|
||||
/*
|
||||
scrypt_core_simd128_3buf( X, V, N );
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
scrypt_core_simd128( X+96, V, N );
|
||||
*/
|
||||
|
||||
// working
|
||||
// scrypt_core_simd128_4buf( X, V, N );
|
||||
|
||||
|
||||
/*
|
||||
// original
|
||||
scrypt_core(X + 0 * 32, V, N);
|
||||
scrypt_core(X + 1 * 32, V, N);
|
||||
scrypt_core(X + 2 * 32, V, N);
|
||||
scrypt_core(X + 3 * 32, V, N);
|
||||
*/
|
||||
|
||||
////////////////////////////////
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
|
||||
// intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
|
||||
|
||||
PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
|
||||
|
||||
@@ -1238,58 +1400,73 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
|
||||
|
||||
return 1;
|
||||
}
|
||||
#endif /* HAVE_SHA256_4WAY */
|
||||
#endif // SCRYPT_THROUGHPUT == 4
|
||||
|
||||
#endif // SHA
|
||||
//#endif // SHA
|
||||
|
||||
extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[ 8*SCRYPT_THROUGHPUT ];
|
||||
uint32_t _ALIGN(64) data[ 20*SCRYPT_THROUGHPUT ];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
|
||||
uint32_t midstate[8];
|
||||
uint32_t n = pdata[19] - 1;
|
||||
uint32_t midstate[8];
|
||||
uint32_t n = pdata[19] - 1;
|
||||
int thr_id = mythr->id;
|
||||
int throughput = scrypt_throughput;
|
||||
int i;
|
||||
int i;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
for ( i = 0; i < throughput; i++ )
|
||||
memcpy( data + i * 20, pdata, 80 );
|
||||
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
|
||||
memcpy( data + i * 20, pdata, 80 );
|
||||
|
||||
sha256_transform_le( midstate, data, sha256_initial_state );
|
||||
|
||||
do {
|
||||
do {
|
||||
bool rc = true;
|
||||
for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n;
|
||||
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) data[ i*20 + 19 ] = ++n;
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
if ( throughput == 16 )
|
||||
rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf,
|
||||
opt_param_n, thr_id );
|
||||
else
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
if ( throughput == 8 )
|
||||
rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf,
|
||||
opt_param_n, thr_id );
|
||||
else
|
||||
#endif
|
||||
if ( throughput == 4 )
|
||||
#if defined(__SHA__)
|
||||
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf,
|
||||
opt_param_n, thr_id );
|
||||
//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if ( SCRYPT_THROUGHPUT == 16 )
|
||||
// if ( SCRYPT_THROUGHPUT == 16 )
|
||||
rc = scrypt_N_1_1_256_16way( data, hash, midstate, opt_param_n,
|
||||
thr_id );
|
||||
// else
|
||||
//#endif
|
||||
//#if defined(__AVX2__)
|
||||
#elif ( SCRYPT_THROUGHPUT == 8 )
|
||||
// if ( SCRYPT_THROUGHPUT == 8 )
|
||||
rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
|
||||
thr_id );
|
||||
// else
|
||||
//#endif
|
||||
#elif ( SCRYPT_THROUGHPUT == 4 )
|
||||
// if ( SCRYPT_THROUGHPUT == 4 ) // slower on Ryzen than 8way
|
||||
//#if defined(__SHA__)
|
||||
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
|
||||
// thr_id );
|
||||
//#else
|
||||
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
|
||||
thr_id );
|
||||
#else
|
||||
rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf,
|
||||
opt_param_n, thr_id );
|
||||
#endif
|
||||
else
|
||||
rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf,
|
||||
opt_param_n, thr_id );
|
||||
|
||||
#error "Invalid SCRYPT_THROUGHPUT"
|
||||
|
||||
#endif
|
||||
/*
|
||||
#if defined(__SHA__)
|
||||
else
|
||||
if ( SCRYPT_THROUGHPUT == 2 ) // slower on Ryzen than 4way_sha & 8way
|
||||
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
|
||||
thr_id );
|
||||
#endif
|
||||
else // should never get here
|
||||
rc = scrypt_N_1_1_256( data, hash, midstate, opt_param_n, thr_id );
|
||||
*/
|
||||
|
||||
// test the hash
|
||||
if ( rc )
|
||||
for ( i = 0; i < throughput; i++ )
|
||||
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
|
||||
{
|
||||
if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
@@ -1301,7 +1478,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
|
||||
|
||||
} while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) );
|
||||
} while ( likely( ( n < ( max_nonce - SCRYPT_THROUGHPUT ) ) && !(*restart) ) );
|
||||
|
||||
*hashes_done = n - pdata[19];
|
||||
pdata[19] = n;
|
||||
@@ -1319,29 +1496,45 @@ bool scrypt_miner_thread_init( int thr_id )
|
||||
|
||||
bool register_scrypt_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(__SHA__)
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
#else
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#endif
|
||||
//#if defined(__SHA__)
|
||||
// gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
//#else
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT;
|
||||
//#endif
|
||||
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_scrypt;
|
||||
opt_target_factor = 65536.0;
|
||||
opt_param_n = opt_param_n ? opt_param_n : 1024;
|
||||
applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n );
|
||||
|
||||
// scrypt_throughput can be defined at compile time and used to replace
|
||||
// MAX_WAYS to reduce memory usage.
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
scrypt_throughput = 16;
|
||||
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
||||
// scrypt_throughput = 16;
|
||||
if ( opt_param_n > 0x4000 )
|
||||
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
||||
else
|
||||
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
|
||||
|
||||
/* SHA is slower than AVX2 on Ryzen
|
||||
#elif defined(__SHA__)
|
||||
scrypt_throughput = 4;
|
||||
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
|
||||
*/
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
scrypt_throughput = 8;
|
||||
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
||||
// scrypt_throughput = 8;
|
||||
if ( opt_param_n > 0x4000 )
|
||||
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
||||
else
|
||||
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
|
||||
#else
|
||||
scrypt_throughput = 4;
|
||||
// scrypt_throughput = 4;
|
||||
if ( opt_param_n > 0x4000 )
|
||||
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
|
||||
else
|
||||
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
|
||||
#endif
|
||||
|
||||
char t_units[4] = {0};
|
||||
@@ -1353,7 +1546,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
|
||||
format_number_si( &d_size, d_units );
|
||||
|
||||
applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
|
||||
scrypt_throughput, t_size, t_units, d_size, d_units );
|
||||
SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );
|
||||
|
||||
return true;
|
||||
};
|
||||
|
@@ -51,7 +51,6 @@ typedef struct {
|
||||
__m128i buf[64>>2];
|
||||
__m128i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
bool initialized;
|
||||
} sha256_4way_context __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256_4way_init( sha256_4way_context *sc );
|
||||
@@ -74,7 +73,6 @@ typedef struct {
|
||||
__m256i buf[64>>2];
|
||||
__m256i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
bool initialized;
|
||||
} sha256_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_8way_init( sha256_8way_context *sc );
|
||||
@@ -86,6 +84,11 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const __m256i *state_mid );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
@@ -96,7 +99,6 @@ typedef struct {
|
||||
__m512i buf[64>>2];
|
||||
__m512i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
bool initialized;
|
||||
} sha256_16way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_16way_init( sha256_16way_context *sc );
|
||||
|
@@ -8,7 +8,7 @@
|
||||
* any later version. See COPYING for more details.
|
||||
*/
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "sha256d-4way.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
@@ -181,6 +181,8 @@ static const uint32_t sha256d_hash1[16] = {
|
||||
};
|
||||
|
||||
// this performs the entire hash all over again, why?
|
||||
// because main function only does 56 rounds.
|
||||
|
||||
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
|
||||
{
|
||||
uint32_t S[16];
|
||||
@@ -492,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
|
||||
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
|
||||
const uint32_t *midstate, const uint32_t *prehash);
|
||||
|
||||
static inline int scanhash_sha256d_4way( struct work *work,
|
||||
static inline int scanhash_sha256d_4way_pooler( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -553,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
|
||||
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
|
||||
const uint32_t *midstate, const uint32_t *prehash);
|
||||
|
||||
static inline int scanhash_sha256d_8way( struct work *work,
|
||||
static inline int scanhash_sha256d_8way_pooler( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -609,7 +611,7 @@ static inline int scanhash_sha256d_8way( struct work *work,
|
||||
|
||||
#endif /* HAVE_SHA256_8WAY */
|
||||
|
||||
int scanhash_sha256d( struct work *work,
|
||||
int scanhash_sha256d_pooler( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -625,11 +627,11 @@ int scanhash_sha256d( struct work *work,
|
||||
|
||||
#ifdef HAVE_SHA256_8WAY
|
||||
if (sha256_use_8way())
|
||||
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr );
|
||||
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
#endif
|
||||
#ifdef HAVE_SHA256_4WAY
|
||||
if (sha256_use_4way())
|
||||
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr );
|
||||
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
#endif
|
||||
|
||||
memcpy(data, pdata + 16, 64);
|
||||
@@ -690,9 +692,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
bool register_sha256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256d;
|
||||
// gate->hash = (void*)&sha256d;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256D_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_sha256d_pooler;
|
||||
#endif
|
||||
// gate->hash = (void*)&sha256d;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -53,4 +53,8 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||
#define sha256_transform_be sph_sha256_transform_be
|
||||
|
||||
#endif
|
||||
|
||||
// SHA can't do only 3 rounds
|
||||
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
|
||||
|
||||
#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "sha256t-gate.h"
|
||||
#include "sha256d-4way.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -13,7 +13,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||
@@ -46,11 +46,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 byte block of data
|
||||
sha256_16way_transform_le( midstate, vdata, initstate );
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -59,7 +58,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
|
||||
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
@@ -99,7 +98,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i block[16] __attribute__ ((aligned (64)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
@@ -116,7 +116,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
@@ -130,8 +130,10 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_8way_transform_le( midstate, vdata, initstate );
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -140,7 +142,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_transform_le( hash32, block, midstate );
|
||||
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
@@ -253,3 +255,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
bool register_sha256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256D_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||
#elif defined(SHA256D_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_8way;
|
||||
#elif defined(SHA256D_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_4way;
|
||||
#endif
|
||||
|
||||
// gate->hash = (void*)&sha256d;
|
||||
return true;
|
||||
};
|
||||
*/
|
||||
|
||||
|
48
algo/sha/sha256d-4way.h
Normal file
48
algo/sha/sha256d-4way.h
Normal file
@@ -0,0 +1,48 @@
|
||||
#ifndef __SHA256D_4WAY_H__
|
||||
#define __SHA256D_4WAY_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256D_16WAY 1
|
||||
/*
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA256D_8WAY 1
|
||||
#else
|
||||
#define SHA256D_4WAY 1
|
||||
*/
|
||||
#endif
|
||||
|
||||
bool register_sha256d_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(SHA256D_16WAY)
|
||||
|
||||
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
/*
|
||||
#if defined(SHA256D_8WAY)
|
||||
|
||||
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_4WAY)
|
||||
|
||||
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
*/
|
||||
|
||||
/*
|
||||
#if defined(__SHA__)
|
||||
|
||||
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
@@ -13,7 +13,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||
@@ -31,7 +31,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m512_const1_32( pdata[i] );
|
||||
vdata[i] = m512_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
@@ -46,11 +46,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 byte block of data
|
||||
sha256_16way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -59,7 +58,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
|
||||
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
@@ -104,7 +103,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i block[16] __attribute__ ((aligned (64)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
@@ -121,7 +121,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
@@ -135,9 +135,11 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_8way_transform_le( midstate, vdata, initstate );
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
@@ -145,7 +147,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_transform_le( hash32, block, midstate );
|
||||
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
|
@@ -702,6 +702,36 @@ memcpy( state_out, state_in, 32 );
|
||||
|
||||
}
|
||||
|
||||
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
|
||||
memcpy( state_out, state_in, 32 );
|
||||
|
||||
t1 = state_out[7] + BSG2_1( state_out[4] )
|
||||
+ CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
|
||||
t2 = BSG2_0( state_out[0] )
|
||||
+ MAJ( state_out[0], state_out[1], state_out[2] );
|
||||
Y_xor_Z = X_xor_Y;
|
||||
state_out[3] += t1;
|
||||
state_out[7] = t1 + t2;
|
||||
|
||||
t1 = state_out[6] + BSG2_1( state_out[3] )
|
||||
+ CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
|
||||
t2 = BSG2_0( state_out[7] )
|
||||
+ MAJ( state_out[7], state_out[0], state_out[1] );
|
||||
Y_xor_Z = X_xor_Y;
|
||||
state_out[2] += t1;
|
||||
state_out[6] = t1 + t2;
|
||||
|
||||
t1 = state_out[5] + BSG2_1( state_out[2] )
|
||||
+ CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
|
||||
t2 = BSG2_0( state_out[6] )
|
||||
+ MAJ( state_out[6], state_out[7], state_out[0] );
|
||||
state_out[1] += t1;
|
||||
state_out[5] = t1 + t2;
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
sph_sha224_init(void *cc)
|
||||
|
@@ -215,6 +215,9 @@ void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
|
||||
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
|
||||
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
|
||||
const uint32_t *state_in );
|
||||
|
||||
|
||||
#if SPH_64
|
||||
|
||||
|
@@ -70,6 +70,8 @@ extern "C"{
|
||||
C8, C9, CA, CB, CC, CD, CE, CF; \
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
|
||||
M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
const __m256i FIVE = _mm256_set1_epi32( 5 ); \
|
||||
const __m256i THREE = _mm256_set1_epi32( 3 ); \
|
||||
sph_u32 Wlow, Whigh;
|
||||
|
||||
#define READ_STATE8(state) do \
|
||||
@@ -314,8 +316,7 @@ do { \
|
||||
_mm256_andnot_si256( xb3, xb2 ), \
|
||||
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
|
||||
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
|
||||
_mm256_set1_epi32(5UL) ) ), \
|
||||
_mm256_set1_epi32(3UL) ) ) ); \
|
||||
FIVE ) ), THREE ) ) ); \
|
||||
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
|
||||
} while (0)
|
||||
|
||||
@@ -667,7 +668,9 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
C8, C9, CA, CB, CC, CD, CE, CF; \
|
||||
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
|
||||
M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
sph_u32 Wlow, Whigh;
|
||||
const __m128i FIVE = _mm_set1_epi32( 5 ); \
|
||||
const __m128i THREE = _mm_set1_epi32( 3 ); \
|
||||
sph_u32 Wlow, Whigh;
|
||||
|
||||
#define READ_STATE(state) do \
|
||||
{ \
|
||||
@@ -931,8 +934,8 @@ do { \
|
||||
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
|
||||
_mm_andnot_si128( xb3, xb2 ), \
|
||||
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
|
||||
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
|
||||
) ), _mm_set1_epi32(3UL) ) ) ) ); \
|
||||
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \
|
||||
) ), THREE ) ) ) ); \
|
||||
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
|
||||
} while (0)
|
||||
|
||||
|
@@ -18,16 +18,20 @@
|
||||
#ifndef __INTTYPES_H_
|
||||
#define __INTTYPES_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Use [u]intN_t if you need exactly N bits.
|
||||
XXX - doesn't handle the -mint8 option. */
|
||||
|
||||
typedef signed char swift_int8_t;
|
||||
typedef unsigned char swift_uint8_t;
|
||||
|
||||
typedef int swift_int16_t;
|
||||
typedef int32_t swift_int16_t;
|
||||
// typedef int swift_int16_t;
|
||||
typedef unsigned int swift_uint16_t;
|
||||
|
||||
typedef long swift_int32_t;
|
||||
typedef int32_t swift_int32_t;
|
||||
// typedef long swift_int32_t;
|
||||
typedef unsigned long swift_uint32_t;
|
||||
|
||||
typedef long long swift_int64_t;
|
||||
|
@@ -18,6 +18,8 @@
|
||||
//#include "stdbool.h"
|
||||
#include <memory.h>
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Constants and static tables portion.
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -49,20 +51,20 @@
|
||||
// - A: the first operand. After the operation stores the sum of the two operands.
|
||||
// - B: the second operand. After the operation stores the difference between the first and the
|
||||
// second operands.
|
||||
#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
|
||||
//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
|
||||
|
||||
// Quickly reduces an integer modulo 257.
|
||||
//
|
||||
// Parameters:
|
||||
// - A: the input.
|
||||
#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
|
||||
//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
|
||||
|
||||
// Since we need to do the setup only once, this is the indicator variable:
|
||||
static bool wasSetupDone = false;
|
||||
|
||||
// This array stores the powers of omegas that correspond to the indices, which are the input
|
||||
// values. Known also as the "outer FFT twiddle factors".
|
||||
swift_int16_t multipliers[N];
|
||||
swift_int16_t multipliers[N] __attribute__ ((aligned (64)));
|
||||
|
||||
// This array stores the powers of omegas, multiplied by the corresponding values.
|
||||
// We store this table to save computation time.
|
||||
@@ -72,14 +74,14 @@ swift_int16_t multipliers[N];
|
||||
// compression function, i is between 0 and 31, x_i is a 64-bit value.
|
||||
// One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper --
|
||||
// formula (2), section 3, page 6.
|
||||
swift_int16_t fftTable[256 * EIGHTH_N];
|
||||
swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64)));
|
||||
|
||||
// The A's we use in SWIFFTX shall be random elements of Z_257.
|
||||
// We generated these A's from the decimal expansion of PI as follows: we converted each
|
||||
// triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A
|
||||
// element, otherwise move to the next triple of digits in the expansion. This guarntees that
|
||||
// the A's are random, provided that PI digits are.
|
||||
const swift_int16_t As[3 * M * N] =
|
||||
const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) =
|
||||
{141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78,
|
||||
50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93,
|
||||
95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105,
|
||||
@@ -636,9 +638,202 @@ void InitializeSWIFFTX()
|
||||
wasSetupDone = true;
|
||||
}
|
||||
|
||||
// In the original code the F matrix is rotated so it was not aranged
|
||||
// the same as all the other data. Rearanging F to match all the other
|
||||
// data made vectorizing possible, the compiler probably could have been
|
||||
// able to auto-vectorize with proper data organisation.
|
||||
// Also in the original code the custom 16 bit data types are all now 32
|
||||
// bit int32_t regardless of the type name.
|
||||
//
|
||||
void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
{
|
||||
swift_int16_t *mult = multipliers;
|
||||
#if defined(__AVX2__)
|
||||
|
||||
__m256i F[8] __attribute__ ((aligned (64)));
|
||||
__m256i *mul = (__m256i*)multipliers;
|
||||
__m256i *out = (__m256i*)output;
|
||||
__m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] );
|
||||
|
||||
F[0] = _mm256_mullo_epi32( mul[0], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[1] << 3 ] );
|
||||
F[1] = _mm256_mullo_epi32( mul[1], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[2] << 3 ] );
|
||||
F[2] = _mm256_mullo_epi32( mul[2], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[3] << 3 ] );
|
||||
F[3] = _mm256_mullo_epi32( mul[3], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[4] << 3 ] );
|
||||
F[4] = _mm256_mullo_epi32( mul[4], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[5] << 3 ] );
|
||||
F[5] = _mm256_mullo_epi32( mul[5], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[6] << 3 ] );
|
||||
F[6] = _mm256_mullo_epi32( mul[6], *tbl );
|
||||
tbl = (__m256i*)&( fftTable[ input[7] << 3 ] );
|
||||
F[7] = _mm256_mullo_epi32( mul[7], *tbl );
|
||||
|
||||
#define ADD_SUB( a, b ) \
|
||||
{ \
|
||||
__m256i tmp = b; \
|
||||
b = _mm256_sub_epi32( a, b ); \
|
||||
a = _mm256_add_epi32( a, tmp ); \
|
||||
}
|
||||
|
||||
ADD_SUB( F[0], F[1] );
|
||||
ADD_SUB( F[2], F[3] );
|
||||
ADD_SUB( F[4], F[5] );
|
||||
ADD_SUB( F[6], F[7] );
|
||||
|
||||
F[3] = _mm256_slli_epi32( F[3], 4 );
|
||||
F[7] = _mm256_slli_epi32( F[7], 4 );
|
||||
|
||||
ADD_SUB( F[0], F[2] );
|
||||
ADD_SUB( F[1], F[3] );
|
||||
ADD_SUB( F[4], F[6] );
|
||||
ADD_SUB( F[5], F[7] );
|
||||
|
||||
F[5] = _mm256_slli_epi32( F[5], 2 );
|
||||
F[6] = _mm256_slli_epi32( F[6], 4 );
|
||||
F[7] = _mm256_slli_epi32( F[7], 6 );
|
||||
|
||||
ADD_SUB( F[0], F[4] );
|
||||
ADD_SUB( F[1], F[5] );
|
||||
ADD_SUB( F[2], F[6] );
|
||||
ADD_SUB( F[3], F[7] );
|
||||
|
||||
#undef ADD_SUB
|
||||
|
||||
#if defined (__AVX512VL__) && defined(__AVX512BW__)
|
||||
|
||||
#define Q_REDUCE( a ) \
|
||||
_mm256_sub_epi32( _mm256_and_si256( a, \
|
||||
_mm256_movm_epi8( 0x11111111 ) ), _mm256_srai_epi32( a, 8 ) )
|
||||
|
||||
#else
|
||||
|
||||
#define Q_REDUCE( a ) \
|
||||
_mm256_sub_epi32( _mm256_and_si256( a, \
|
||||
m256_const1_32( 0x000000ff ) ), _mm256_srai_epi32( a, 8 ) )
|
||||
|
||||
#endif
|
||||
|
||||
out[0] = Q_REDUCE( F[0] );
|
||||
out[1] = Q_REDUCE( F[1] );
|
||||
out[2] = Q_REDUCE( F[2] );
|
||||
out[3] = Q_REDUCE( F[3] );
|
||||
out[4] = Q_REDUCE( F[4] );
|
||||
out[5] = Q_REDUCE( F[5] );
|
||||
out[6] = Q_REDUCE( F[6] );
|
||||
out[7] = Q_REDUCE( F[7] );
|
||||
|
||||
#undef Q_REDUCE
|
||||
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
__m128i F[16] __attribute__ ((aligned (64)));
|
||||
__m128i *mul = (__m128i*)multipliers;
|
||||
__m128i *out = (__m128i*)output;
|
||||
__m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] );
|
||||
|
||||
F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] );
|
||||
F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[1] << 3 ] );
|
||||
F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] );
|
||||
F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[2] << 3 ] );
|
||||
F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] );
|
||||
F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[3] << 3 ] );
|
||||
F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] );
|
||||
F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[4] << 3 ] );
|
||||
F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] );
|
||||
F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[5] << 3 ] );
|
||||
F[10] = _mm_mullo_epi32( mul[10], tbl[0] );
|
||||
F[11] = _mm_mullo_epi32( mul[11], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[6] << 3 ] );
|
||||
F[12] = _mm_mullo_epi32( mul[12], tbl[0] );
|
||||
F[13] = _mm_mullo_epi32( mul[13], tbl[1] );
|
||||
tbl = (__m128i*)&( fftTable[ input[7] << 3 ] );
|
||||
F[14] = _mm_mullo_epi32( mul[14], tbl[0] );
|
||||
F[15] = _mm_mullo_epi32( mul[15], tbl[1] );
|
||||
|
||||
#define ADD_SUB( a, b ) \
|
||||
{ \
|
||||
__m128i tmp = b; \
|
||||
b = _mm_sub_epi32( a, b ); \
|
||||
a = _mm_add_epi32( a, tmp ); \
|
||||
}
|
||||
|
||||
ADD_SUB( F[ 0], F[ 2] );
|
||||
ADD_SUB( F[ 1], F[ 3] );
|
||||
ADD_SUB( F[ 4], F[ 6] );
|
||||
ADD_SUB( F[ 5], F[ 7] );
|
||||
ADD_SUB( F[ 8], F[10] );
|
||||
ADD_SUB( F[ 9], F[11] );
|
||||
ADD_SUB( F[12], F[14] );
|
||||
ADD_SUB( F[13], F[15] );
|
||||
|
||||
F[ 6] = _mm_slli_epi32( F[ 6], 4 );
|
||||
F[ 7] = _mm_slli_epi32( F[ 7], 4 );
|
||||
F[14] = _mm_slli_epi32( F[14], 4 );
|
||||
F[15] = _mm_slli_epi32( F[15], 4 );
|
||||
|
||||
ADD_SUB( F[ 0], F[ 4] );
|
||||
ADD_SUB( F[ 1], F[ 5] );
|
||||
ADD_SUB( F[ 2], F[ 6] );
|
||||
ADD_SUB( F[ 3], F[ 7] );
|
||||
ADD_SUB( F[ 8], F[12] );
|
||||
ADD_SUB( F[ 9], F[13] );
|
||||
ADD_SUB( F[10], F[14] );
|
||||
ADD_SUB( F[11], F[15] );
|
||||
|
||||
F[10] = _mm_slli_epi32( F[10], 2 );
|
||||
F[11] = _mm_slli_epi32( F[11], 2 );
|
||||
F[12] = _mm_slli_epi32( F[12], 4 );
|
||||
F[13] = _mm_slli_epi32( F[13], 4 );
|
||||
F[14] = _mm_slli_epi32( F[14], 6 );
|
||||
F[15] = _mm_slli_epi32( F[15], 6 );
|
||||
|
||||
ADD_SUB( F[ 0], F[ 8] );
|
||||
ADD_SUB( F[ 1], F[ 9] );
|
||||
ADD_SUB( F[ 2], F[10] );
|
||||
ADD_SUB( F[ 3], F[11] );
|
||||
ADD_SUB( F[ 4], F[12] );
|
||||
ADD_SUB( F[ 5], F[13] );
|
||||
ADD_SUB( F[ 6], F[14] );
|
||||
ADD_SUB( F[ 7], F[15] );
|
||||
|
||||
#undef ADD_SUB
|
||||
|
||||
#define Q_REDUCE( a ) \
|
||||
_mm_sub_epi32( _mm_and_si128( a, \
|
||||
m128_const1_32( 0x000000ff ) ), _mm_srai_epi32( a, 8 ) )
|
||||
|
||||
out[ 0] = Q_REDUCE( F[ 0] );
|
||||
out[ 1] = Q_REDUCE( F[ 1] );
|
||||
out[ 2] = Q_REDUCE( F[ 2] );
|
||||
out[ 3] = Q_REDUCE( F[ 3] );
|
||||
out[ 4] = Q_REDUCE( F[ 4] );
|
||||
out[ 5] = Q_REDUCE( F[ 5] );
|
||||
out[ 6] = Q_REDUCE( F[ 6] );
|
||||
out[ 7] = Q_REDUCE( F[ 7] );
|
||||
out[ 8] = Q_REDUCE( F[ 8] );
|
||||
out[ 9] = Q_REDUCE( F[ 9] );
|
||||
out[10] = Q_REDUCE( F[10] );
|
||||
out[11] = Q_REDUCE( F[11] );
|
||||
out[12] = Q_REDUCE( F[12] );
|
||||
out[13] = Q_REDUCE( F[13] );
|
||||
out[14] = Q_REDUCE( F[14] );
|
||||
out[15] = Q_REDUCE( F[15] );
|
||||
|
||||
#undef Q_REDUCE
|
||||
|
||||
#else // < SSE4.1
|
||||
|
||||
swift_int16_t *mult = multipliers;
|
||||
|
||||
// First loop unrolling:
|
||||
register swift_int16_t *table = &(fftTable[input[0] << 3]);
|
||||
|
||||
/*
|
||||
swift_int32_t F[64];
|
||||
@@ -666,11 +861,8 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
|
||||
F60, F61, F62, F63;
|
||||
|
||||
// First loop unrolling:
|
||||
register swift_int16_t *table = &(fftTable[input[0] << 3]);
|
||||
|
||||
F0 = mult[0] * table[0];
|
||||
F8 = mult[1] * table[1];
|
||||
F0 = mult[0] * table[0];
|
||||
F8 = mult[1] * table[1];
|
||||
F16 = mult[2] * table[2];
|
||||
F24 = mult[3] * table[3];
|
||||
F32 = mult[4] * table[4];
|
||||
@@ -678,90 +870,93 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
F48 = mult[6] * table[6];
|
||||
F56 = mult[7] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[1] << 3]);
|
||||
|
||||
F1 = mult[0] * table[0];
|
||||
F9 = mult[1] * table[1];
|
||||
F17 = mult[2] * table[2];
|
||||
F25 = mult[3] * table[3];
|
||||
F33 = mult[4] * table[4];
|
||||
F41 = mult[5] * table[5];
|
||||
F49 = mult[6] * table[6];
|
||||
F57 = mult[7] * table[7];
|
||||
F1 = mult[ 8] * table[0];
|
||||
F9 = mult[ 9] * table[1];
|
||||
F17 = mult[10] * table[2];
|
||||
F25 = mult[11] * table[3];
|
||||
F33 = mult[12] * table[4];
|
||||
F41 = mult[13] * table[5];
|
||||
F49 = mult[14] * table[6];
|
||||
F57 = mult[15] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[2] << 3]);
|
||||
|
||||
F2 = mult[0] * table[0];
|
||||
F10 = mult[1] * table[1];
|
||||
F18 = mult[2] * table[2];
|
||||
F26 = mult[3] * table[3];
|
||||
F34 = mult[4] * table[4];
|
||||
F42 = mult[5] * table[5];
|
||||
F50 = mult[6] * table[6];
|
||||
F58 = mult[7] * table[7];
|
||||
F2 = mult[16] * table[0];
|
||||
F10 = mult[17] * table[1];
|
||||
F18 = mult[18] * table[2];
|
||||
F26 = mult[19] * table[3];
|
||||
F34 = mult[20] * table[4];
|
||||
F42 = mult[21] * table[5];
|
||||
F50 = mult[22] * table[6];
|
||||
F58 = mult[23] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[3] << 3]);
|
||||
|
||||
F3 = mult[0] * table[0];
|
||||
F11 = mult[1] * table[1];
|
||||
F19 = mult[2] * table[2];
|
||||
F27 = mult[3] * table[3];
|
||||
F35 = mult[4] * table[4];
|
||||
F43 = mult[5] * table[5];
|
||||
F51 = mult[6] * table[6];
|
||||
F59 = mult[7] * table[7];
|
||||
F3 = mult[24] * table[0];
|
||||
F11 = mult[25] * table[1];
|
||||
F19 = mult[26] * table[2];
|
||||
F27 = mult[27] * table[3];
|
||||
F35 = mult[28] * table[4];
|
||||
F43 = mult[29] * table[5];
|
||||
F51 = mult[30] * table[6];
|
||||
F59 = mult[31] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[4] << 3]);
|
||||
|
||||
F4 = mult[0] * table[0];
|
||||
F12 = mult[1] * table[1];
|
||||
F20 = mult[2] * table[2];
|
||||
F28 = mult[3] * table[3];
|
||||
F36 = mult[4] * table[4];
|
||||
F44 = mult[5] * table[5];
|
||||
F52 = mult[6] * table[6];
|
||||
F60 = mult[7] * table[7];
|
||||
F4 = mult[32] * table[0];
|
||||
F12 = mult[33] * table[1];
|
||||
F20 = mult[34] * table[2];
|
||||
F28 = mult[35] * table[3];
|
||||
F36 = mult[36] * table[4];
|
||||
F44 = mult[37] * table[5];
|
||||
F52 = mult[38] * table[6];
|
||||
F60 = mult[39] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[5] << 3]);
|
||||
|
||||
F5 = mult[0] * table[0];
|
||||
F13 = mult[1] * table[1];
|
||||
F21 = mult[2] * table[2];
|
||||
F29 = mult[3] * table[3];
|
||||
F37 = mult[4] * table[4];
|
||||
F45 = mult[5] * table[5];
|
||||
F53 = mult[6] * table[6];
|
||||
F61 = mult[7] * table[7];
|
||||
F5 = mult[40] * table[0];
|
||||
F13 = mult[41] * table[1];
|
||||
F21 = mult[42] * table[2];
|
||||
F29 = mult[43] * table[3];
|
||||
F37 = mult[44] * table[4];
|
||||
F45 = mult[45] * table[5];
|
||||
F53 = mult[46] * table[6];
|
||||
F61 = mult[47] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[6] << 3]);
|
||||
|
||||
F6 = mult[0] * table[0];
|
||||
F14 = mult[1] * table[1];
|
||||
F22 = mult[2] * table[2];
|
||||
F30 = mult[3] * table[3];
|
||||
F38 = mult[4] * table[4];
|
||||
F46 = mult[5] * table[5];
|
||||
F54 = mult[6] * table[6];
|
||||
F62 = mult[7] * table[7];
|
||||
F6 = mult[48] * table[0];
|
||||
F14 = mult[49] * table[1];
|
||||
F22 = mult[50] * table[2];
|
||||
F30 = mult[51] * table[3];
|
||||
F38 = mult[52] * table[4];
|
||||
F46 = mult[53] * table[5];
|
||||
F54 = mult[54] * table[6];
|
||||
F62 = mult[55] * table[7];
|
||||
|
||||
mult += 8;
|
||||
table = &(fftTable[input[7] << 3]);
|
||||
|
||||
F7 = mult[0] * table[0];
|
||||
F15 = mult[1] * table[1];
|
||||
F23 = mult[2] * table[2];
|
||||
F31 = mult[3] * table[3];
|
||||
F39 = mult[4] * table[4];
|
||||
F47 = mult[5] * table[5];
|
||||
F55 = mult[6] * table[6];
|
||||
F63 = mult[7] * table[7];
|
||||
F7 = mult[56] * table[0];
|
||||
F15 = mult[57] * table[1];
|
||||
F23 = mult[58] * table[2];
|
||||
F31 = mult[59] * table[3];
|
||||
F39 = mult[60] * table[4];
|
||||
F47 = mult[61] * table[5];
|
||||
F55 = mult[62] * table[6];
|
||||
F63 = mult[63] * table[7];
|
||||
|
||||
#define ADD_SUB( a, b ) \
|
||||
{ \
|
||||
int temp = b; \
|
||||
b = a - b; \
|
||||
a = a + temp; \
|
||||
}
|
||||
|
||||
#define Q_REDUCE( a ) \
|
||||
( ( (a) & 0xff ) - ( (a) >> 8 ) )
|
||||
|
||||
/*
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
@@ -800,7 +995,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
// Second loop unrolling:
|
||||
// Iteration 0:
|
||||
ADD_SUB(F0, F1);
|
||||
@@ -1057,6 +1251,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
|
||||
output[47] = Q_REDUCE(F61);
|
||||
output[55] = Q_REDUCE(F62);
|
||||
output[63] = Q_REDUCE(F63);
|
||||
|
||||
#undef ADD_SUB
|
||||
#undef Q_REDUCE
|
||||
|
||||
#endif // AVX2 elif SSE4.1 else
|
||||
}
|
||||
|
||||
// Calculates the FFT part of SWIFFT.
|
||||
@@ -1086,24 +1285,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output)
|
||||
// - m: the input size divided by 64.
|
||||
// - output: will store the result.
|
||||
// - a: the coefficients in the sum. Of size 64 * m.
|
||||
void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a)
|
||||
void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
|
||||
const swift_int16_t *a )
|
||||
{
|
||||
int i, j;
|
||||
swift_int32_t result[N];
|
||||
swift_int32_t result[N] __attribute__ ((aligned (64)));
|
||||
register swift_int16_t carry = 0;
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
__m512i *res = (__m512i*)result;
|
||||
for ( j = 0; j < N/16; ++j )
|
||||
{
|
||||
__m512i sum = _mm512_setzero_si512();
|
||||
const __m512i *f = (__m512i*)input + j;
|
||||
const __m512i *k = (__m512i*)a + j;
|
||||
for ( i = 0; i < m; i++, f += N/16, k += N/16 )
|
||||
sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) );
|
||||
res[j] = sum;
|
||||
}
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
__m256i *res = (__m256i*)result;
|
||||
for ( j = 0; j < N/8; ++j )
|
||||
{
|
||||
__m256i sum = _mm256_setzero_si256();
|
||||
const __m256i *f = (__m256i*)input + j;
|
||||
const __m256i *k = (__m256i*)a + j;
|
||||
for ( i = 0; i < m; i++, f += N/8, k += N/8 )
|
||||
sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) );
|
||||
res[j] = sum;
|
||||
}
|
||||
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
__m128i *res = (__m128i*)result;
|
||||
for ( j = 0; j < N/4; ++j )
|
||||
{
|
||||
__m128i sum = _mm_setzero_si128();
|
||||
const __m128i *f = (__m128i*)input + j;
|
||||
const __m128i *k = (__m128i*)a + j;
|
||||
for ( i = 0; i < m; i++, f += N/4, k += N/4 )
|
||||
sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) );
|
||||
res[j] = sum;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for (j = 0; j < N; ++j)
|
||||
{
|
||||
register swift_int32_t sum = 0;
|
||||
const register swift_int32_t *f = input + j;
|
||||
const register swift_int16_t *k = a + j;
|
||||
|
||||
for (i = 0; i < m; i++, f += N,k += N)
|
||||
sum += (*f) * (*k);
|
||||
|
||||
result[j] = sum;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
for (j = 0; j < N; ++j)
|
||||
result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE;
|
||||
|
||||
@@ -1122,8 +1363,8 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
||||
{
|
||||
int i;
|
||||
// Will store the result of the FFT parts:
|
||||
swift_int32_t fftOut[N * M];
|
||||
unsigned char intermediate[N * 3 + 8];
|
||||
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
|
||||
unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
|
||||
unsigned char carry0,carry1,carry2;
|
||||
|
||||
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
|
||||
@@ -1199,8 +1440,8 @@ void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
|
||||
{
|
||||
int i;
|
||||
// Will store the result of the FFT parts:
|
||||
swift_int32_t fftOut[N * M];
|
||||
unsigned char intermediate[N * 3 + 8];
|
||||
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
|
||||
unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
|
||||
unsigned char carry0,carry1,carry2;
|
||||
|
||||
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.18.0'
|
||||
PACKAGE_STRING='cpuminer-opt 3.18.0'
|
||||
PACKAGE_VERSION='3.18.2'
|
||||
PACKAGE_STRING='cpuminer-opt 3.18.2'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.18.0
|
||||
cpuminer-opt configure 3.18.2
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.18.0, which was
|
||||
It was created by cpuminer-opt $as_me 3.18.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.18.0'
|
||||
VERSION='3.18.2'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.18.0, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.18.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.18.0
|
||||
cpuminer-opt config.status 3.18.2
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.18.0])
|
||||
AC_INIT([cpuminer-opt], [3.18.2])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
89
cpu-miner.c
89
cpu-miner.c
@@ -1112,19 +1112,17 @@ void report_summary_log( bool force )
|
||||
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
|
||||
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
|
||||
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
|
||||
submit_rate, (double)submitted_share_count*60. /
|
||||
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) );
|
||||
submit_rate, safe_div( (double)submitted_share_count*60.,
|
||||
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
|
||||
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
|
||||
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
|
||||
|
||||
if ( accepted_share_count < submitted_share_count )
|
||||
{
|
||||
double lost_ghrate = uptime.tv_sec == 0 ? 0.
|
||||
: target_diff
|
||||
* (double)(submitted_share_count - accepted_share_count )
|
||||
/ (double)uptime.tv_sec;
|
||||
double lost_shrate = share_time == 0. ? 0.
|
||||
: target_diff * (double)(submits - accepts ) / share_time;
|
||||
double lost_ghrate = safe_div( target_diff
|
||||
* (double)(submitted_share_count - accepted_share_count ),
|
||||
(double)uptime.tv_sec, 0. );
|
||||
double lost_shrate = safe_div( target_diff * (double)(submits - accepts ), share_time, 0. );
|
||||
char lshr_units[4] = {0};
|
||||
char lghr_units[4] = {0};
|
||||
scale_hash_for_display( &lost_shrate, lshr_units );
|
||||
@@ -2083,7 +2081,8 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
/ ( opt_target_factor * opt_diff_factor );
|
||||
diff_to_hash( g_work->target, g_work->targetdiff );
|
||||
|
||||
// Increment extranonce2
|
||||
// Pre increment extranonce2 in case of being called again before receiving
|
||||
// a new job
|
||||
for ( int t = 0;
|
||||
t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] );
|
||||
t++ );
|
||||
@@ -2103,20 +2102,12 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
int mismatch = submitted_share_count
|
||||
- ( accepted_share_count + stale_share_count + rejected_share_count );
|
||||
if ( mismatch )
|
||||
applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count );
|
||||
}
|
||||
|
||||
if ( stratum_diff != sctx->job.diff )
|
||||
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
|
||||
sctx->job.diff, sctx->block_height, g_work->job_id );
|
||||
else if ( last_block_height != sctx->block_height )
|
||||
applog( LOG_BLUE, "New Block %d, Job %s",
|
||||
sctx->block_height, g_work->job_id );
|
||||
applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s",
|
||||
sctx->block_height, net_diff, g_work->job_id );
|
||||
else if ( g_work->job_id && new_job )
|
||||
applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
|
||||
sctx->block_height, net_diff, g_work->job_id );
|
||||
@@ -2173,7 +2164,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
{
|
||||
double net_hr = nd / net_ttf;
|
||||
char net_hr_units[4] = {0};
|
||||
|
||||
scale_hash_for_display ( &net_hr, net_hr_units );
|
||||
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
|
||||
net_hr, net_hr_units );
|
||||
@@ -2182,6 +2172,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
} // hr > 0
|
||||
} // !quiet
|
||||
} // new diff/block
|
||||
|
||||
if ( new_job && !opt_quiet )
|
||||
{
|
||||
int mismatch = submitted_share_count - ( accepted_share_count
|
||||
+ stale_share_count
|
||||
+ rejected_share_count );
|
||||
if ( mismatch )
|
||||
applog( LOG_INFO,
|
||||
CL_LBL "%d Submitted share pending, maybe stale" CL_N,
|
||||
submitted_share_count );
|
||||
}
|
||||
}
|
||||
|
||||
static void *miner_thread( void *userdata )
|
||||
@@ -2492,18 +2493,21 @@ static void *miner_thread( void *userdata )
|
||||
timeval_subtract( &uptime, &total_hashes_time, &session_start );
|
||||
double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. );
|
||||
|
||||
scale_hash_for_display( &hashrate, hr_units );
|
||||
sprintf( hr, "%.2f", hashrate );
|
||||
if ( hashrate > 0. )
|
||||
{
|
||||
scale_hash_for_display( &hashrate, hr_units );
|
||||
sprintf( hr, "%.2f", hashrate );
|
||||
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
|
||||
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
|
||||
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
|
||||
#else
|
||||
float lo_freq = 0., hi_freq = 0.;
|
||||
linux_cpu_hilo_freq( &lo_freq, &hi_freq );
|
||||
applog( LOG_NOTICE,
|
||||
float lo_freq = 0., hi_freq = 0.;
|
||||
linux_cpu_hilo_freq( &lo_freq, &hi_freq );
|
||||
applog( LOG_NOTICE,
|
||||
"Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz",
|
||||
hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6,
|
||||
hi_freq / 1e6 );
|
||||
#endif
|
||||
}
|
||||
}
|
||||
} // benchmark
|
||||
|
||||
@@ -2897,6 +2901,7 @@ static bool cpu_capability( bool display_only )
|
||||
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
|
||||
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
|
||||
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
|
||||
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
|
||||
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
|
||||
@@ -2904,6 +2909,8 @@ static bool cpu_capability( bool display_only )
|
||||
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
|
||||
bool use_aes;
|
||||
bool use_sse2;
|
||||
bool use_sse42;
|
||||
bool use_avx;
|
||||
bool use_avx2;
|
||||
bool use_avx512;
|
||||
bool use_sha;
|
||||
@@ -2973,18 +2980,21 @@ static bool cpu_capability( bool display_only )
|
||||
else if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sha ) printf( " SHA" );
|
||||
|
||||
printf("\nAlgo features:");
|
||||
if ( algo_features == EMPTY_SET ) printf( " None" );
|
||||
else
|
||||
if ( !display_only )
|
||||
{
|
||||
if ( algo_has_avx512 ) printf( " AVX512" );
|
||||
else if ( algo_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( algo_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( algo_has_sse2 ) printf( " SSE2 " );
|
||||
if ( algo_has_vaes ||
|
||||
algo_has_vaes256 ) printf( " VAES" );
|
||||
else if ( algo_has_aes ) printf( " AES" );
|
||||
if ( algo_has_sha ) printf( " SHA" );
|
||||
printf("\nAlgo features:");
|
||||
if ( algo_features == EMPTY_SET ) printf( " None" );
|
||||
else
|
||||
{
|
||||
if ( algo_has_avx512 ) printf( " AVX512" );
|
||||
else if ( algo_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( algo_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( algo_has_sse2 ) printf( " SSE2 " );
|
||||
if ( algo_has_vaes ||
|
||||
algo_has_vaes256 ) printf( " VAES" );
|
||||
else if ( algo_has_aes ) printf( " AES" );
|
||||
if ( algo_has_sha ) printf( " SHA" );
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
@@ -3019,6 +3029,8 @@ static bool cpu_capability( bool display_only )
|
||||
|
||||
// Determine mining options
|
||||
use_sse2 = cpu_has_sse2 && algo_has_sse2;
|
||||
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
|
||||
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
|
||||
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
|
||||
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
|
||||
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
|
||||
@@ -3035,6 +3047,8 @@ static bool cpu_capability( bool display_only )
|
||||
{
|
||||
if ( use_avx512 ) printf( " AVX512" );
|
||||
else if ( use_avx2 ) printf( " AVX2" );
|
||||
else if ( use_avx ) printf( " AVX" );
|
||||
else if ( use_sse42 ) printf( " SSE42" );
|
||||
else if ( use_sse2 ) printf( " SSE2" );
|
||||
if ( use_vaes ) printf( " VAES" );
|
||||
else if ( use_aes ) printf( " AES" );
|
||||
@@ -3970,6 +3984,7 @@ int main(int argc, char *argv[])
|
||||
gettimeofday( &last_submit_time, NULL );
|
||||
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
|
||||
memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
|
||||
memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) );
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm",
|
||||
|
18
miner.h
18
miner.h
@@ -868,9 +868,9 @@ Options:\n\
|
||||
yespowerr16 Yenten (YTN)\n\
|
||||
yespower-b2b generic yespower + blake2b\n\
|
||||
zr5 Ziftr\n\
|
||||
-N, --param-n N parameter for scrypt based algos\n\
|
||||
-R, --param-r R parameter for scrypt based algos\n\
|
||||
-K, --param-key Key (pers) parameter for algos that use it\n\
|
||||
-N, --param-n=N N parameter for scrypt based algos\n\
|
||||
-R, --param-r=N R parameter for scrypt based algos\n\
|
||||
-K, --param-key=STRING Key (pers) parameter for algos that use it\n\
|
||||
-o, --url=URL URL of mining server\n\
|
||||
-O, --userpass=U:P username:password pair for mining server\n\
|
||||
-u, --user=USERNAME username for mining server\n\
|
||||
@@ -886,8 +886,8 @@ Options:\n\
|
||||
-s, --scantime=N upper bound on time spent scanning current work when\n\
|
||||
long polling is unavailable, in seconds (default: 5)\n\
|
||||
--randomize Randomize scan range start to reduce duplicates\n\
|
||||
-f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\
|
||||
-m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\
|
||||
-f, --diff-factor=N Divide req. difficulty by this factor (std is 1.0)\n\
|
||||
-m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\
|
||||
--hash-meter Display thread hash rates\n\
|
||||
--coinbase-addr=ADDR payout address for solo mining\n\
|
||||
--coinbase-sig=TEXT data to insert in the coinbase when possible\n\
|
||||
@@ -895,9 +895,9 @@ Options:\n\
|
||||
--no-getwork disable getwork support\n\
|
||||
--no-gbt disable getblocktemplate support\n\
|
||||
--no-stratum disable X-Stratum support\n\
|
||||
--no-extranonce disable Stratum extranonce support\n\
|
||||
--no-extranonce disable Stratum extranonce subscribe\n\
|
||||
--no-redirect ignore requests to change the URL of the mining server\n\
|
||||
-q, --quiet disable per-thread hashmeter output\n\
|
||||
-q, --quiet reduce log verbosity\n\
|
||||
--no-color disable colored output\n\
|
||||
-D, --debug enable debug output\n\
|
||||
-P, --protocol-dump verbose dump of protocol-level activities\n"
|
||||
@@ -916,9 +916,9 @@ Options:\n\
|
||||
--max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
|
||||
--max-diff=N Only mine if net difficulty is less than specified value\n\
|
||||
-c, --config=FILE load a JSON-format configuration file\n\
|
||||
--data-file path and name of data file\n\
|
||||
--data-file=FILE path and name of data file\n\
|
||||
--verify enable additional time consuming start up tests\n\
|
||||
-V, --version display version information and exit\n\
|
||||
-V, --version display version and CPU information and exit\n\
|
||||
-h, --help display this help text and exit\n\
|
||||
";
|
||||
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#if !defined(SIMD_256_H__)
|
||||
#define SIMD_256_H__ 1
|
||||
|
||||
#if defined(__AVX2__)
|
||||
//#if defined(__AVX2__)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
@@ -14,7 +14,9 @@
|
||||
// is limited because 256 bit vectors are less likely to be used when 512
|
||||
// is available.
|
||||
|
||||
// Used instead if casting.
|
||||
#if defined(__AVX__)
|
||||
|
||||
// Used instead of casting.
|
||||
typedef union
|
||||
{
|
||||
__m256i m256;
|
||||
@@ -23,6 +25,28 @@ typedef union
|
||||
uint32_t u32[8];
|
||||
} __attribute__ ((aligned (32))) m256_ovly;
|
||||
|
||||
//
|
||||
// Pointer casting
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns p as pointer to vector type, not very useful
|
||||
#define castp_m256i(p) ((__m256i*)(p))
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns *p, watch your pointer arithmetic
|
||||
#define cast_m256i(p) (*((__m256i*)(p)))
|
||||
|
||||
// p = any aligned pointer, i = scaled array index
|
||||
// returns value p[i]
|
||||
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
|
||||
|
||||
// p = any aligned pointer, o = scaled offset
|
||||
// returns pointer p+o
|
||||
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
|
||||
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
|
||||
|
||||
// Move integer to low element of vector, other elements are set to zero.
|
||||
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
|
||||
@@ -91,26 +115,6 @@ static inline __m256i mm256_neg1_fn()
|
||||
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
|
||||
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
|
||||
|
||||
//
|
||||
// Pointer casting
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns p as pointer to vector type, not very useful
|
||||
#define castp_m256i(p) ((__m256i*)(p))
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns *p, watch your pointer arithmetic
|
||||
#define cast_m256i(p) (*((__m256i*)(p)))
|
||||
|
||||
// p = any aligned pointer, i = scaled array index
|
||||
// returns value p[i]
|
||||
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
|
||||
|
||||
// p = any aligned pointer, o = scaled offset
|
||||
// returns pointer p+o
|
||||
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
|
||||
|
||||
|
||||
//
|
||||
// Memory functions
|
||||
// n = number of 256 bit (32 byte) vectors
|
||||
|
@@ -535,7 +535,6 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
|
||||
|
||||
// Rotate 256 bit lanes by one 64 bit element
|
||||
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
|
||||
|
||||
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||
|
||||
// Rotate 256 bit lanes by one 32 bit element
|
||||
@@ -611,9 +610,6 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
||||
// shufl2r is 2 input ...
|
||||
// Drop macros? They can easilly be rebuilt using shufl2 functions
|
||||
|
||||
// add shuflr shufll functions performing rotate, returning first arg
|
||||
// They're faster than doing both, when both not needed.
|
||||
|
||||
// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
|
||||
// rotated v1
|
||||
// visually confusing for shif2r because of arg order. First arg is always
|
||||
|
@@ -2,22 +2,21 @@
|
||||
#define SIMD_INT_H__ 1
|
||||
|
||||
// Endian byte swap
|
||||
#define bswap_64( a ) __builtin_bswap64( a )
|
||||
#define bswap_32( a ) __builtin_bswap32( a )
|
||||
#define bswap_64 __builtin_bswap64
|
||||
#define bswap_32 __builtin_bswap32
|
||||
|
||||
// Bit rotation
|
||||
#define rol64 __rolq
|
||||
#define ror64 __rorq
|
||||
#define rol32 __rold
|
||||
#define ror32 __rord
|
||||
|
||||
// Safe division, integer or floating point. For floating point it's as
|
||||
// safe as 0. is precisely zero.
|
||||
// Returns safe_result if division by zero.
|
||||
// safe as 0 is precisely zero.
|
||||
// Returns safe_result if division by zero, typically zero.
|
||||
#define safe_div( dividend, divisor, safe_result ) \
|
||||
( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )
|
||||
|
||||
// Aliases with familiar names for built in bit rotate instructions
|
||||
#define rol64( a, n ) _lrotl( a, n )
|
||||
#define ror64( a, n ) _lrotr( a, n )
|
||||
#define rol32( a, n ) _rotl( a, n )
|
||||
#define ror32( a, n ) _rotr( a, n )
|
||||
#define rol16( a, n ) _rotwl( a, n )
|
||||
#define ror16( a, n ) _rotwr( a, n )
|
||||
|
||||
///////////////////////////////////////
|
||||
//
|
||||
|
Reference in New Issue
Block a user