This commit is contained in:
Jay D Dee
2023-03-11 14:54:49 -05:00
parent fb93160641
commit b339450898
49 changed files with 1120 additions and 1119 deletions

View File

@@ -264,8 +264,6 @@ cpuminer_SOURCES = \
algo/x16/x16r-4way.c \
algo/x16/x16rv2.c \
algo/x16/x16rv2-4way.c \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \

View File

@@ -65,6 +65,18 @@ If not what makes it happen or not happen?
Change Log
----------
v3.22.3
#392 #379 #389 Fixed misaligned address segfault solo mining.
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
#392 Fixed conditional mining.
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
Midstate prehash is now centralized, done only once instead of by every thread
for selected algos.
Small optimizations to serialized vectoring.
v3.22.2
Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.

View File

@@ -253,6 +253,7 @@ void init_algo_gate( algo_gate_t* gate )
gate->miner_thread_init = (void*)&return_true;
gate->scanhash = (void*)&scanhash_generic;
gate->hash = (void*)&null_hash;
gate->prehash = (void*)&return_true;
gate->get_new_work = (void*)&std_get_new_work;
gate->work_decode = (void*)&std_le_work_decode;
gate->decode_extra_data = (void*)&do_nothing;

View File

@@ -119,7 +119,7 @@ typedef struct
// to be registered with the gate.
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
int ( *hash ) ( void*, const void*, int );
int ( *hash ) ( void*, const void*, const int );
//optional, safe to use default in most cases
@@ -127,6 +127,9 @@ int ( *hash ) ( void*, const void*, int );
// other initialization specific to miner threads.
bool ( *miner_thread_init ) ( int );
// Perform prehash after receiving new work
int ( *prehash ) ( struct work* );
// Get thread local copy of blockheader with unique nonce.
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* );

View File

@@ -1,5 +1,6 @@
#include "blake2s-gate.h"
#include "blake2s-hash-4way.h"
//#include "sph-blake2s.h"
#include <string.h>
#include <stdint.h>
@@ -7,6 +8,43 @@
static __thread blake2s_16way_state blake2s_16w_ctx;
/*
static blake2s_16way_state blake2s_16w_ctx;
static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64)));
*/
/*
int blake2s_16way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_16w_ctx.t[0] = 64;
return 1;
}
*/
/*
int blake2s_16way_prehash( struct work *work )
{
mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 );
return 1;
}
*/
void blake2s_16way_hash( void *output, const void *input )
{
blake2s_16way_state ctx;
@@ -30,10 +68,40 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
// pthread_rwlock_rdlock( &g_work_lock );
memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 );
// casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 );
// casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 );
// casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 );
// pthread_rwlock_unlock( &g_work_lock );
*/
/*
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, pdata );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_16w_ctx.t[0] = 64;
*/
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
@@ -63,6 +131,36 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
static __thread blake2s_8way_state blake2s_8w_ctx;
/*
static blake2s_8way_state blake2s_8w_ctx;
static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32)));
int blake2s_8way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
for ( int i = 0; i < 8; i++ )
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] );
casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] );
casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] );
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
// intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata,
// edata, edata, edata, edata, 640 );
blake2s_8w_ctx.t[0] = 64;
}
*/
void blake2s_8way_hash( void *output, const void *input )
{
blake2s_8way_state ctx;
@@ -86,10 +184,41 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
// pthread_rwlock_rdlock( &g_work_lock );
memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 );
// pthread_rwlock_unlock( &g_work_lock );
*/
/*
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, pdata );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
for ( int i = 0; i < 8; i++ )
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] );
casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] );
casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] );
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
// intrlv_8x32( vdata, edata, edata, edata, edata,
// edata, edata, edata, edata, 640 );
blake2s_8w_ctx.t[0] = 64;
*/
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
@@ -117,7 +246,25 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
#elif defined(BLAKE2S_4WAY)
static __thread blake2s_4way_state blake2s_4w_ctx;
/*
static blake2s_4way_state blake2s_4w_ctx;
static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32)));
int blake2s_4way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 );
blake2s_4w_ctx.t[0] = 64;
}
*/
void blake2s_4way_hash( void *output, const void *input )
{
blake2s_4way_state ctx;
@@ -140,7 +287,11 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
__m128i *noncev = (__m128i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, blake2s_4way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
*/
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );

View File

@@ -5,13 +5,15 @@ bool register_blake2s_algo( algo_gate_t* gate )
#if defined(BLAKE2S_16WAY)
gate->scanhash = (void*)&scanhash_blake2s_16way;
gate->hash = (void*)&blake2s_16way_hash;
// gate->prehash = (void*)&blake2s_16way_prehash;
#elif defined(BLAKE2S_8WAY)
//#if defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
// gate->prehash = (void*)&blake2s_8way_prehash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
// gate->prehash = (void*)&blake2s_4way_prehash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;

View File

@@ -23,18 +23,22 @@ bool register_blake2s_algo( algo_gate_t* gate );
void blake2s_16way_hash( void *state, const void *input );
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_16way_prehash( struct work * );
#elif defined (BLAKE2S_8WAY)
void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_8way_prehash( struct work * );
#elif defined (BLAKE2S_4WAY)
void blake2s_4way_hash( void *state, const void *input );
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_4way_prehash( struct work * );
#else
void blake2s_hash( void *state, const void *input );

View File

@@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
@@ -120,7 +120,7 @@ do { \
#define ROUND4W(r) \
do { \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -317,8 +317,8 @@ do { \
#define G8W( sigma0, sigma1, a, b, c, d) \
do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
@@ -331,7 +331,7 @@ do { \
#define ROUND8W(r) \
do { \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -529,8 +529,8 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
#define G16W( sigma0, sigma1, a, b, c, d) \
do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi32( c, d ); \
@@ -543,7 +543,7 @@ do { \
#define ROUND16W(r) \
do { \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \

View File

@@ -20,6 +20,7 @@
#include <stddef.h>
#include <stdint.h>
//#include "sph-blake2s.h"
#if defined(_MSC_VER)
#include <inttypes.h>
@@ -33,7 +34,7 @@
#if defined(__cplusplus)
extern "C" {
#endif
/*
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
@@ -42,6 +43,13 @@ enum blake2s_constant
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
*/
#define BLAKE2S_BLOCKBYTES 64
#define BLAKE2S_OUTBYTES 32
#define BLAKE2S_KEYBYTES 32
#define BLAKE2S_SALTBYTES 8
#define BLAKE2S_PERSONALBYTES 8
#pragma pack(push, 1)
typedef struct __blake2s_nway_param

View File

@@ -8,8 +8,6 @@
#include "sph-blake2s.h"
static __thread blake2s_state blake2s_ctx;
//static __thread blake2s_state s_ctx;
#define MIDLEN 76
void blake2s_hash( void *output, const void *input )
{
@@ -19,37 +17,27 @@ void blake2s_hash( void *output, const void *input )
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
// blake2s_update(&ctx, input, 80);
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
memcpy(output, hash, 32);
}
/*
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
{
s_ctx.buflen = MIDLEN;
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
}
*/
int scanhash_blake2s( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[20];
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
@@ -58,11 +46,12 @@ int scanhash_blake2s( struct work *work,
do {
be32enc(&endiandata[19], n);
blake2s_hash( hash64, endiandata );
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -17,6 +17,7 @@
#include "algo/sha/sph_types.h"
#include "sph-blake2s.h"
#include "simd-utils.h"
static const uint32_t blake2s_IV[8] =
{
@@ -225,6 +226,71 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
v[13] = S->t[1] ^ blake2s_IV[5];
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if 0
//#if defined(__SSE2__) // always true
The only application for this is to do a prehash for the blake2s algorithm.
SSE2 also supports 4 way parallel hashing so that is preferred in most cases.
Testing has found that using this serial SIMD code for prehash is slower than
doing a parallel hash. A parallel hash has more instructions and uses more
data. The serial hash uses fewer instructions and data and only needs to
interleave the final hash into parallel streams. This has shown negligible
improvement on other algos, notably blake256 which is almost identical.
Considering the low frequency of prehash no statistically valid change
was expected. It was simply better on paper.
Furthermore, simply defining this macro has an additional negative effect on
blake2s as a whole. There are no references to this macro, blake2s-4way does
not include it in any header files, it's just another unused macro which should
have no effect beyond the preprocessor. But just being visible to the compiler
changes things in a dramatic way.
These 2 things combined reduced the hash rate for blake2s by more than 5% when
using serial SIMD for the blake2s prehash over 16way parallel prehash.
16way parallel hashing was used in the high frequency nonce loop in both cases.
Considering the prehash represents 50% of the algorithm and is done once vs
the high frequency second half that is done mega, maybe giga, times more it's
hard to imagine that big of an effect in either direction.
#define ROUND( r ) \
{ \
__m128i *V = (__m128i*)v; \
const uint8_t *sigma = blake2s_sigma[r]; \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \
m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \
m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[3] = mm128_shufll_32( V[3] ); \
V[2] = mm128_swap_64( V[2] ); \
V[1] = mm128_shuflr_32( V[1] ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \
m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \
m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[3] = mm128_shuflr_32( V[3] ); \
V[2] = mm128_swap_64( V[2] ); \
V[1] = mm128_shufll_32( V[1] ); \
}
#else
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -247,7 +313,10 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
ROUND( 0 );
#endif
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );

View File

@@ -91,6 +91,7 @@ static inline void secure_zero_memory(void *v, size_t n)
extern "C" {
#endif
/*
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
@@ -99,6 +100,13 @@ extern "C" {
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
*/
#define BLAKE2S_BLOCKBYTES 64
#define BLAKE2S_OUTBYTES 32
#define BLAKE2S_KEYBYTES 32
#define BLAKE2S_SALTBYTES 8
#define BLAKE2S_PERSONALBYTES 8
#pragma pack(push, 1)
typedef struct __blake2s_param

View File

@@ -103,16 +103,16 @@
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2] ); \
V3 = mm128_alignr_64( V[2], V[3] ); \
V6 = mm128_alignr_64( V[6], V[7] ); \
V7 = mm128_alignr_64( V[7], V[6] ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3 ); \
V[3] = mm128_alignr_64( V3, V2 ); \
V[6] = mm128_alignr_64( V7, V6 ); \
V[7] = mm128_alignr_64( V6, V7 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
#else

View File

@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[19], nonce);
myriad_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget))
if (hash[7] <= Htarg )
if ( fulltest(hash, ptarget) && !opt_benchmark )
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
submit_solution( work, hash, mythr );
}
nonce++;

View File

@@ -19,26 +19,34 @@
*/
#include <string.h>
#include <emmintrin.h>
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#if defined(__SSE4_1__)
#if defined(__AVX512VL__)
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi32( a1, b, 1 ); \
a1 = _mm_alignr_epi32( b, a1, 1 ); \
}
#elif defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
#else
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif

View File

@@ -24,6 +24,45 @@ typedef union {
#endif
} allium_16way_ctx_holder;
static uint32_t allium_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
static __m512i allium_16way_block0_hash[8] __attribute__ ((aligned (64)));
static __m512i allium_16way_block_buf[16] __attribute__ ((aligned (64)));
// Centralized prehash for allium-16way: serially hash the first 64-byte
// block of the 80-byte header once, broadcast the resulting midstate and
// the nonce-independent part of the second block across all 16 lanes,
// then run the shared round-0 prehash. Called once per new work instead
// of by every miner thread. Returns 1 per the algo_gate prehash contract.
int allium_16way_prehash( struct work *work )
{
   // Blake-256 initial chaining values.
   uint32_t phash[8] __attribute__ ((aligned (32))) =
   {
      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
   };
   uint32_t *pdata = work->data;

   // Serial prehash of the first block (identical for every nonce).
   blake256_transform_le( phash, pdata, 512, 0 );

   // Broadcast the first-block hash into all 16 lanes for the second block.
   for ( int i = 0; i < 8; i++ )
      allium_16way_block0_hash[i] = _mm512_set1_epi32( phash[i] );

   // Broadcast 12 of the last 16 header bytes; the nonces in block_buf[3]
   // are filled in per-thread by scanhash and are not touched here.
   for ( int i = 0; i < 3; i++ )
      allium_16way_block_buf[i] = _mm512_set1_epi32( pdata[ 16 + i ] );

   // Partially prehash round 0 of the second block, nonce-independent part.
   blake256_16way_round0_prehash_le( allium_16way_midstate_vars,
                     allium_16way_block0_hash, allium_16way_block_buf );
   return 1;
}
static void allium_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
@@ -200,11 +239,6 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -216,31 +250,19 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
// Prehash first block.
blake256_transform_le( phash, pdata, 512, 0 );
pthread_rwlock_rdlock( &g_work_lock );
// Interleave hash for second block prehash.
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
memcpy( midstate_vars, allium_16way_midstate_vars, sizeof midstate_vars );
memcpy( block0_hash, allium_16way_block0_hash, sizeof block0_hash );
memcpy( block_buf, allium_16way_block_buf, sizeof block_buf );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
pthread_rwlock_unlock( &g_work_lock );
// fill in the nonces
block_buf[3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -271,6 +293,44 @@ typedef union {
#endif
} allium_8way_ctx_holder;
static uint32_t allium_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
static __m256i allium_8way_block0_hash[8] __attribute__ ((aligned (64)));
static __m256i allium_8way_block_buf[16] __attribute__ ((aligned (64)));
// Centralized prehash for allium-8way (AVX2): serially hash the first
// 64-byte header block once, broadcast the midstate and the static part
// of the second block across all 8 lanes, then run the shared round-0
// prehash. Returns 1 per the algo_gate prehash contract.
int allium_8way_prehash ( struct work *work )
{
   // Blake-256 initial chaining values.
   uint32_t phash[8] __attribute__ ((aligned (32))) =
   {
      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
   };
   uint32_t *pdata = work->data;

   // Serial prehash of the first block (identical for every nonce).
   blake256_transform_le( phash, pdata, 512, 0 );

   // Broadcast the first-block hash into all 8 lanes.
   for ( int i = 0; i < 8; i++ )
      allium_8way_block0_hash[i] = _mm256_set1_epi32( phash[i] );

   // Broadcast 12 of the last 16 header bytes; per-lane nonces are
   // written into block_buf[3] later by scanhash.
   for ( int i = 0; i < 3; i++ )
      allium_8way_block_buf[i] = _mm256_set1_epi32( pdata[ 16 + i ] );

   // Partially prehash the second block without touching the nonces.
   blake256_8way_round0_prehash_le( allium_8way_midstate_vars,
                      allium_8way_block0_hash, allium_8way_block_buf );
   return 1;
}
static void allium_8way_hash( void *hash, const void *midstate_vars,
const void *midhash, const void *block )
{
@@ -386,11 +446,6 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
@@ -400,29 +455,17 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
pthread_rwlock_rdlock( &g_work_lock );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
memcpy( midstate_vars, allium_8way_midstate_vars, sizeof midstate_vars );
memcpy( block0_hash, allium_8way_block0_hash, sizeof block0_hash );
memcpy( block_buf, allium_8way_block_buf, sizeof block_buf );
pthread_rwlock_unlock( &g_work_lock );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -438,6 +481,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
n += 8;
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;

View File

@@ -131,10 +131,12 @@ bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->prehash = (void*)&lyra2z_16way_prehash;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->prehash = (void*)&lyra2z_8way_prehash;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
@@ -175,8 +177,10 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_16WAY)
gate->prehash = (void*)&allium_16way_prehash;
gate->scanhash = (void*)&scanhash_allium_16way;
#elif defined (ALLIUM_8WAY)
gate->prehash = (void*)&allium_8way_prehash;
gate->scanhash = (void*)&scanhash_allium_8way;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;

View File

@@ -5,7 +5,6 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
@@ -102,6 +101,7 @@ bool init_lyra2rev2_ctx();
//void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int lyra2z_16way_prehash ( struct work *work );
bool lyra2z_16way_thread_init();
#elif defined(LYRA2Z_8WAY)
@@ -110,6 +110,7 @@ bool lyra2z_16way_thread_init();
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();
int lyra2z_8way_prehash ( struct work *work );
#elif defined(LYRA2Z_4WAY)
@@ -165,11 +166,13 @@ bool register_allium_algo( algo_gate_t* gate );
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int allium_16way_prehash ( struct work *work );
#elif defined(ALLIUM_8WAY)
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int allium_8way_prehash ( struct work *work );
#else

View File

@@ -14,6 +14,44 @@ bool lyra2z_16way_thread_init()
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static uint32_t lyra2z_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
static __m512i lyra2z_16way_block0_hash[8] __attribute__ ((aligned (64)));
static __m512i lyra2z_16way_block_buf[16] __attribute__ ((aligned (64)));
// Centralized prehash for lyra2z-16way: serially hash the first 64-byte
// header block once, broadcast the midstate and the nonce-independent
// tail of the header across all 16 lanes, then run the shared round-0
// prehash. Returns 1 per the algo_gate prehash contract.
int lyra2z_16way_prehash ( struct work *work )
{
   // Blake-256 initial chaining values.
   uint32_t phash[8] __attribute__ ((aligned (32))) =
   {
      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
   };
   uint32_t *pdata = work->data;

   // Serial prehash of the first block (identical for every nonce).
   blake256_transform_le( phash, pdata, 512, 0 );

   // Broadcast the first-block hash into all 16 lanes.
   for ( int i = 0; i < 8; i++ )
      lyra2z_16way_block0_hash[i] = _mm512_set1_epi32( phash[i] );

   // Broadcast 12 of the last 16 header bytes, excluding the nonce;
   // block_buf[3] is filled with per-lane nonces by scanhash.
   for ( int i = 0; i < 3; i++ )
      lyra2z_16way_block_buf[i] = _mm512_set1_epi32( pdata[ 16 + i ] );

   // Partially prehash round 0 of the second block without the nonces.
   blake256_16way_round0_prehash_le( lyra2z_16way_midstate_vars,
                     lyra2z_16way_block0_hash, lyra2z_16way_block_buf );
   return 1;
}
static void lyra2z_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
@@ -91,11 +129,6 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (64))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -107,30 +140,18 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
pthread_rwlock_rdlock( &g_work_lock );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
memcpy( midstate_vars, lyra2z_16way_midstate_vars, sizeof midstate_vars );
memcpy( block0_hash, lyra2z_16way_block0_hash, sizeof block0_hash );
memcpy( block_buf, lyra2z_16way_block_buf, sizeof block_buf );
pthread_rwlock_unlock( &g_work_lock );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -157,6 +178,44 @@ bool lyra2z_8way_thread_init()
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static uint32_t lyra2z_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
static __m256i lyra2z_8way_block0_hash[8] __attribute__ ((aligned (64)));
static __m256i lyra2z_8way_block_buf[16] __attribute__ ((aligned (64)));
// One-time (per new work) prehash for 8-way lyra2z (AVX2 variant of
// lyra2z_16way_prehash): hash the first 64-byte header block with
// Blake-256 and partially prehash the second block, leaving the
// per-lane nonce word untouched.  Results go to the shared statics
// above for scanhash threads to copy under g_work_lock.  Returns 1.
int lyra2z_8way_prehash ( struct work *work )
{
// Blake-256 initial chaining values (IV).
uint32_t first_block_hash[8] __attribute__ ((aligned (32))) =
{
   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *hdr = work->data;
int i;
// Hash the first 64 header bytes, identical for every lane.
blake256_transform_le( first_block_hash, hdr, 512, 0 );
// Broadcast the first-block hash to all 8 lanes.
for ( i = 0; i < 8; i++ )
   lyra2z_8way_block0_hash[i] = _mm256_set1_epi32( first_block_hash[i] );
// Vectored second block: broadcast the 12 constant bytes of the last
// 16 header bytes; block_buf[3] (the nonce word) is set per thread.
for ( i = 0; i < 3; i++ )
   lyra2z_8way_block_buf[i] = _mm256_set1_epi32( hdr[16+i] );
// Partially prehash the second block without touching the nonces.
blake256_8way_round0_prehash_le( lyra2z_8way_midstate_vars,
                                 lyra2z_8way_block0_hash,
                                 lyra2z_8way_block_buf );
return 1;
}
static void lyra2z_8way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
@@ -201,11 +260,6 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
@@ -215,23 +269,14 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
pthread_rwlock_rdlock( &g_work_lock );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
memcpy( midstate_vars, lyra2z_8way_midstate_vars, sizeof midstate_vars );
memcpy( block0_hash, lyra2z_8way_block0_hash, sizeof block0_hash );
memcpy( block_buf, lyra2z_8way_block_buf, sizeof block_buf );
pthread_rwlock_unlock( &g_work_lock );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

View File

@@ -146,14 +146,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
__m128i t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
t = mm128_alignr_64( s7, s6, 1 ); \
s6 = mm128_alignr_64( s6, s7, 1 ); \
s7 = t; \
t = mm128_alignr_64( s2, s3, 1 ); \
s2 = mm128_alignr_64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 );
t = mm128_alignr_64( s6, s7, 1 ); \
s6 = mm128_alignr_64( s7, s6, 1 ); \
s7 = t; \
t = mm128_alignr_64( s3, s2, 1 ); \
s2 = mm128_alignr_64( s2, s3, 1 ); \
s3 = t; \
}
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -7,8 +7,16 @@
#if defined (SKEIN_8WAY)
static __thread skein512_8way_context skein512_8way_ctx
static skein512_8way_context skein512_8way_ctx
__attribute__ ((aligned (64)));
static uint32_t skein_8way_vdata[20*8] __attribute__ ((aligned (64)));
// Centralized per-work prehash for 8-way Skein, installed as
// gate->prehash: done once per new work instead of by every miner
// thread.  Results land in the shared statics above (skein_8way_vdata,
// skein512_8way_ctx) and are copied by scanhash_skein_8way under
// g_work_lock.  Always returns 1 (success).
int skein_8way_prehash( struct work *work )
{
// Byte-swap and interleave the 80-byte header into 8x64-bit lane form.
mm512_bswap32_intrlv80_8x64( skein_8way_vdata, work->data );
// Prehash the nonce-free leading 64 bytes (the _prehash64/_final16
// split); the final 16 bytes with nonces are hashed per iteration.
skein512_8way_prehash64( &skein512_8way_ctx, skein_8way_vdata );
return 1;
}
void skeinhash_8way( void *state, const void *input )
{
@@ -29,25 +37,27 @@ void skeinhash_8way( void *state, const void *input )
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (128)));
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash_d7 = &(hash[7*8]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
uint32_t vdata[20*8] __attribute__ ((aligned (128)));
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash_d7 = &(hash[7*8]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, skein_8way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
skein512_8way_prehash64( &skein512_8way_ctx, vdata );
do
{
skeinhash_8way( hash, vdata );
@@ -74,8 +84,16 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
#elif defined (SKEIN_4WAY)
static __thread skein512_4way_context skein512_4way_ctx
static skein512_4way_context skein512_4way_ctx
__attribute__ ((aligned (64)));
static uint32_t skein_4way_vdata[20*4] __attribute__ ((aligned (64)));
// Centralized per-work prehash for 4-way Skein (AVX2 counterpart of
// skein_8way_prehash), installed as gate->prehash.  Writes the shared
// statics above, which scanhash_skein_4way copies under g_work_lock.
// Always returns 1 (success).
int skein_4way_prehash( struct work *work )
{
// Byte-swap and interleave the 80-byte header into 4x64-bit lane form.
mm256_bswap32_intrlv80_4x64( skein_4way_vdata, work->data );
// Prehash the nonce-free leading 64 bytes; the last 16 bytes with
// nonces are hashed per scan iteration.
skein512_4way_prehash64( &skein512_4way_ctx, skein_4way_vdata );
return 1;
}
void skeinhash_4way( void *state, const void *input )
{
@@ -118,22 +136,23 @@ void skeinhash_4way( void *state, const void *input )
int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &skein512_4way_ctx, vdata );
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, skein_4way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );

View File

@@ -7,10 +7,12 @@ bool register_skein_algo( algo_gate_t* gate )
#if defined (SKEIN_8WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_skein_8way;
gate->prehash = (void*)&skein_8way_prehash;
gate->hash = (void*)&skeinhash_8way;
#elif defined (SKEIN_4WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->prehash = (void*)&skein_4way_prehash;
gate->hash = (void*)&skeinhash_4way;
#else
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
@@ -25,10 +27,12 @@ bool register_skein2_algo( algo_gate_t* gate )
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined (SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein2_8way;
gate->hash = (void*)&skein2hash_8way;
// gate->hash = (void*)&skein2hash_8way;
gate->prehash = (void*)&skein2_8way_prehash;
#elif defined (SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
// gate->hash = (void*)&skein2hash_4way;
gate->prehash = (void*)&skein2_4way_prehash;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;

View File

@@ -14,20 +14,24 @@
void skeinhash_8way( void *output, const void *input );
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int skein_8way_prehash( struct work * );
void skein2hash_8way( void *output, const void *input );
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
int skein2_8way_prehash( struct work * );
#elif defined(SKEIN_4WAY)
void skeinhash_4way( void *output, const void *input );
int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int skein_4way_prehash( struct work * );
void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
int skein2_4way_prehash( struct work * );
#else

View File

@@ -31,18 +31,19 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skeinhash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -5,9 +5,17 @@
#if defined(SKEIN_8WAY)
static __thread skein512_8way_context skein512_8way_ctx
__attribute__ ((aligned (64)));
static skein512_8way_context skein512_8way_ctx __attribute__ ((aligned (64)));
static uint32_t skein2_8way_vdata[20*8] __attribute__ ((aligned (64)));
// Centralized per-work prehash for 8-way skein2, installed as
// gate->prehash.  Fills the shared statics above (skein2_8way_vdata
// and skein512_8way_ctx), which scanhash_skein2_8way copies under
// g_work_lock.  Always returns 1 (success).
int skein2_8way_prehash( struct work *work )
{
// Byte-swap and interleave the 80-byte header into 8x64-bit lane form.
mm512_bswap32_intrlv80_8x64( skein2_8way_vdata, work->data );
// Prehash the nonce-free leading 64 bytes; scanhash finishes with
// skein512_8way_final16 on the remaining 16 bytes per nonce batch.
skein512_8way_prehash64( &skein512_8way_ctx, skein2_8way_vdata );
return 1;
}
/* not used
void skein2hash_8way( void *output, const void *input )
{
uint64_t hash[16*8] __attribute__ ((aligned (128)));
@@ -17,6 +25,7 @@ void skein2hash_8way( void *output, const void *input )
skein512_8way_final16( &ctx, hash, input + (64*8) );
skein512_8way_full( &ctx, output, hash, 64 );
}
*/
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -36,11 +45,14 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
skein512_8way_context ctx;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, skein2_8way_vdata, sizeof vdata );
memcpy( &ctx, &skein512_8way_ctx, sizeof ctx );
pthread_rwlock_unlock( &g_work_lock );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
skein512_8way_prehash64( &ctx, vdata );
do
{
skein512_8way_final16( &ctx, hash, vdata + (16*8) );
@@ -68,9 +80,17 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
#elif defined(SKEIN_4WAY)
static __thread skein512_4way_context skein512_4way_ctx
__attribute__ ((aligned (64)));
static skein512_4way_context skein512_4way_ctx __attribute__ ((aligned (64)));
static uint32_t skein2_4way_vdata[20*4] __attribute__ ((aligned (64)));
// Centralized per-work prehash for 4-way skein2 (AVX2 counterpart of
// skein2_8way_prehash), installed as gate->prehash.  Fills the shared
// statics above, copied by scanhash_skein2_4way under g_work_lock.
// Always returns 1 (success).
int skein2_4way_prehash( struct work *work )
{
// Byte-swap and interleave the 80-byte header into 4x64-bit lane form.
mm256_bswap32_intrlv80_4x64( skein2_4way_vdata, work->data );
// Prehash the nonce-free leading 64 bytes; scanhash finishes with
// skein512_4way_final16 on the remaining 16 bytes per nonce batch.
skein512_4way_prehash64( &skein512_4way_ctx, skein2_4way_vdata );
return 1;
}
/* not used
void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
@@ -80,6 +100,7 @@ void skein2hash_4way( void *output, const void *input )
skein512_4way_final16( &ctx, hash, input + (64*4) );
skein512_4way_full( &ctx, output, hash, 64 );
}
*/
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -99,8 +120,11 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
skein512_4way_context ctx;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &ctx, vdata );
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, skein2_4way_vdata, sizeof vdata );
memcpy( &ctx, &skein512_4way_ctx, sizeof ctx );
pthread_rwlock_unlock( &g_work_lock );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do

View File

@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
sph_skein512_close(&ctx_skein, hash);
memcpy(output, hash, 32);
}
int scanhash_skein2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skein2hash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -25,7 +25,7 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
static __thread x16r_context_overlay hex_ctx;
int hex_hash( void* output, const void* input, int thrid )
int hex_hash( void* output, const void* input, const int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;

View File

@@ -72,7 +72,7 @@ struct TortureGarden
// Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index
static int get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo, int thr_id )
unsigned int algo, const int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
@@ -233,7 +233,7 @@ bool initialize_torture_garden()
}
// Produce a 32-byte hash from 80-byte input data
int minotaur_hash( void *output, const void *input, int thr_id )
int minotaur_hash( void *output, const void *input, const int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;

View File

@@ -19,7 +19,7 @@
// Perform midstate prehash of hash functions with block size <= 72 bytes,
// 76 bytes for hash functions that operate on 32 bit data.
void x16r_8way_prehash( void *vdata, void *pdata )
void x16r_8way_do_prehash( void *vdata, const void *pdata )
{
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
@@ -106,11 +106,18 @@ void x16r_8way_prehash( void *vdata, void *pdata )
}
}
// Gate prehash hook for the x16r family, 8-way: derive this work's
// hash-function order, then midstate-prehash the header into the
// shared vector buffer.  Done once per new work instead of by every
// miner thread; scanhash copies x16r_8way_vdata under g_work_lock.
// Always returns 1 (success).
int x16r_8way_prehash( struct work *work )
{
// Fill x16r_hash_order via the algo-specific callback
// (x16r/x16s/x16rt each install their own order function).
x16r_gate_get_hash_order( work, x16r_hash_order );
// Midstate prehash of the 80-byte header into x16r_8way_vdata.
x16r_8way_do_prehash( x16r_8way_vdata, work->data );
return 1;
}
// Perform the full x16r hash and returns 512 bit intermediate hash.
// Called by wrapper hash function to optionally continue hashing and
// convert to final hash.
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
int x16r_8way_hash_generic( void* output, const void* input, const int thrid )
{
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (16)));
@@ -471,7 +478,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
// x16-r,-s,-rt wrapper called directly by scanhash to repackage 512 bit
// hash to 256 bit final hash.
int x16r_8way_hash( void* output, const void* input, int thrid )
int x16r_8way_hash( void* output, const void* input, const int thrid )
{
uint8_t hash[64*8] __attribute__ ((aligned (128)));
if ( !x16r_8way_hash_generic( hash, input, thrid ) )
@@ -495,7 +502,6 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -508,27 +514,16 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, x16r_8way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_8way_prehash( vdata, pdata );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x16r_8way_hash( hash, vdata, thr_id ) );
if( algo_gate.hash( hash, vdata, thr_id ) );
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -546,7 +541,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
#elif defined (X16R_4WAY)
void x16r_4way_prehash( void *vdata, void *pdata )
void x16r_4way_do_prehash( void *vdata, const void *pdata )
{
uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
@@ -627,7 +622,14 @@ void x16r_4way_prehash( void *vdata, void *pdata )
}
}
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
// Gate prehash hook for the x16r family, 4-way (AVX2 counterpart of
// x16r_8way_prehash): derive this work's hash-function order, then
// midstate-prehash the header into the shared vector buffer, once per
// new work.  scanhash copies x16r_4way_vdata under g_work_lock.
// Always returns 1 (success).
int x16r_4way_prehash( struct work *work )
{
// Fill x16r_hash_order via the algo-specific callback.
x16r_gate_get_hash_order( work, x16r_hash_order );
// Midstate prehash of the 80-byte header into x16r_4way_vdata.
x16r_4way_do_prehash( x16r_4way_vdata, work->data );
return 1;
}
int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
{
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -635,13 +637,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
uint32_t hash2[20] __attribute__ ((aligned (32)));
uint32_t hash3[20] __attribute__ ((aligned (32)));
x16r_4way_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
int size = 80;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
for ( int i = 0; i < 16; i++ )
@@ -905,7 +908,7 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
return 1;
}
int x16r_4way_hash( void* output, const void* input, int thrid )
int x16r_4way_hash( void* output, const void* input, const int thrid )
{
uint8_t hash[64*4] __attribute__ ((aligned (64)));
if ( !x16r_4way_hash_generic( hash, input, thrid ) )
@@ -924,7 +927,6 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -937,25 +939,15 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, x16r_4way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_4way_prehash( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_4way_hash( hash, vdata, thr_id ) );
if ( algo_gate.hash( hash, vdata, thr_id ) );
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -1,26 +1,44 @@
#include "x16r-gate.h"
#include "algo/sha/sha256d.h"
__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };
char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = {0};
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;
void (*x16r_gate_get_hash_order) ( const struct work *, char * ) = NULL;
#if defined (X16R_8WAY)
__thread x16r_8way_context_overlay x16r_ctx;
x16r_8way_context_overlay x16r_ctx;
uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
#elif defined (X16R_4WAY)
__thread x16r_4way_context_overlay x16r_ctx;
x16r_4way_context_overlay x16r_ctx;
uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
#endif
__thread x16r_context_overlay x16_ctx;
#if defined (X16RV2_8WAY)
x16rv2_8way_context_overlay x16rv2_ctx;
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
#elif defined (X16RV2_4WAY)
x16rv2_4way_context_overlay x16rv2_ctx;
#endif
x16r_context_overlay x16_ctx;
uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
void x16r_get_hash_order( const struct work *work, char *hash_order )
{
char *sptr = output;
char *sptr = hash_order;
const uint32_t *pdata = work->data;
uint8_t prevblock[16];
((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
{
uint8_t b = (15 - j) >> 1; // 16 first ascii hex chars (lsb in uint256)
@@ -32,38 +50,51 @@ void x16r_getAlgoString( const uint8_t* prevblock, char *output )
sptr++;
}
*sptr = '\0';
if ( !opt_quiet )
applog( LOG_INFO, "Hash order %s", x16r_hash_order );
}
void x16s_getAlgoString( const uint8_t* prevblock, char *output )
void x16s_get_hash_order( const struct work *work, char *hash_order )
{
strcpy( output, "0123456789ABCDEF" );
const uint32_t *pdata = work->data;
uint8_t prevblock[16];
((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
strcpy( hash_order, "0123456789ABCDEF" );
for ( int i = 0; i < 16; i++ )
{
uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (i & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
int offset = algoDigit;
// insert the nth character at the front
char oldVal = output[offset];
char oldVal = hash_order[ offset ];
for( int j = offset; j-- > 0; )
output[j+1] = output[j];
output[0] = oldVal;
hash_order[ j+1 ] = hash_order[ j ];
hash_order[ 0 ] = oldVal;
}
if ( !opt_quiet )
applog( LOG_INFO, "Hash order %s", x16r_hash_order );
}
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->prehash = (void*)&x16r_8way_prehash;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->prehash = (void*)&x16r_4way_prehash;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->prehash = (void*)&x16r_prehash;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
opt_target_factor = 256.0;
return true;
};
@@ -71,17 +102,20 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16RV2_8WAY)
gate->scanhash = (void*)&scanhash_x16rv2_8way;
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->prehash = (void*)&x16rv2_8way_prehash;
gate->hash = (void*)&x16rv2_8way_hash;
#elif defined (X16RV2_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->prehash = (void*)&x16rv2_4way_prehash;
gate->hash = (void*)&x16rv2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rv2;
gate->scanhash = (void*)&scanhash_x16r;
gate->prehash = (void*)&x16rv2_prehash;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
opt_target_factor = 256.0;
return true;
};
@@ -90,16 +124,19 @@ bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->prehash = (void*)&x16r_8way_prehash;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->prehash = (void*)&x16r_4way_prehash;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->prehash = (void*)&x16r_prehash;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
opt_target_factor = 256.0;
return true;
};
@@ -108,30 +145,33 @@ bool register_x16s_algo( algo_gate_t* gate )
//
// X16RT
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
void x16rt_get_hash_order( const struct work * work, char * hash_order )
{
int32_t maskedTime = timeStamp & 0xffffff80;
sha256d( (unsigned char*)timeHash, (const unsigned char*)( &maskedTime ),
sizeof( maskedTime ) );
}
uint32_t _ALIGN(64) timehash[8*8];
const uint32_t ntime = bswap_32( work->data[17] );
const int32_t masked_ntime = ntime & 0xffffff80;
uint8_t* data = (uint8_t*)timehash;
char *sptr = hash_order;
void x16rt_getAlgoString( const uint32_t *timeHash, char *output)
{
char *sptr = output;
uint8_t* data = (uint8_t*)timeHash;
sha256d( (unsigned char*)timehash, (const unsigned char*)( &masked_ntime ),
sizeof( masked_ntime ) );
for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) {
for ( uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
{
uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
if ( algoDigit >= 10 )
sprintf( sptr, "%c", 'A' + (algoDigit - 10) );
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sprintf( sptr, "%u", (uint32_t) algoDigit );
sptr++;
}
*sptr = '\0';
if ( !opt_quiet )
applog( LOG_INFO, "Hash order %s, ntime %08x, time hash %08x",
hash_order, ntime, timehash );
}
void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -222,15 +262,19 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
// Register the x16rt algo: x16r engine with a time-hash derived function
// order. Selects the widest SIMD implementation available at compile time.
// Note: the old scanhash_x16rt_* entry points were removed; the gate now
// uses the shared x16r scanhash plus a per-algo prehash and hash order.
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
  gate->scanhash = (void*)&scanhash_x16r_8way;
  gate->prehash  = (void*)&x16r_8way_prehash;
  gate->hash     = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
  gate->scanhash = (void*)&scanhash_x16r_4way;
  gate->prehash  = (void*)&x16r_4way_prehash;
  gate->hash     = (void*)&x16r_4way_hash;
#else
  gate->scanhash = (void*)&scanhash_x16r;
  gate->prehash  = (void*)&x16r_prehash;
  gate->hash     = (void*)&x16r_hash;
#endif
  // x16rt derives its hash order from the masked ntime, not prevhash.
  x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  opt_target_factor = 256.0;
  return true;
}
@@ -239,16 +283,20 @@ bool register_x16rt_algo( algo_gate_t* gate )
// Register the x16rt-veil algo: identical to x16rt except Veil requires a
// custom extra-header builder for its stratum jobs.
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
  gate->scanhash = (void*)&scanhash_x16r_8way;
  gate->prehash  = (void*)&x16r_8way_prehash;
  gate->hash     = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
  gate->scanhash = (void*)&scanhash_x16r_4way;
  gate->prehash  = (void*)&x16r_4way_prehash;
  gate->hash     = (void*)&x16r_4way_hash;
#else
  gate->scanhash = (void*)&scanhash_x16r;
  gate->prehash  = (void*)&x16r_prehash;
  gate->hash     = (void*)&x16r_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  // Same ntime-based hash order as plain x16rt.
  x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
  // Veil blocks carry extra header fields built from the stratum job.
  gate->build_extraheader = (void*)&veil_build_extraheader;
  opt_target_factor = 256.0;
  return true;
}
@@ -275,20 +323,23 @@ bool register_hex_algo( algo_gate_t* gate )
// Register the x21s algo: x16s hash order with five extra chained functions,
// requiring a per-thread init (lyra2 matrix allocation) in addition to the
// shared x16r scanhash/prehash infrastructure.
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
  gate->scanhash = (void*)&scanhash_x16r_8way;
  gate->prehash  = (void*)&x16r_8way_prehash;
  gate->hash     = (void*)&x21s_8way_hash;
  gate->miner_thread_init = (void*)&x21s_8way_thread_init;
#elif defined (X16R_4WAY)
  gate->scanhash = (void*)&scanhash_x16r_4way;
  gate->prehash  = (void*)&x16r_4way_prehash;
  gate->hash     = (void*)&x21s_4way_hash;
  gate->miner_thread_init = (void*)&x21s_4way_thread_init;
#else
  gate->scanhash = (void*)&scanhash_x16r;
  gate->prehash  = (void*)&x16r_prehash;
  gate->hash     = (void*)&x21s_hash;
  gate->miner_thread_init = (void*)&x21s_thread_init;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  // x21s reuses the x16s prevhash-based order.
  x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
  opt_target_factor = 256.0;
  return true;
}

View File

@@ -21,6 +21,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/tiger/sph_tiger.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -57,13 +58,11 @@
#define X16R_8WAY 1
#define X16RV2_8WAY 1
#define X16RT_8WAY 1
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RV2_4WAY 1
#define X16RT_4WAY 1
#define X21S_4WAY 1
#define X16R_4WAY 1
@@ -89,23 +88,29 @@ enum x16r_Algo {
X16R_HASH_FUNC_COUNT
};
extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
void x16r_getAlgoString( const uint8_t *prevblock, char *output );
void x16s_getAlgoString( const uint8_t *prevblock, char *output );
void x16rt_getAlgoString( const uint32_t *timeHash, char *output );
//extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
extern char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
extern void (*x16r_gate_get_hash_order) ( const struct work *, char * );
// x16r, x16rv2
void x16r_get_hash_order( const struct work *, char * );
// x16s, x21s
void x16s_get_hash_order( const struct work *, char * );
// x16rt
void x16rt_get_hash_order( const struct work *, char * );
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash );
bool register_x16r_algo( algo_gate_t* gate );
bool register_x16rv2_algo( algo_gate_t* gate );
bool register_x16s_algo( algo_gate_t* gate );
bool register_x16rt_algo( algo_gate_t* gate );
bool register_hex__algo( algo_gate_t* gate );
bool register_x21s__algo( algo_gate_t* gate );
bool register_hex_algo( algo_gate_t* gate );
bool register_x21s_algo( algo_gate_t* gate );
// x16r, x16s
// x16r, x16s, x16rt
#if defined(X16R_8WAY)
union _x16r_8way_context_overlay
@@ -136,15 +141,15 @@ union _x16r_8way_context_overlay
typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
extern __thread x16r_8way_context_overlay x16r_ctx;
extern x16r_8way_context_overlay x16r_ctx;
extern uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
void x16r_8way_prehash( void *, void * );
int x16r_8way_hash_generic( void *, const void *, int );
int x16r_8way_hash( void *, const void *, int );
void x16r_8way_do_prehash( void *, const void * );
int x16r_8way_prehash( struct work * );
int x16r_8way_hash_generic( void *, const void *, const int );
int x16r_8way_hash( void *, const void *, const int );
int scanhash_x16r_8way( struct work *, uint32_t ,
uint64_t *, struct thr_info * );
extern __thread x16r_8way_context_overlay x16r_ctx;
#elif defined(X16R_4WAY)
@@ -177,14 +182,15 @@ union _x16r_4way_context_overlay
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
extern __thread x16r_4way_context_overlay x16r_ctx;
extern x16r_4way_context_overlay x16r_ctx;
extern uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
void x16r_4way_prehash( void *, void * );
int x16r_4way_hash_generic( void *, const void *, int );
int x16r_4way_hash( void *, const void *, int );
void x16r_4way_do_prehash( void *, const void * );
int x16r_4way_prehash( struct work * );
int x16r_4way_hash_generic( void *, const void *, const int );
int x16r_4way_hash( void *, const void *, const int );
int scanhash_x16r_4way( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_4way_context_overlay x16r_ctx;
#endif
@@ -217,80 +223,113 @@ union _x16r_context_overlay
typedef union _x16r_context_overlay x16r_context_overlay;
extern __thread x16r_context_overlay x16_ctx;
extern x16r_context_overlay x16_ctx;
extern uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
void x16r_prehash( void *, void * );
int x16r_hash_generic( void *, const void *, int );
int x16r_hash( void *, const void *, int );
void x16r_do_prehash( const void * );
int x16r_prehash( const struct work * );
int x16r_hash_generic( void *, const void *, const int );
int x16r_hash( void *, const void *, const int );
int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );
// x16Rv2
#if defined(X16RV2_8WAY)
int x16rv2_8way_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
union _x16rv2_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cubehashParam cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sph_tiger_context tiger;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
echo_4way_context echo;
#else
hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
#endif
} __attribute__ ((aligned (64)));
typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
extern x16rv2_8way_context_overlay x16rv2_ctx;
int x16rv2_8way_prehash( struct work * );
int x16rv2_8way_hash( void *state, const void *input, const int thrid );
//int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RV2_4WAY)
int x16rv2_4way_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
union _x16rv2_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
echo_2way_context echo;
#else
int x16rv2_hash( void *state, const void *input, int thr_id );
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sph_tiger_context tiger;
};
// x16rt, veil
#if defined(X16R_8WAY)
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
extern x16rv2_4way_context_overlay x16rv2_ctx;
//void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x16rv2_4way_hash( void *state, const void *input, const int thrid );
int x16rv2_4way_prehash( struct work * );
#else
//void x16rt_hash( void *state, const void *input );
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x16rv2_hash( void *state, const void *input, const int thr_id );
int x16rv2_prehash( const struct work * );
#endif
// x21s
#if defined(X16R_8WAY)
int x21s_8way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x21s_8way_hash( void *state, const void *input, const int thrid );
bool x21s_8way_thread_init();
#elif defined(X16R_4WAY)
int x21s_4way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x21s_4way_hash( void *state, const void *input, const int thrid );
bool x21s_4way_thread_init();
#else
int x21s_hash( void *state, const void *input, int thr_id );
int scanhash_x21s( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x21s_hash( void *state, const void *input, const int thr_id );
bool x21s_thread_init();
#endif
//void hex_hash( void *state, const void *input );
int scanhash_hex( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -10,7 +10,7 @@
#include <stdlib.h>
#include <string.h>
void x16r_prehash( void *edata, void *pdata )
void x16r_do_prehash( const void *edata )
{
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
@@ -48,7 +48,7 @@ void x16r_prehash( void *edata, void *pdata )
}
}
int x16r_hash_generic( void* output, const void* input, int thrid )
int x16r_hash_generic( void* output, const void* input, const int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;
@@ -192,7 +192,15 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
return true;
}
int x16r_hash( void* output, const void* input, int thrid )
// Centralized prehash for scalar x16r-family algos, run once per new work:
// refresh the hash function order, byte-swap the 80-byte header into the
// shared x16r_edata buffer, then prime the midstate for the first function.
int x16r_prehash( const struct work *work )
{
   // Order derivation and header byte-swap are independent of each other;
   // both must complete before the midstate prehash runs.
   x16r_gate_get_hash_order( work, x16r_hash_order );
   mm128_bswap32_80( x16r_edata, work->data );
   x16r_do_prehash( x16r_edata );
   return 1;
}
int x16r_hash( void* output, const void* input, const int thrid )
{
uint8_t hash[64] __attribute__ ((aligned (64)));
if ( !x16r_hash_generic( hash, input, thrid ) )
@@ -205,8 +213,8 @@ int x16r_hash( void* output, const void* input, int thrid )
int scanhash_x16r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) edata[20];
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -216,24 +224,14 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
mm128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_prehash( edata, pdata );
pthread_rwlock_rdlock( &g_work_lock );
memcpy( edata, x16r_edata, sizeof edata );
pthread_rwlock_unlock( &g_work_lock );
do
{
edata[19] = nonce;
if ( x16r_hash( hash32, edata, thr_id ) )
if ( algo_gate.hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );

View File

@@ -1,113 +0,0 @@
#include "x16r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined (X16R_8WAY)
// Scan a nonce range 8 lanes at a time (AVX-512). The x16rt hash order is
// derived from sha256d of the masked ntime, so it is recomputed only when
// the masked ntime changes (cached per thread in s_ntime).
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr)
{
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) timeHash[8*8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
   const int thr_id = mythr->id;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   const bool bench = opt_benchmark;
   if ( bench ) ptarget[7] = 0x0cff;

   static __thread uint32_t s_ntime = UINT32_MAX;
   // Low 7 bits of ntime are masked off so the order is stable for ~128 s.
   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
   if ( s_ntime != masked_ntime )
   {
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
      if ( !thr_id )
          // %08x needs an unsigned int: log the first word of the time hash,
          // not the array (passing the pointer is undefined behavior).
          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
                  x16r_hash_order, bswap_32( pdata[17] ), timeHash[0] );
   }

   x16r_8way_prehash( vdata, pdata );
   // Seed the 8 interleaved nonce lanes.
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
      if ( x16r_8way_hash( hash, vdata, thr_id ) )
         for ( int i = 0; i < 8; i++ )
         if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
         {
            pdata[19] = bswap_32( n+i );
            submit_solution( work, hash+(i<<3), mythr );
         }
      // Bump every lane's nonce by 8.
      *noncev = _mm512_add_epi32( *noncev,
                                  m512_const1_64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}
#elif defined (X16R_4WAY)
// Scan a nonce range 4 lanes at a time (AVX2). Hash order caching is the
// same scheme as the 8-way version: keyed on the masked ntime.
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr)
{
   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) timeHash[4*8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   const bool bench = opt_benchmark;
   if ( bench ) ptarget[7] = 0x0cff;

   static __thread uint32_t s_ntime = UINT32_MAX;
   // Hash order only changes when the masked (128 s granularity) ntime does.
   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
   if ( s_ntime != masked_ntime )
   {
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
      if ( !thr_id )
          // Log the first time-hash word; the bare array was previously
          // passed to %08x, which is a format-argument mismatch (UB).
          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
                  x16r_hash_order, bswap_32( pdata[17] ), timeHash[0] );
   }

   x16r_4way_prehash( vdata, pdata );
   // Seed the 4 interleaved nonce lanes.
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
      if ( x16r_4way_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 4; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n+i );
         submit_solution( work, hash+(i<<3), mythr );
      }
      // Bump every lane's nonce by 4.
      *noncev = _mm256_add_epi32( *noncev,
                                  m256_const1_64( 0x0000000400000000 ) );
      n += 4;
   } while ( ( n < last_nonce ) && !(*restart) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}
#endif

View File

@@ -1,53 +0,0 @@
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
// Scalar (one nonce at a time) x16rt scan. Recomputes the hash order only
// when the masked ntime changes, then iterates nonces until the range is
// exhausted or a restart is requested.
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(128) hash32[8];
   uint32_t _ALIGN(128) edata[20];
   uint32_t _ALIGN(64) timeHash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const int thr_id = mythr->id;
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   const bool bench = opt_benchmark;
   if ( bench ) ptarget[7] = 0x0cff;

   // Header is hashed big-endian; swap once up front.
   mm128_bswap32_80( edata, pdata );

   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
   if ( s_ntime != masked_ntime )
   {
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
      if ( opt_debug && !thr_id )
         // %08x takes an unsigned int: print the first word of the time
         // hash rather than the decayed array pointer (UB).
         applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
                 x16r_hash_order, swab32( pdata[17] ), timeHash[0] );
   }

   x16r_prehash( edata, pdata );
   do
   {
      edata[19] = nonce;
      if ( x16r_hash( hash32, edata, thr_id ) )
      if ( valid_hash( hash32, ptarget ) && !bench )
      {
         pdata[19] = bswap_32( nonce );
         submit_solution( work, hash32, mythr );
      }
      nonce++;
   } while ( nonce < max_nonce && !(*restart) );
   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce;
   return 0;
}
#endif // !defined(X16R_8WAY) && !defined(X16R_4WAY)

View File

@@ -12,37 +12,73 @@
#if defined (X16RV2_8WAY)
union _x16rv2_8way_context_overlay
void x16rv2_8way_do_prehash( void *vdata, void *pdata )
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cubehashParam cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sph_tiger_context tiger;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
echo_4way_context echo;
#else
hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
#endif
} __attribute__ ((aligned (64)));
uint32_t vdata32[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
static __thread x16rv2_8way_context_overlay x16rv2_ctx;
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
int x16rv2_8way_hash( void* output, const void* input, int thrid )
switch ( algo )
{
case JH:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16rv2_ctx.jh );
jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
case SHA_512:
mm128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SKEIN:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16rv2_ctx.skein );
skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
break;
case CUBEHASH:
mm128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata32, pdata );
shabal512_8way_init( &x16rv2_ctx.shabal );
shabal512_8way_update( &x16rv2_ctx.shabal, vdata32, 64 );
rintrlv_8x32_8x64( vdata, vdata32, 640 );
break;
case WHIRLPOOL:
mm128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16rv2_ctx.whirlpool );
sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
default:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
}
}
// Gate-level prehash for 8-way x16rv2, run once per new work: refresh the
// hash function order, then build the shared interleaved midstate in
// x16r_8way_vdata from the raw block header. Returns 1 on success.
int x16rv2_8way_prehash( struct work *work )
{
   // Order must be current first: do_prehash dispatches on its first entry.
   x16r_gate_get_hash_order( work, x16r_hash_order );
   x16rv2_8way_do_prehash( x16r_8way_vdata, work->data );
   return 1;
}
int x16rv2_8way_hash( void* output, const void* input, const int thrid )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (32)));
@@ -557,50 +593,28 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
return 1;
}
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
#elif defined (X16RV2_4WAY)
// Pad the 24 bytes tiger hash to 64 bytes
inline void padtiger512( uint32_t* hash )
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}
void x16rv2_4way_do_prehash( void *vdata, void *pdata )
{
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16rv2_ctx.jh );
jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16rv2_ctx.jh );
jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
@@ -608,100 +622,45 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
mm128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SKEIN:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16rv2_ctx.skein );
skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
break;
case CUBEHASH:
mm128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
shabal512_8way_init( &x16rv2_ctx.shabal );
shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 );
rintrlv_8x32_8x64( vdata, vdata2, 640 );
mm128_bswap32_intrlv80_4x32( vdata32, pdata );
shabal512_4way_init( &x16rv2_ctx.shabal );
shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
rintrlv_4x32_4x64( vdata, vdata32, 640 );
break;
case WHIRLPOOL:
mm128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16rv2_ctx.whirlpool );
sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
default:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
}
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16rv2_8way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (X16RV2_4WAY)
union _x16rv2_4way_context_overlay
int x16rv2_4way_prehash( struct work *work )
{
blake512_4way_context blake;
bmw512_4way_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
echo_2way_context echo;
#else
hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sph_tiger_context tiger;
};
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
static __thread x16rv2_4way_context_overlay x16rv2_ctx;
// Pad the 24 bytes tiger hash to 64 bytes
inline void padtiger512( uint32_t* hash )
{
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
x16r_gate_get_hash_order( work, x16r_hash_order );
x16rv2_4way_do_prehash( x16r_4way_vdata, work->data );
return 1;
}
int x16rv2_4way_hash( void* output, const void* input, int thrid )
@@ -1048,107 +1007,4 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
return 1;
}
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20];
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0fff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16rv2_ctx.jh );
jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
case SHA_512:
mm128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
break;
case CUBEHASH:
mm128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
break;
case SHABAL:
mm128_bswap32_intrlv80_4x32( vdata32, pdata );
shabal512_4way_init( &x16rv2_ctx.shabal );
shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
rintrlv_4x32_4x64( vdata, vdata32, 640 );
break;
case WHIRLPOOL:
mm128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16rv2_ctx.whirlpool );
sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
default:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
}
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16rv2_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -43,9 +43,16 @@ inline void padtiger512(uint32_t* hash) {
for (int i = (24/4); i < (64/4); i++) hash[i] = 0;
}
int x16rv2_hash( void* output, const void* input, int thrid )
// Scalar x16rv2 does no midstate prehash; only the hash function order is
// refreshed for the new work. (Removed an unused local hash buffer.)
int x16rv2_prehash( const struct work *work )
{
   x16r_gate_get_hash_order( work, x16r_hash_order );
   return 1;
}
int x16rv2_hash( void* output, const void* input, const int thrid )
{
uint32_t _ALIGN(32) hash[16];
x16rv2_context_overlay ctx;
void *in = (void*) input;
int size = 80;
@@ -170,52 +177,4 @@ int x16rv2_hash( void* output, const void* input, int thrid )
return 1;
}
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int thr_id = mythr->id;
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)",
x16r_hash_order, ntime );
}
if ( bench ) ptarget[7] = 0x0cff;
do
{
edata[19] = nonce;
if ( x16rv2_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
#endif

View File

@@ -30,7 +30,7 @@ union _x21s_8way_context_overlay
typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;
int x21s_8way_hash( void* output, const void* input, int thrid )
int x21s_8way_hash( void* output, const void* input, const int thrid )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint8_t shash[64*8] __attribute__ ((aligned (64)));
@@ -129,66 +129,6 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
return 1;
}
// Scan a range of nonces for x21s, 8 lanes in parallel (AVX-512).
// The x16r-style hash order is derived from the previous block hash
// (header words 1..2, byte-swapped) and cached per thread, keyed on ntime.
// Returns 0; found solutions are submitted via submit_solution().
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
// hash7 points at the interleaved 8th hash word of each lane, used as a
// cheap pre-filter before extracting a full lane hash.
uint32_t *hash7 = &hash[7<<3];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
// Stop 16 short of max_nonce so a full 8-lane batch never overruns.
const uint32_t last_nonce = max_nonce - 16;
const int thr_id = mythr->id;
// Nonces live in header word 9 of the 8-way interleaved data.
__m512i *noncev = (__m512i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
// Recompute the hash order only when ntime changes (i.e. new block).
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
// Interleave the header and run the shared first-function prehash.
x16r_8way_prehash( vdata, pdata );
// Seed lane nonces n..n+7 into the interleaved data.
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x21s_8way_hash( hash, vdata, thr_id ) )
for ( int lane = 0; lane < 8; lane++ )
// Quick reject on the high hash word before the full target check.
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
// Advance all 8 lane nonces by 8.
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
bool x21s_8way_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
@@ -215,7 +155,7 @@ union _x21s_4way_context_overlay
typedef union _x21s_4way_context_overlay x21s_4way_context_overlay;
int x21s_4way_hash( void* output, const void* input, int thrid )
int x21s_4way_hash( void* output, const void* input, const int thrid )
{
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
uint8_t shash[64*4] __attribute__ ((aligned (64)));
@@ -291,58 +231,6 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
return 1;
}
// Scan a range of nonces for x21s, 4 lanes in parallel (AVX2).
// The x16r-style hash order is derived from the previous block hash
// (header words 1..2, byte-swapped) and cached per thread, keyed on ntime.
// Returns 0; found solutions are submitted via submit_solution().
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
// Stop 4 short of max_nonce so a full 4-lane batch never overruns.
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
// Nonces live in header word 9 of the 4-way interleaved data.
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
// Recompute the hash order only when ntime changes (i.e. new block).
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
}
// Interleave the header and run the shared first-function prehash.
x16r_4way_prehash( vdata, pdata );
// Seed lane nonces n..n+3 into the interleaved data.
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x21s_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
// Lane hashes are stored contiguously, 8 words per lane.
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
// Advance all 4 lane nonces by 4.
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
bool x21s_4way_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols

View File

@@ -27,7 +27,7 @@ union _x21s_context_overlay
};
typedef union _x21s_context_overlay x21s_context_overlay;
int x21s_hash( void* output, const void* input, int thrid )
int x21s_hash( void* output, const void* input, const int thrid )
{
uint32_t _ALIGN(128) hash[16];
x21s_context_overlay ctx;
@@ -57,50 +57,6 @@ int x21s_hash( void* output, const void* input, int thrid )
return 1;
}
// Scan a range of nonces for x21s (scalar fallback path).
// The per-block hash order is derived from the previous block hash
// (big-endian header word edata[1]) and only changes with a new block,
// so it is cached per thread and recomputed only when the header's ntime
// word changes.
// Returns 0; found solutions are submitted via submit_solution().
int scanhash_x21s( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(128) hash32[8];
   uint32_t _ALIGN(128) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const int thr_id = mythr->id;
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   const bool bench = opt_benchmark;
   if ( bench ) ptarget[7] = 0x0cff;

   // Byte-swap the 80 byte block header into a big-endian working copy.
   mm128_bswap32_80( edata, pdata );

   static __thread uint32_t s_ntime = UINT32_MAX;
   if ( s_ntime != pdata[17] )
   {
      x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
      // Bug fix: store the raw header word so it matches the comparison
      // above, consistent with the 4way/8way variants. Previously the
      // byte-swapped value was stored, so the guard never held and the
      // hash order was recomputed every call.
      s_ntime = pdata[17];
      if ( opt_debug && !thr_id )
         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order,
                 swab32( pdata[17] ) );
   }

   // Run the shared first-function prehash once per work update.
   x16r_prehash( edata, pdata );

   do
   {
      edata[19] = nonce;
      if ( x21s_hash( hash32, edata, thr_id ) )
      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( nonce );
         submit_solution( work, hash32, mythr );
      }
      nonce++;
   } while ( nonce < max_nonce && !(*restart) );
   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce;
   return 0;
}
bool x21s_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols

View File

@@ -31,8 +31,26 @@
yespower_params_t yespower_params;
// master g_work
sha256_context yespower_sha256_prehash_ctx;
uint32_t _ALIGN(64) yespower_endiandata[20];
// local work
__thread sha256_context sha256_prehash_ctx;
// Centralized midstate prehash for yespower-family algos, installed as
// algo_gate.prehash and run once per new work instead of by every miner
// thread. Converts the first 19 header words to big-endian into the shared
// yespower_endiandata buffer (word 19, the nonce, is filled per thread) and
// hashes the first 64 bytes into yespower_sha256_prehash_ctx. Miner threads
// then memcpy both into thread-local copies.
// NOTE(review): writes shared globals with no locking of its own — assumes
// the caller (stratum_gen_work / miner_thread) serializes it against the
// readers; confirm at call sites.
// Always returns 1 (success), matching the algo_gate prehash contract.
int yespower_sha256_prehash( struct work *work )
{
uint32_t *pdata = work->data;
for ( int k = 0; k < 19; k++ )
be32enc( &yespower_endiandata[k], pdata[k] );
sha256_ctx_init( &yespower_sha256_prehash_ctx );
// Only the first 64 bytes (16 words) of the 80 byte header go into the
// midstate; the remainder is hashed per nonce by the worker threads.
sha256_update( &yespower_sha256_prehash_ctx, yespower_endiandata, 64 );
return 1;
}
// YESPOWER
int yespower_hash( const char *input, char *output, uint32_t len, int thrid )
@@ -53,13 +71,14 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
endiandata[19] = n;
// pthread_rwlock_rdlock( &g_work_lock );
// do sha256 prehash
sha256_ctx_init( &sha256_prehash_ctx );
sha256_update( &sha256_prehash_ctx, endiandata, 64 );
memcpy( endiandata, yespower_endiandata, sizeof endiandata );
memcpy( &sha256_prehash_ctx, &yespower_sha256_prehash_ctx, sizeof sha256_prehash_ctx );
// pthread_rwlock_unlock( &g_work_lock );
endiandata[19] = n;
do {
if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) )
@@ -140,6 +159,7 @@ bool register_yespower_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
gate->hash = (void*)&yespower_hash;
opt_target_factor = 65536.0;
return true;
@@ -154,6 +174,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
yespower_params.perslen = 0;
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
gate->hash = (void*)&yespower_hash;
opt_target_factor = 65536.0;
return true;
@@ -165,6 +186,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
yespower_params.version = YESPOWER_0_5;
opt_target_factor = 65536.0;
@@ -198,6 +220,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 2048;
yespower_params.r = 8;
@@ -211,6 +234,7 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 4096;
yespower_params.r = 16;
@@ -224,6 +248,7 @@ bool register_yescryptr32_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 4096;
yespower_params.r = 32;

View File

@@ -80,6 +80,8 @@ extern yespower_params_t yespower_params;
extern __thread sha256_context sha256_prehash_ctx;
int yespower_sha256_prehash( struct work *work );
/**
* yespower_init_local(local):
* Initialize the thread-local (RAM) data structure. Actual memory allocation

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.2.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.3.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.21.2'
PACKAGE_STRING='cpuminer-opt 3.21.2'
PACKAGE_VERSION='3.21.3'
PACKAGE_STRING='cpuminer-opt 3.21.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.21.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.21.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.21.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.21.3:";;
esac
cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.21.2
cpuminer-opt configure 3.21.3
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.21.2, which was
It was created by cpuminer-opt $as_me 3.21.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.21.2'
VERSION='3.21.3'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.21.2, which was
This file was extended by cpuminer-opt $as_me 3.21.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 3.21.2
cpuminer-opt config.status 3.21.3
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.21.2])
AC_INIT([cpuminer-opt], [3.21.3])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -37,6 +37,7 @@
#include <curl/curl.h>
#include <jansson.h>
#include <openssl/sha.h>
#include <mm_malloc.h>
#include "sysinfos.c"
#include "algo/sha/sha256d.h"
@@ -317,8 +318,9 @@ static void affine_to_cpu( struct thr_info *thr )
if ( !ok )
{
last_error = GetLastError();
applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
thread, last_error );
if ( !thread )
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
last_error, thread );
}
}
@@ -1725,9 +1727,9 @@ static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
struct work *ret_work;
int failures = 0;
ret_work = (struct work*) calloc( 1, sizeof(*ret_work) );
if ( !ret_work )
return false;
ret_work = (struct work*) _mm_malloc( sizeof(*ret_work), 32 );
if ( !ret_work ) return false;
memset( ret_work, 0, sizeof(*ret_work) );
/* obtain new work from bitcoin via JSON-RPC */
while ( !get_upstream_work( curl, ret_work ) )
@@ -1736,22 +1738,23 @@ static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
{
applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
free( ret_work );
return false;
return false;
}
/* pause, then restart work-request loop */
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
opt_fail_pause );
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
opt_fail_pause );
sleep( opt_fail_pause );
}
/* send work to requesting thread */
if ( !tq_push(wc->thr->q, ret_work ) )
free( ret_work );
free( ret_work );
return true;
}
static bool workio_submit_work(struct workio_cmd *wc, CURL *curl)
{
int failures = 0;
@@ -1970,15 +1973,15 @@ static bool wanna_mine(int thr_id)
float temp = cpu_temp(0);
if (temp > opt_max_temp)
{
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp);
state = false;
if ( !thr_id && !conditional_state[thr_id] && !opt_quiet )
applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
state = false;
}
}
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
{
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
applog(LOG_INFO, "network diff too high, waiting...");
applog(LOG_NOTICE, "network diff too high, waiting...");
state = false;
}
if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate)
@@ -1987,12 +1990,14 @@ static bool wanna_mine(int thr_id)
{
char rate[32];
format_hashrate(opt_max_rate, rate);
applog(LOG_INFO, "network hashrate too high, waiting %s...", rate);
applog(LOG_NOTICE, "network hashrate too high (%s), waiting...", rate);
}
state = false;
}
if (thr_id < MAX_CPUS)
conditional_state[thr_id] = (uint8_t) !state;
if ( conditional_state[thr_id] && state && !thr_id && !opt_quiet )
applog(LOG_NOTICE, "...resuming" );
conditional_state[thr_id] = (uint8_t) !state;
return state;
}
@@ -2117,6 +2122,10 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
t++ );
g_work_time = time(NULL);
// Do midstate prehash
algo_gate.prehash( g_work );
restart_threads();
pthread_mutex_unlock( &sctx->work_lock );
@@ -2140,7 +2149,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
else if ( g_work->job_id && new_job )
applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
else if ( !opt_quiet )
else if ( opt_debug )
{
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
g_work->xnonce2_len );
@@ -2335,6 +2344,9 @@ static void *miner_thread( void *userdata )
goto out;
}
g_work_time = time(NULL);
// do midstate prehash
algo_gate.prehash( &g_work );
restart_threads();
}
@@ -2354,6 +2366,14 @@ static void *miner_thread( void *userdata )
if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
continue;
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
{
restart_threads();
sleep(5);
continue;
}
// opt_scantime expressed in hashes
max64 = opt_scantime * thr_hashrates[thr_id];
@@ -2500,14 +2520,6 @@ static void *miner_thread( void *userdata )
}
}
} // benchmark
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
{
sleep(5);
continue;
}
} // miner_thread loop
out:
@@ -3682,7 +3694,7 @@ int main(int argc, char *argv[])
#if defined(WIN32)
// Are Windows CPU Groups supported?
// Get the number of cpus, display after parsing command line
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
num_cpus = 0;
num_cpugroups = GetActiveProcessorGroupCount();
@@ -3691,8 +3703,8 @@ int main(int argc, char *argv[])
int cpus = GetActiveProcessorCount( i );
num_cpus += cpus;
if (opt_debug)
applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
// if (opt_debug)
// applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
}
#else
@@ -3861,6 +3873,11 @@ int main(int argc, char *argv[])
}
#endif
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( !opt_quiet )
applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
#endif
if ( opt_affinity && num_cpus > max_cpus )
{
applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
@@ -3872,7 +3889,7 @@ int main(int argc, char *argv[])
{
for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
{
while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;
while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
thread_affinity_map[ thr ] = cpu % num_cpus;
}
if ( !opt_quiet )

View File

@@ -392,8 +392,8 @@ float cpu_temp( int core );
struct work
{
uint32_t target[8] __attribute__ ((aligned (64)));
uint32_t data[48] __attribute__ ((aligned (64)));
uint32_t data[48] __attribute__ ((aligned (64)));
uint32_t target[8] __attribute__ ((aligned (32)));
double targetdiff;
double sharediff;
double stratum_diff;

View File

@@ -470,7 +470,7 @@ static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1,
#if defined(__SSSE3__)
static inline void mm128_bswap32_80( void *d, void *s )
static inline void mm128_bswap32_80( void *d, const void *s )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
@@ -482,7 +482,7 @@ static inline void mm128_bswap32_80( void *d, void *s )
#else
static inline void mm128_bswap32_80( void *d, void *s )
static inline void mm128_bswap32_80( void *d, const void *s )
{
( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] );
( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] );

View File

@@ -385,7 +385,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
//
/* Not used
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
#define mm128_shuffle2_64( v1, v2, c ) \
@@ -395,6 +395,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuffle2_32( v1, v2, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
*/
//
// Rotate vector elements accross all lanes
@@ -406,6 +407,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
/* Not used
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
@@ -413,6 +415,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
*/
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
@@ -555,68 +558,25 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
// Concatenate { hi, lo }, rotate right by c elements and return low 128 bits.
#if defined(__SSSE3__)
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
// _mm_alignr_epi32 & _mm_alignr_epi64 are only available with AVX512VL but
// are emulated here using _mm_alignr_epi8. There are no fast equivalents for
// 256 bit vectors, though there is no need for this functionality there.
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#else
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) )
#define mm128_alignr_64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 4 ) )
#define mm128_alignr_32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
#endif
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// vrol & vror are deprecated and do not exist for larger vectors.
// Their only use is by lyra2 blake2b when AVX2 is not available and is
// grandfathered.
#if defined(__SSSE3__)
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
_mm_slli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#endif // SSE4.1 else SSE2
#endif // __SSE2__
#endif // SIMD_128_H__

View File

@@ -239,8 +239,8 @@ static inline __m256i mm256_not( const __m256i v )
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.
// Create a 64 or 32 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test: if (mask[n]) then -1 else 0.
#define mm256_movmask_64( v ) \
_mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
@@ -348,7 +348,7 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
_mm256_srli_epi16( v, 16-(c) ) )
// Deprecated.
// Deprecated. Obsolete sm3, the only user, is grandfathered.
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
@@ -391,6 +391,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
//
// Rotate elements within each 128 bit lane of 256 bit vector.
/* Not used
// Limited 2 input shuffle
#define mm256_shuffle2_64( v1, v2, c ) \
_mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
@@ -399,6 +400,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
#define mm256_shuffle2_32( v1, v2, c ) \
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
_mm256_castsi256_ps( v2 ), c ) );
*/
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
@@ -511,7 +513,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
} while(0)
// swap 256 bit vectors in place.
// This should be avoided, it's more efficient to switch references.
// Deprecated, Shabal is the only user and it should be modified to reorder
// instructions.
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \

View File

@@ -409,19 +409,20 @@ static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
{ return _mm512_alignr_epi32( v, v, n ); }
/* Not used
#define mm512_shuflr_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000001F001E001D, 0x001C001B001A0019, \
0X0018001700160015, 0X0014001300120011, \
0X0010000F000E000D, 0X000C000B000A0009, \
0X0008000700060005, 0X0004000300020001 ), v )
0x0018001700160015, 0x0014001300120011, \
0x0010000F000E000D, 0x000C000B000A0009, \
0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_shufll_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0X0016001500140013, 0X001200110010000F, \
0X000E000D000C000B, 0X000A000900080007, \
0X0006000500040003, 0X000200010000001F ), v )
0x0016001500140013, 0x001200110010000F, \
0x000E000D000C000B, 0x000A000900080007, \
0x0006000500040003, 0x000200010000001F ), v )
#define mm512_shuflr_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
@@ -436,6 +437,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
0x2E2D2C2B2A292827, 0x262524232221201F, \
0x1E1D1C1B1A191817, 0x161514131211100F, \
0x0E0D0C0B0A090807, 0x060504030201003F ) )
*/
// 256 bit lanes used only by lyra2, move these there
// Rotate elements within 256 bit lanes of 512 bit vector.
@@ -449,7 +451,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
/*
/* Not used
// Rotate 256 bit lanes by one 32 bit element
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
@@ -496,6 +498,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
/* Not used
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source arg.
@@ -506,7 +509,10 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuffle2_32( v1, v2, c ) \
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
_mm512_castsi512_ps( v2 ), c ) );
*/
// These hard coded shuffles exist for consistency with AVX2 & SSE2 where
// efficient generic versions don't exist.
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
@@ -516,9 +522,11 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
/* Not used
// Rotate right 128 bit lanes by c bytes, efficient generic version of above.
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
*/
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency

View File

@@ -17,6 +17,7 @@ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
# Disabled due to CPU group incompatibilities between Intel and AMD CPUs.
export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
@@ -45,7 +46,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
./clean-all.sh || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
CFLAGS="-march=icelake-client $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -53,7 +54,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
# AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake
make clean || echo clean
rm -f config.status
CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512.exe
@@ -61,7 +62,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
# AVX2 SHA VAES: Intel Alderlake, AMD Zen3
make clean || echo done
rm -f config.status
CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
@@ -69,7 +70,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
# AVX2 AES SHA: AMD Zen1
make clean || echo clean
rm -f config.status
CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
CFLAGS="-march=znver1 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2-sha.exe
@@ -77,7 +78,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
# AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
make clean || echo clean
rm -f config.status
CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2.exe