mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.21.3 Unreleased
This commit is contained in:
@@ -264,6 +264,8 @@ cpuminer_SOURCES = \
|
||||
algo/x16/x16r-4way.c \
|
||||
algo/x16/x16rv2.c \
|
||||
algo/x16/x16rv2-4way.c \
|
||||
algo/x16/x16rt.c \
|
||||
algo/x16/x16rt-4way.c \
|
||||
algo/x16/hex.c \
|
||||
algo/x16/x21s-4way.c \
|
||||
algo/x16/x21s.c \
|
||||
|
@@ -65,7 +65,11 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.22.3
|
||||
v3.21.3.1 UNRELEASED
|
||||
|
||||
Revert to 3.21.2
|
||||
|
||||
v3.21.3 CANCELLED
|
||||
|
||||
#392 #379 #389 Fixed misaligned address segfault solo mining.
|
||||
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
|
||||
@@ -74,10 +78,10 @@ v3.22.3
|
||||
Windows binaries no longer support CPU groups,
|
||||
Windows binaries support CPUs with up to 64 threads.
|
||||
Midstate prehash is now centralized, done only once instead of by every thread
|
||||
for selected algos.
|
||||
for selected algos.
|
||||
Small optimizations to serialized vectoring.
|
||||
|
||||
v3.22.2
|
||||
v3.21.2
|
||||
|
||||
Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
|
||||
Fixed a couple of compiler warnings with gcc-12.
|
||||
|
@@ -253,7 +253,6 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
gate->miner_thread_init = (void*)&return_true;
|
||||
gate->scanhash = (void*)&scanhash_generic;
|
||||
gate->hash = (void*)&null_hash;
|
||||
gate->prehash = (void*)&return_true;
|
||||
gate->get_new_work = (void*)&std_get_new_work;
|
||||
gate->work_decode = (void*)&std_le_work_decode;
|
||||
gate->decode_extra_data = (void*)&do_nothing;
|
||||
|
@@ -119,7 +119,7 @@ typedef struct
|
||||
// to be registered with the gate.
|
||||
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
|
||||
|
||||
int ( *hash ) ( void*, const void*, const int );
|
||||
int ( *hash ) ( void*, const void*, int );
|
||||
|
||||
//optional, safe to use default in most cases
|
||||
|
||||
@@ -127,9 +127,6 @@ int ( *hash ) ( void*, const void*, const int );
|
||||
// other initialization specific to miner threads.
|
||||
bool ( *miner_thread_init ) ( int );
|
||||
|
||||
// Perform prehash after receiving new work
|
||||
int ( *prehash ) ( struct work* );
|
||||
|
||||
// Get thread local copy of blockheader with unique nonce.
|
||||
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* );
|
||||
|
||||
|
@@ -1,6 +1,5 @@
|
||||
#include "blake2s-gate.h"
|
||||
#include "blake2s-hash-4way.h"
|
||||
//#include "sph-blake2s.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -8,43 +7,6 @@
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
/*
|
||||
static blake2s_16way_state blake2s_16w_ctx;
|
||||
static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64)));
|
||||
*/
|
||||
/*
|
||||
int blake2s_16way_prehash( struct work *work )
|
||||
{
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, work->data );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata,
|
||||
edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
blake2s_16w_ctx.t[0] = 64;
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
/*
|
||||
int blake2s_16way_prehash( struct work *work )
|
||||
{
|
||||
mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 );
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
@@ -68,40 +30,10 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
/*
|
||||
// pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 );
|
||||
// casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 );
|
||||
// casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 );
|
||||
// casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 );
|
||||
|
||||
// pthread_rwlock_unlock( &g_work_lock );
|
||||
*/
|
||||
/*
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata,
|
||||
edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
blake2s_16w_ctx.t[0] = 64;
|
||||
*/
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
@@ -131,36 +63,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
/*
|
||||
static blake2s_8way_state blake2s_8w_ctx;
|
||||
static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32)));
|
||||
|
||||
int blake2s_8way_prehash( struct work *work )
|
||||
{
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, work->data );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
|
||||
|
||||
casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] );
|
||||
casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] );
|
||||
casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] );
|
||||
|
||||
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
// intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata,
|
||||
// edata, edata, edata, edata, 640 );
|
||||
blake2s_8w_ctx.t[0] = 64;
|
||||
}
|
||||
*/
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_8way_state ctx;
|
||||
@@ -184,41 +86,10 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
/*
|
||||
// pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 );
|
||||
// pthread_rwlock_unlock( &g_work_lock );
|
||||
*/
|
||||
/*
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
|
||||
|
||||
casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] );
|
||||
casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] );
|
||||
casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] );
|
||||
|
||||
|
||||
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
// intrlv_8x32( vdata, edata, edata, edata, edata,
|
||||
// edata, edata, edata, edata, 640 );
|
||||
|
||||
blake2s_8w_ctx.t[0] = 64;
|
||||
*/
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
@@ -246,25 +117,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
|
||||
static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
/*
|
||||
static blake2s_4way_state blake2s_4w_ctx;
|
||||
static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32)));
|
||||
|
||||
int blake2s_4way_prehash( struct work *work )
|
||||
{
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, work->data );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 );
|
||||
blake2s_4w_ctx.t[0] = 64;
|
||||
}
|
||||
*/
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_4way_state ctx;
|
||||
@@ -287,15 +140,11 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
/*
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( vdata, blake2s_4way_vdata, sizeof vdata );
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
*/
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
@@ -5,15 +5,13 @@ bool register_blake2s_algo( algo_gate_t* gate )
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_16way;
|
||||
gate->hash = (void*)&blake2s_16way_hash;
|
||||
// gate->prehash = (void*)&blake2s_16way_prehash;
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_8way;
|
||||
gate->hash = (void*)&blake2s_8way_hash;
|
||||
// gate->prehash = (void*)&blake2s_8way_prehash;
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_4way;
|
||||
gate->hash = (void*)&blake2s_4way_hash;
|
||||
// gate->prehash = (void*)&blake2s_4way_prehash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
|
@@ -23,22 +23,18 @@ bool register_blake2s_algo( algo_gate_t* gate );
|
||||
void blake2s_16way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int blake2s_16way_prehash( struct work * );
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int blake2s_8way_prehash( struct work * );
|
||||
|
||||
#elif defined (BLAKE2S_4WAY)
|
||||
|
||||
void blake2s_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int blake2s_4way_prehash( struct work * );
|
||||
|
||||
#else
|
||||
|
||||
void blake2s_hash( void *state, const void *input );
|
||||
|
@@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
const uint8_t s0 = sigma0; \
|
||||
const uint8_t s1 = sigma1; \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
@@ -120,7 +120,7 @@ do { \
|
||||
|
||||
#define ROUND4W(r) \
|
||||
do { \
|
||||
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
@@ -317,8 +317,8 @@ do { \
|
||||
|
||||
#define G8W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
const uint8_t s0 = sigma0; \
|
||||
const uint8_t s1 = sigma1; \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
@@ -331,7 +331,7 @@ do { \
|
||||
|
||||
#define ROUND8W(r) \
|
||||
do { \
|
||||
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
@@ -529,8 +529,8 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
|
||||
|
||||
#define G16W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
const uint8_t s0 = sigma0; \
|
||||
const uint8_t s1 = sigma1; \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
@@ -543,7 +543,7 @@ do { \
|
||||
|
||||
#define ROUND16W(r) \
|
||||
do { \
|
||||
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
|
@@ -20,7 +20,6 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
//#include "sph-blake2s.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <inttypes.h>
|
||||
@@ -34,7 +33,7 @@
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
/*
|
||||
|
||||
enum blake2s_constant
|
||||
{
|
||||
BLAKE2S_BLOCKBYTES = 64,
|
||||
@@ -43,13 +42,6 @@ enum blake2s_constant
|
||||
BLAKE2S_SALTBYTES = 8,
|
||||
BLAKE2S_PERSONALBYTES = 8
|
||||
};
|
||||
*/
|
||||
|
||||
#define BLAKE2S_BLOCKBYTES 64
|
||||
#define BLAKE2S_OUTBYTES 32
|
||||
#define BLAKE2S_KEYBYTES 32
|
||||
#define BLAKE2S_SALTBYTES 8
|
||||
#define BLAKE2S_PERSONALBYTES 8
|
||||
|
||||
#pragma pack(push, 1)
|
||||
typedef struct __blake2s_nway_param
|
||||
|
@@ -8,6 +8,8 @@
|
||||
#include "sph-blake2s.h"
|
||||
|
||||
static __thread blake2s_state blake2s_ctx;
|
||||
//static __thread blake2s_state s_ctx;
|
||||
#define MIDLEN 76
|
||||
|
||||
void blake2s_hash( void *output, const void *input )
|
||||
{
|
||||
@@ -17,27 +19,37 @@ void blake2s_hash( void *output, const void *input )
|
||||
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
|
||||
blake2s_update( &ctx, input+64, 16 );
|
||||
|
||||
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
|
||||
// blake2s_update(&ctx, input, 80);
|
||||
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
/*
|
||||
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
|
||||
{
|
||||
s_ctx.buflen = MIDLEN;
|
||||
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
|
||||
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
|
||||
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
|
||||
}
|
||||
*/
|
||||
int scanhash_blake2s( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t _ALIGN(64) hash64[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
int thr_id = mythr->id;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
// midstate
|
||||
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
|
||||
@@ -46,12 +58,11 @@ int scanhash_blake2s( struct work *work,
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2s_hash( hash64, endiandata );
|
||||
if (hash64[7] <= Htarg )
|
||||
if ( fulltest(hash64, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
@@ -17,7 +17,6 @@
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "sph-blake2s.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
static const uint32_t blake2s_IV[8] =
|
||||
{
|
||||
@@ -226,71 +225,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
v[13] = S->t[1] ^ blake2s_IV[5];
|
||||
v[14] = S->f[0] ^ blake2s_IV[6];
|
||||
v[15] = S->f[1] ^ blake2s_IV[7];
|
||||
|
||||
#if 0
|
||||
//#if defined(__SSE2__) // always true
|
||||
|
||||
The only application for this is to do a prehash for the blake2s algorithm.
|
||||
SSE2 also supports 4 way parallel hashing so that is preferred in most cases.
|
||||
Testing has found that using this serial SIMD code for prehash is slower than
|
||||
doing a parallel hash. A parallel hash has more instructions and uses more
|
||||
data. The serial hash uses fewer instructions and data and only needs to
|
||||
interleave the final hash into parallel streams. This has shown negligible
|
||||
improvement on other algos, notably blake256 which is almost identical.
|
||||
Considering the low frequency of prehash no statistically valid change
|
||||
was expected. It was simply better on paper.
|
||||
|
||||
Furthermore, simply defining this macro has an additional negative effect on
|
||||
blake2s as a whole. There are no references to this macro, blake2s-4way does
|
||||
not include it in any header files, it's just another unused macro which should
|
||||
have no effect beyond the preprocessor. But just being visible to the compiler
|
||||
changes things in a dramatic way.
|
||||
|
||||
These 2 things combined reduced the hash rate for blake2s by more than 5% when
|
||||
using serial SIMD for the blake2s prehash over 16way parallel prehash.
|
||||
16way parallel hashing was used in the high frequency nonce loop in both cases.
|
||||
Comsidering the prehash represents 50% of the algorithm and is done once vs
|
||||
the high frequency second half that is done mega, maybe giga, times more it's
|
||||
hard to imagine that big of an effect in either direction.
|
||||
|
||||
#define ROUND( r ) \
|
||||
{ \
|
||||
__m128i *V = (__m128i*)v; \
|
||||
const uint8_t *sigma = blake2s_sigma[r]; \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
|
||||
_mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \
|
||||
m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
|
||||
_mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \
|
||||
m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[3] = mm128_shufll_32( V[3] ); \
|
||||
V[2] = mm128_swap_64( V[2] ); \
|
||||
V[1] = mm128_shuflr_32( V[1] ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
|
||||
_mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \
|
||||
m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
|
||||
_mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \
|
||||
m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[3] = mm128_shuflr_32( V[3] ); \
|
||||
V[2] = mm128_swap_64( V[2] ); \
|
||||
V[1] = mm128_shufll_32( V[1] ); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define G(r,i,a,b,c,d) \
|
||||
do { \
|
||||
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
|
||||
@@ -313,10 +247,7 @@ hard to imagine that big of an effect in either direction.
|
||||
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
|
||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
||||
ROUND( 0 );
|
||||
ROUND( 0 );
|
||||
ROUND( 1 );
|
||||
ROUND( 2 );
|
||||
ROUND( 3 );
|
||||
|
@@ -91,7 +91,6 @@ static inline void secure_zero_memory(void *v, size_t n)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
enum blake2s_constant
|
||||
{
|
||||
BLAKE2S_BLOCKBYTES = 64,
|
||||
@@ -100,13 +99,6 @@ extern "C" {
|
||||
BLAKE2S_SALTBYTES = 8,
|
||||
BLAKE2S_PERSONALBYTES = 8
|
||||
};
|
||||
*/
|
||||
|
||||
#define BLAKE2S_BLOCKBYTES 64
|
||||
#define BLAKE2S_OUTBYTES 32
|
||||
#define BLAKE2S_KEYBYTES 32
|
||||
#define BLAKE2S_SALTBYTES 8
|
||||
#define BLAKE2S_PERSONALBYTES 8
|
||||
|
||||
#pragma pack(push, 1)
|
||||
typedef struct __blake2s_param
|
||||
|
@@ -103,16 +103,16 @@
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
|
||||
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
|
||||
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
|
||||
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
|
||||
V2 = mm128_alignr_64( V[3], V[2] ); \
|
||||
V3 = mm128_alignr_64( V[2], V[3] ); \
|
||||
V6 = mm128_alignr_64( V[6], V[7] ); \
|
||||
V7 = mm128_alignr_64( V[7], V[6] ); \
|
||||
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
|
||||
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
|
||||
V[2] = mm128_alignr_64( V2, V3, 1 ); \
|
||||
V[3] = mm128_alignr_64( V3, V2, 1 ); \
|
||||
V[6] = mm128_alignr_64( V7, V6, 1 ); \
|
||||
V[7] = mm128_alignr_64( V6, V7, 1 ); \
|
||||
V[2] = mm128_alignr_64( V2, V3 ); \
|
||||
V[3] = mm128_alignr_64( V3, V2 ); \
|
||||
V[6] = mm128_alignr_64( V7, V6 ); \
|
||||
V[7] = mm128_alignr_64( V6, V7 ); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], nonce);
|
||||
myriad_hash(hash, endiandata);
|
||||
|
||||
if (hash[7] <= Htarg )
|
||||
if ( fulltest(hash, ptarget) && !opt_benchmark )
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget))
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
submit_solution( work, hash, mythr );
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
|
||||
|
@@ -19,34 +19,26 @@
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <emmintrin.h>
|
||||
#include "simd-utils.h"
|
||||
#include "luffa_for_sse2.h"
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MULT2( a0, a1 ) \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
|
||||
a0 = _mm_alignr_epi32( a1, b, 1 ); \
|
||||
a1 = _mm_alignr_epi32( b, a1, 1 ); \
|
||||
}
|
||||
|
||||
#elif defined(__SSE4_1__)
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
#define MULT2( a0, a1 ) do \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
|
||||
a0 = _mm_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm_alignr_epi8( b, a1, 4 ); \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
|
||||
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
|
||||
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
|
||||
} while(0)
|
||||
|
||||
#else
|
||||
|
||||
#define MULT2( a0, a1 ) do \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
|
||||
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
|
||||
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
|
||||
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
@@ -24,45 +24,6 @@ typedef union {
|
||||
#endif
|
||||
} allium_16way_ctx_holder;
|
||||
|
||||
static uint32_t allium_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||
static __m512i allium_16way_block0_hash[8] __attribute__ ((aligned (64)));
|
||||
static __m512i allium_16way_block_buf[16] __attribute__ ((aligned (64)));
|
||||
|
||||
int allium_16way_prehash( struct work *work )
|
||||
{
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
|
||||
// Prehash first block.
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
// Interleave hash for second block prehash.
|
||||
allium_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
allium_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
allium_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
allium_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
allium_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
allium_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
allium_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
allium_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave 12 of last 16 bytes of data,
|
||||
// excluding the nonce.
|
||||
allium_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||
allium_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||
allium_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( allium_16way_midstate_vars,
|
||||
allium_16way_block0_hash, allium_16way_block_buf );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void allium_16way_hash( void *state, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
@@ -239,6 +200,11 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||
__m512i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m512i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -250,19 +216,31 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
// Prehash first block.
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
memcpy( midstate_vars, allium_16way_midstate_vars, sizeof midstate_vars );
|
||||
memcpy( block0_hash, allium_16way_block0_hash, sizeof block0_hash );
|
||||
memcpy( block_buf, allium_16way_block_buf, sizeof block_buf );
|
||||
// Interleave hash for second block prehash.
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
|
||||
// fill in the nonces
|
||||
block_buf[3] =
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
||||
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||
|
||||
@@ -293,44 +271,6 @@ typedef union {
|
||||
#endif
|
||||
} allium_8way_ctx_holder;
|
||||
|
||||
static uint32_t allium_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
static __m256i allium_8way_block0_hash[8] __attribute__ ((aligned (64)));
|
||||
static __m256i allium_8way_block_buf[16] __attribute__ ((aligned (64)));
|
||||
|
||||
int allium_8way_prehash ( struct work *work )
|
||||
{
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
allium_8way_block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
allium_8way_block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
allium_8way_block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
allium_8way_block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
allium_8way_block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
allium_8way_block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
allium_8way_block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
allium_8way_block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave 12 of the last 16 bytes,
|
||||
// excepting the nonces.
|
||||
allium_8way_block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||
allium_8way_block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||
allium_8way_block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( allium_8way_midstate_vars,
|
||||
allium_8way_block0_hash, allium_8way_block_buf );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void allium_8way_hash( void *hash, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
@@ -446,6 +386,11 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint64_t *ptarget = (uint64_t*)work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -455,17 +400,29 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
memcpy( midstate_vars, allium_8way_midstate_vars, sizeof midstate_vars );
|
||||
memcpy( block0_hash, allium_8way_block0_hash, sizeof block0_hash );
|
||||
memcpy( block_buf, allium_8way_block_buf, sizeof block_buf );
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
|
||||
n+ 3, n+ 2, n+ 1, n );
|
||||
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||
|
||||
@@ -481,7 +438,6 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
n += 8;
|
||||
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
|
||||
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
|
@@ -131,12 +131,10 @@ bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(LYRA2Z_16WAY)
|
||||
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
|
||||
gate->prehash = (void*)&lyra2z_16way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z_16way;
|
||||
// gate->hash = (void*)&lyra2z_16way_hash;
|
||||
#elif defined(LYRA2Z_8WAY)
|
||||
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
|
||||
gate->prehash = (void*)&lyra2z_8way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z_8way;
|
||||
// gate->hash = (void*)&lyra2z_8way_hash;
|
||||
#elif defined(LYRA2Z_4WAY)
|
||||
@@ -177,10 +175,8 @@ bool register_lyra2h_algo( algo_gate_t* gate )
|
||||
bool register_allium_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (ALLIUM_16WAY)
|
||||
gate->prehash = (void*)&allium_16way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_allium_16way;
|
||||
#elif defined (ALLIUM_8WAY)
|
||||
gate->prehash = (void*)&allium_8way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_allium_8way;
|
||||
#else
|
||||
gate->miner_thread_init = (void*)&init_allium_ctx;
|
||||
|
@@ -5,6 +5,7 @@
|
||||
#include <stdint.h>
|
||||
#include "lyra2.h"
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define LYRA2REV3_16WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
@@ -101,7 +102,6 @@ bool init_lyra2rev2_ctx();
|
||||
//void lyra2z_16way_hash( void *state, const void *input );
|
||||
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int lyra2z_16way_prehash ( struct work *work );
|
||||
bool lyra2z_16way_thread_init();
|
||||
|
||||
#elif defined(LYRA2Z_8WAY)
|
||||
@@ -110,7 +110,6 @@ bool lyra2z_16way_thread_init();
|
||||
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool lyra2z_8way_thread_init();
|
||||
int lyra2z_8way_prehash ( struct work *work );
|
||||
|
||||
#elif defined(LYRA2Z_4WAY)
|
||||
|
||||
@@ -166,13 +165,11 @@ bool register_allium_algo( algo_gate_t* gate );
|
||||
|
||||
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int allium_16way_prehash ( struct work *work );
|
||||
|
||||
#elif defined(ALLIUM_8WAY)
|
||||
|
||||
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int allium_8way_prehash ( struct work *work );
|
||||
|
||||
#else
|
||||
|
||||
|
@@ -14,44 +14,6 @@ bool lyra2z_16way_thread_init()
|
||||
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
|
||||
}
|
||||
|
||||
static uint32_t lyra2z_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||
static __m512i lyra2z_16way_block0_hash[8] __attribute__ ((aligned (64)));
|
||||
static __m512i lyra2z_16way_block_buf[16] __attribute__ ((aligned (64)));
|
||||
|
||||
int lyra2z_16way_prehash ( struct work *work )
|
||||
{
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
lyra2z_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
lyra2z_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
lyra2z_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
lyra2z_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
lyra2z_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
lyra2z_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
lyra2z_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
lyra2z_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave 12 of last 16 bytes of data
|
||||
// excepting the nonce.
|
||||
lyra2z_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||
lyra2z_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||
lyra2z_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( lyra2z_16way_midstate_vars,
|
||||
lyra2z_16way_block0_hash, lyra2z_16way_block_buf );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void lyra2z_16way_hash( void *state, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
@@ -129,6 +91,11 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||
__m512i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m512i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -140,18 +107,30 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
memcpy( midstate_vars, lyra2z_16way_midstate_vars, sizeof midstate_vars );
|
||||
memcpy( block0_hash, lyra2z_16way_block0_hash, sizeof block0_hash );
|
||||
memcpy( block_buf, lyra2z_16way_block_buf, sizeof block_buf );
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||
|
||||
@@ -178,44 +157,6 @@ bool lyra2z_8way_thread_init()
|
||||
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
|
||||
}
|
||||
|
||||
static uint32_t lyra2z_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
static __m256i lyra2z_8way_block0_hash[8] __attribute__ ((aligned (64)));
|
||||
static __m256i lyra2z_8way_block_buf[16] __attribute__ ((aligned (64)));
|
||||
|
||||
int lyra2z_8way_prehash ( struct work *work )
|
||||
{
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
lyra2z_8way_block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
lyra2z_8way_block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
lyra2z_8way_block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
lyra2z_8way_block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
lyra2z_8way_block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
lyra2z_8way_block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
lyra2z_8way_block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
lyra2z_8way_block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
lyra2z_8way_block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||
lyra2z_8way_block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||
lyra2z_8way_block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( lyra2z_8way_midstate_vars,
|
||||
lyra2z_8way_block0_hash, lyra2z_8way_block_buf );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void lyra2z_8way_hash( void *state, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
@@ -260,6 +201,11 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint64_t *ptarget = (uint64_t*)work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -269,14 +215,23 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
|
||||
memcpy( midstate_vars, lyra2z_8way_midstate_vars, sizeof midstate_vars );
|
||||
memcpy( block0_hash, lyra2z_8way_block0_hash, sizeof block0_hash );
|
||||
memcpy( block_buf, lyra2z_8way_block_buf, sizeof block_buf );
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[ 3] =
|
||||
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
|
||||
|
@@ -146,25 +146,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
|
||||
|
||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
{ \
|
||||
__m128i t; \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
t = mm128_alignr_64( s7, s6, 1 ); \
|
||||
s6 = mm128_alignr_64( s6, s7, 1 ); \
|
||||
s7 = t; \
|
||||
t = mm128_alignr_64( s2, s3, 1 ); \
|
||||
s2 = mm128_alignr_64( s3, s2, 1 ); \
|
||||
s3 = t; \
|
||||
mm128_vrol256_64( s6, s7 ); \
|
||||
mm128_vror256_64( s2, s3 ); \
|
||||
G_2X64( s0, s2, s5, s6 ); \
|
||||
G_2X64( s1, s3, s4, s7 ); \
|
||||
t = mm128_alignr_64( s6, s7, 1 ); \
|
||||
s6 = mm128_alignr_64( s7, s6, 1 ); \
|
||||
s7 = t; \
|
||||
t = mm128_alignr_64( s3, s2, 1 ); \
|
||||
s2 = mm128_alignr_64( s2, s3, 1 ); \
|
||||
s3 = t; \
|
||||
}
|
||||
mm128_vror256_64( s6, s7 ); \
|
||||
mm128_vrol256_64( s2, s3 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
|
@@ -7,16 +7,8 @@
|
||||
|
||||
#if defined (SKEIN_8WAY)
|
||||
|
||||
static skein512_8way_context skein512_8way_ctx
|
||||
static __thread skein512_8way_context skein512_8way_ctx
|
||||
__attribute__ ((aligned (64)));
|
||||
static uint32_t skein_8way_vdata[20*8] __attribute__ ((aligned (64)));
|
||||
|
||||
int skein_8way_prehash( struct work *work )
|
||||
{
|
||||
mm512_bswap32_intrlv80_8x64( skein_8way_vdata, work->data );
|
||||
skein512_8way_prehash64( &skein512_8way_ctx, skein_8way_vdata );
|
||||
return 1;
|
||||
}
|
||||
|
||||
void skeinhash_8way( void *state, const void *input )
|
||||
{
|
||||
@@ -37,27 +29,25 @@ void skeinhash_8way( void *state, const void *input )
|
||||
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash_d7 = &(hash[7*8]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t targ_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( vdata, skein_8way_vdata, sizeof vdata );
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash_d7 = &(hash[7*8]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t targ_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
|
||||
skein512_8way_prehash64( &skein512_8way_ctx, vdata );
|
||||
do
|
||||
{
|
||||
skeinhash_8way( hash, vdata );
|
||||
@@ -84,16 +74,8 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined (SKEIN_4WAY)
|
||||
|
||||
static skein512_4way_context skein512_4way_ctx
|
||||
static __thread skein512_4way_context skein512_4way_ctx
|
||||
__attribute__ ((aligned (64)));
|
||||
static uint32_t skein_4way_vdata[20*4] __attribute__ ((aligned (64)));
|
||||
|
||||
int skein_4way_prehash( struct work *work )
|
||||
{
|
||||
mm256_bswap32_intrlv80_4x64( skein_4way_vdata, work->data );
|
||||
skein512_4way_prehash64( &skein512_4way_ctx, skein_4way_vdata );
|
||||
return 1;
|
||||
}
|
||||
|
||||
void skeinhash_4way( void *state, const void *input )
|
||||
{
|
||||
@@ -136,24 +118,23 @@ void skeinhash_4way( void *state, const void *input )
|
||||
int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash_d7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t targ_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash_d7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t targ_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
skein512_4way_prehash64( &skein512_4way_ctx, vdata );
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( vdata, skein_4way_vdata, sizeof vdata );
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
|
@@ -7,12 +7,10 @@ bool register_skein_algo( algo_gate_t* gate )
|
||||
#if defined (SKEIN_8WAY)
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein_8way;
|
||||
gate->prehash = (void*)&skein_8way_prehash;
|
||||
gate->hash = (void*)&skeinhash_8way;
|
||||
#elif defined (SKEIN_4WAY)
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein_4way;
|
||||
gate->prehash = (void*)&skein_4way_prehash;
|
||||
gate->hash = (void*)&skeinhash_4way;
|
||||
#else
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
||||
@@ -27,12 +25,10 @@ bool register_skein2_algo( algo_gate_t* gate )
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
#if defined (SKEIN_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein2_8way;
|
||||
// gate->hash = (void*)&skein2hash_8way;
|
||||
gate->prehash = (void*)&skein2_8way_prehash;
|
||||
gate->hash = (void*)&skein2hash_8way;
|
||||
#elif defined (SKEIN_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein2_4way;
|
||||
// gate->hash = (void*)&skein2hash_4way;
|
||||
gate->prehash = (void*)&skein2_4way_prehash;
|
||||
gate->hash = (void*)&skein2hash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_skein2;
|
||||
gate->hash = (void*)&skein2hash;
|
||||
|
@@ -14,24 +14,20 @@
|
||||
void skeinhash_8way( void *output, const void *input );
|
||||
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int skein_8way_prehash( struct work * );
|
||||
|
||||
void skein2hash_8way( void *output, const void *input );
|
||||
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t* hashes_done, struct thr_info *mythr );
|
||||
int skein2_8way_prehash( struct work * );
|
||||
|
||||
#elif defined(SKEIN_4WAY)
|
||||
|
||||
void skeinhash_4way( void *output, const void *input );
|
||||
int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int skein_4way_prehash( struct work * );
|
||||
|
||||
void skein2hash_4way( void *output, const void *input );
|
||||
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t* hashes_done, struct thr_info *mythr );
|
||||
int skein2_4way_prehash( struct work * );
|
||||
|
||||
#else
|
||||
|
||||
|
@@ -31,19 +31,18 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
skeinhash(hash64, endiandata);
|
||||
if (hash64[7] <= Htarg )
|
||||
if ( fulltest(hash64, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
@@ -5,17 +5,9 @@
|
||||
|
||||
#if defined(SKEIN_8WAY)
|
||||
|
||||
static skein512_8way_context skein512_8way_ctx __attribute__ ((aligned (64)));
|
||||
static uint32_t skein2_8way_vdata[20*8] __attribute__ ((aligned (64)));
|
||||
static __thread skein512_8way_context skein512_8way_ctx
|
||||
__attribute__ ((aligned (64)));
|
||||
|
||||
int skein2_8way_prehash( struct work *work )
|
||||
{
|
||||
mm512_bswap32_intrlv80_8x64( skein2_8way_vdata, work->data );
|
||||
skein512_8way_prehash64( &skein512_8way_ctx, skein2_8way_vdata );
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* not used
|
||||
void skein2hash_8way( void *output, const void *input )
|
||||
{
|
||||
uint64_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
@@ -25,7 +17,6 @@ void skein2hash_8way( void *output, const void *input )
|
||||
skein512_8way_final16( &ctx, hash, input + (64*8) );
|
||||
skein512_8way_full( &ctx, output, hash, 64 );
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
@@ -45,14 +36,11 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
skein512_8way_context ctx;
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( vdata, skein2_8way_vdata, sizeof vdata );
|
||||
memcpy( &ctx, &skein512_8way_ctx, sizeof ctx );
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
|
||||
skein512_8way_prehash64( &ctx, vdata );
|
||||
do
|
||||
{
|
||||
skein512_8way_final16( &ctx, hash, vdata + (16*8) );
|
||||
@@ -79,18 +67,10 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
|
||||
#elif defined(SKEIN_4WAY)
|
||||
|
||||
static skein512_4way_context skein512_4way_ctx __attribute__ ((aligned (64)));
|
||||
static uint32_t skein2_4way_vdata[20*4] __attribute__ ((aligned (64)));
|
||||
|
||||
int skein2_4way_prehash( struct work *work )
|
||||
{
|
||||
mm256_bswap32_intrlv80_4x64( skein2_4way_vdata, work->data );
|
||||
skein512_4way_prehash64( &skein512_4way_ctx, skein2_4way_vdata );
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* not used
|
||||
static __thread skein512_4way_context skein512_4way_ctx
|
||||
__attribute__ ((aligned (64)));
|
||||
|
||||
void skein2hash_4way( void *output, const void *input )
|
||||
{
|
||||
skein512_4way_context ctx;
|
||||
@@ -100,7 +80,6 @@ void skein2hash_4way( void *output, const void *input )
|
||||
skein512_4way_final16( &ctx, hash, input + (64*4) );
|
||||
skein512_4way_full( &ctx, output, hash, 64 );
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
@@ -120,11 +99,8 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
skein512_4way_context ctx;
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( vdata, skein2_4way_vdata, sizeof vdata );
|
||||
memcpy( &ctx, &skein512_4way_ctx, sizeof ctx );
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
skein512_4way_prehash64( &ctx, vdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
|
@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
|
||||
sph_skein512_close(&ctx_skein, hash);
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
|
||||
}
|
||||
|
||||
int scanhash_skein2( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t hash64[8] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
skein2hash(hash64, endiandata);
|
||||
if (hash64[7] <= Htarg )
|
||||
if ( fulltest(hash64, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
@@ -25,7 +25,7 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
|
||||
|
||||
static __thread x16r_context_overlay hex_ctx;
|
||||
|
||||
int hex_hash( void* output, const void* input, const int thrid )
|
||||
int hex_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[16];
|
||||
x16r_context_overlay ctx;
|
||||
|
@@ -72,7 +72,7 @@ struct TortureGarden
|
||||
|
||||
// Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index
|
||||
static int get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
unsigned int algo, const int thr_id )
|
||||
unsigned int algo, int thr_id )
|
||||
{
|
||||
unsigned char hash[64] __attribute__ ((aligned (64)));
|
||||
int rc = 1;
|
||||
@@ -233,7 +233,7 @@ bool initialize_torture_garden()
|
||||
}
|
||||
|
||||
// Produce a 32-byte hash from 80-byte input data
|
||||
int minotaur_hash( void *output, const void *input, const int thr_id )
|
||||
int minotaur_hash( void *output, const void *input, int thr_id )
|
||||
{
|
||||
unsigned char hash[64] __attribute__ ((aligned (64)));
|
||||
int rc = 1;
|
||||
|
@@ -19,7 +19,7 @@
|
||||
// Perform midstate prehash of hash functions with block size <= 72 bytes,
|
||||
// 76 bytes for hash functions that operate on 32 bit data.
|
||||
|
||||
void x16r_8way_do_prehash( void *vdata, const void *pdata )
|
||||
void x16r_8way_prehash( void *vdata, void *pdata )
|
||||
{
|
||||
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
@@ -106,18 +106,11 @@ void x16r_8way_do_prehash( void *vdata, const void *pdata )
|
||||
}
|
||||
}
|
||||
|
||||
int x16r_8way_prehash( struct work *work )
|
||||
{
|
||||
x16r_gate_get_hash_order( work, x16r_hash_order );
|
||||
x16r_8way_do_prehash( x16r_8way_vdata, work->data );
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Perform the full x16r hash and returns 512 bit intermediate hash.
|
||||
// Called by wrapper hash function to optionally continue hashing and
|
||||
// convert to final hash.
|
||||
|
||||
int x16r_8way_hash_generic( void* output, const void* input, const int thrid )
|
||||
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
|
||||
uint32_t hash0[20] __attribute__ ((aligned (16)));
|
||||
@@ -478,7 +471,7 @@ int x16r_8way_hash_generic( void* output, const void* input, const int thrid )
|
||||
|
||||
// x16-r,-s,-rt wrapper called directly by scanhash to repackage 512 bit
|
||||
// hash to 256 bit final hash.
|
||||
int x16r_8way_hash( void* output, const void* input, const int thrid )
|
||||
int x16r_8way_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint8_t hash[64*8] __attribute__ ((aligned (128)));
|
||||
if ( !x16r_8way_hash_generic( hash, input, thrid ) )
|
||||
@@ -502,6 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t bedata1[2];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -514,16 +508,27 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( vdata, x16r_8way_vdata, sizeof vdata );
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
bedata1[0] = bswap_32( pdata[1] );
|
||||
bedata1[1] = bswap_32( pdata[2] );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
const uint32_t ntime = bswap_32( pdata[17] );
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
x16r_8way_prehash( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
|
||||
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if( algo_gate.hash( hash, vdata, thr_id ) );
|
||||
if( x16r_8way_hash( hash, vdata, thr_id ) );
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
@@ -541,7 +546,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined (X16R_4WAY)
|
||||
|
||||
void x16r_4way_do_prehash( void *vdata, const void *pdata )
|
||||
void x16r_4way_prehash( void *vdata, void *pdata )
|
||||
{
|
||||
uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
@@ -622,14 +627,7 @@ void x16r_4way_do_prehash( void *vdata, const void *pdata )
|
||||
}
|
||||
}
|
||||
|
||||
int x16r_4way_prehash( struct work *work )
|
||||
{
|
||||
x16r_gate_get_hash_order( work, x16r_hash_order );
|
||||
x16r_4way_do_prehash( x16r_4way_vdata, work->data );
|
||||
return 1;
|
||||
}
|
||||
|
||||
int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
|
||||
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
|
||||
uint32_t hash0[20] __attribute__ ((aligned (32)));
|
||||
@@ -637,14 +635,13 @@ int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
|
||||
uint32_t hash2[20] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[20] __attribute__ ((aligned (32)));
|
||||
x16r_4way_context_overlay ctx;
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
void *in0 = (void*) hash0;
|
||||
void *in1 = (void*) hash1;
|
||||
void *in2 = (void*) hash2;
|
||||
void *in3 = (void*) hash3;
|
||||
int size = 80;
|
||||
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
|
||||
|
||||
for ( int i = 0; i < 16; i++ )
|
||||
@@ -908,7 +905,7 @@ int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
|
||||
return 1;
|
||||
}
|
||||
|
||||
int x16r_4way_hash( void* output, const void* input, const int thrid )
|
||||
int x16r_4way_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint8_t hash[64*4] __attribute__ ((aligned (64)));
|
||||
if ( !x16r_4way_hash_generic( hash, input, thrid ) )
|
||||
@@ -927,6 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[16*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t bedata1[2];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -939,15 +937,25 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( vdata, x16r_4way_vdata, sizeof vdata );
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
bedata1[0] = bswap_32( pdata[1] );
|
||||
bedata1[1] = bswap_32( pdata[2] );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
const uint32_t ntime = bswap_32( pdata[17] );
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
x16r_4way_prehash( vdata, pdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( algo_gate.hash( hash, vdata, thr_id ) );
|
||||
if ( x16r_4way_hash( hash, vdata, thr_id ) );
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
|
@@ -1,44 +1,26 @@
|
||||
#include "x16r-gate.h"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = {0};
|
||||
__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };
|
||||
|
||||
void (*x16r_gate_get_hash_order) ( const struct work *, char * ) = NULL;
|
||||
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;
|
||||
|
||||
#if defined (X16R_8WAY)
|
||||
|
||||
x16r_8way_context_overlay x16r_ctx;
|
||||
uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
|
||||
__thread x16r_8way_context_overlay x16r_ctx;
|
||||
|
||||
#elif defined (X16R_4WAY)
|
||||
|
||||
x16r_4way_context_overlay x16r_ctx;
|
||||
uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
|
||||
|
||||
__thread x16r_4way_context_overlay x16r_ctx;
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (X16RV2_8WAY)
|
||||
__thread x16r_context_overlay x16_ctx;
|
||||
|
||||
x16rv2_8way_context_overlay x16rv2_ctx;
|
||||
|
||||
#elif defined (X16RV2_4WAY)
|
||||
|
||||
x16rv2_4way_context_overlay x16rv2_ctx;
|
||||
|
||||
#endif
|
||||
|
||||
x16r_context_overlay x16_ctx;
|
||||
uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
|
||||
|
||||
void x16r_get_hash_order( const struct work *work, char *hash_order )
|
||||
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
|
||||
{
|
||||
char *sptr = hash_order;
|
||||
const uint32_t *pdata = work->data;
|
||||
uint8_t prevblock[16];
|
||||
((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
|
||||
((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
|
||||
|
||||
char *sptr = output;
|
||||
for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
|
||||
{
|
||||
uint8_t b = (15 - j) >> 1; // 16 first ascii hex chars (lsb in uint256)
|
||||
@@ -50,51 +32,38 @@ void x16r_get_hash_order( const struct work *work, char *hash_order )
|
||||
sptr++;
|
||||
}
|
||||
*sptr = '\0';
|
||||
|
||||
if ( !opt_quiet )
|
||||
applog( LOG_INFO, "Hash order %s", x16r_hash_order );
|
||||
}
|
||||
|
||||
void x16s_get_hash_order( const struct work *work, char *hash_order )
|
||||
|
||||
void x16s_getAlgoString( const uint8_t* prevblock, char *output )
|
||||
{
|
||||
const uint32_t *pdata = work->data;
|
||||
uint8_t prevblock[16];
|
||||
((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
|
||||
((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
|
||||
strcpy( hash_order, "0123456789ABCDEF" );
|
||||
strcpy( output, "0123456789ABCDEF" );
|
||||
for ( int i = 0; i < 16; i++ )
|
||||
{
|
||||
uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
|
||||
uint8_t algoDigit = (i & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
|
||||
int offset = algoDigit;
|
||||
// insert the nth character at the front
|
||||
char oldVal = hash_order[ offset ];
|
||||
char oldVal = output[offset];
|
||||
for( int j = offset; j-- > 0; )
|
||||
hash_order[ j+1 ] = hash_order[ j ];
|
||||
hash_order[ 0 ] = oldVal;
|
||||
output[j+1] = output[j];
|
||||
output[0] = oldVal;
|
||||
}
|
||||
|
||||
if ( !opt_quiet )
|
||||
applog( LOG_INFO, "Hash order %s", x16r_hash_order );
|
||||
}
|
||||
|
||||
bool register_x16r_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_8way;
|
||||
gate->prehash = (void*)&x16r_8way_prehash;
|
||||
gate->hash = (void*)&x16r_8way_hash;
|
||||
#elif defined (X16R_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->prehash = (void*)&x16r_4way_prehash;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->prehash = (void*)&x16r_prehash;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
|
||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
@@ -102,20 +71,17 @@ bool register_x16r_algo( algo_gate_t* gate )
|
||||
bool register_x16rv2_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16RV2_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_8way;
|
||||
gate->prehash = (void*)&x16rv2_8way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x16rv2_8way;
|
||||
gate->hash = (void*)&x16rv2_8way_hash;
|
||||
#elif defined (X16RV2_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->prehash = (void*)&x16rv2_4way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x16rv2_4way;
|
||||
gate->hash = (void*)&x16rv2_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->prehash = (void*)&x16rv2_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x16rv2;
|
||||
gate->hash = (void*)&x16rv2_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
|
||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
@@ -124,19 +90,16 @@ bool register_x16s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_8way;
|
||||
gate->prehash = (void*)&x16r_8way_prehash;
|
||||
gate->hash = (void*)&x16r_8way_hash;
|
||||
#elif defined (X16R_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->prehash = (void*)&x16r_4way_prehash;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->prehash = (void*)&x16r_prehash;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
|
||||
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
@@ -145,33 +108,30 @@ bool register_x16s_algo( algo_gate_t* gate )
|
||||
//
|
||||
// X16RT
|
||||
|
||||
void x16rt_get_hash_order( const struct work * work, char * hash_order )
|
||||
{
|
||||
uint32_t _ALIGN(64) timehash[8*8];
|
||||
const uint32_t ntime = bswap_32( work->data[17] );
|
||||
const int32_t masked_ntime = ntime & 0xffffff80;
|
||||
uint8_t* data = (uint8_t*)timehash;
|
||||
char *sptr = hash_order;
|
||||
|
||||
sha256d( (unsigned char*)timehash, (const unsigned char*)( &masked_ntime ),
|
||||
sizeof( masked_ntime ) );
|
||||
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
|
||||
{
|
||||
int32_t maskedTime = timeStamp & 0xffffff80;
|
||||
sha256d( (unsigned char*)timeHash, (const unsigned char*)( &maskedTime ),
|
||||
sizeof( maskedTime ) );
|
||||
}
|
||||
|
||||
for ( uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
|
||||
{
|
||||
void x16rt_getAlgoString( const uint32_t *timeHash, char *output)
|
||||
{
|
||||
char *sptr = output;
|
||||
uint8_t* data = (uint8_t*)timeHash;
|
||||
|
||||
for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) {
|
||||
uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
|
||||
uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
|
||||
|
||||
if ( algoDigit >= 10 )
|
||||
sprintf( sptr, "%c", 'A' + (algoDigit - 10) );
|
||||
if (algoDigit >= 10)
|
||||
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
|
||||
else
|
||||
sprintf( sptr, "%u", (uint32_t) algoDigit );
|
||||
sprintf(sptr, "%u", (uint32_t) algoDigit);
|
||||
sptr++;
|
||||
}
|
||||
*sptr = '\0';
|
||||
|
||||
if ( !opt_quiet )
|
||||
applog( LOG_INFO, "Hash order %s, ntime %08x, time hash %08x",
|
||||
hash_order, ntime, timehash );
|
||||
}
|
||||
|
||||
void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
@@ -262,19 +222,15 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
bool register_x16rt_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_8way;
|
||||
gate->prehash = (void*)&x16r_8way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x16rt_8way;
|
||||
gate->hash = (void*)&x16r_8way_hash;
|
||||
#elif defined (X16R_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->prehash = (void*)&x16r_4way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x16rt_4way;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->prehash = (void*)&x16r_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x16rt;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -283,20 +239,16 @@ bool register_x16rt_algo( algo_gate_t* gate )
|
||||
bool register_x16rt_veil_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_8way;
|
||||
gate->prehash = (void*)&x16r_8way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x16rt_8way;
|
||||
gate->hash = (void*)&x16r_8way_hash;
|
||||
#elif defined (X16R_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->prehash = (void*)&x16r_4way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x16rt_4way;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->prehash = (void*)&x16r_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x16rt;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
|
||||
gate->build_extraheader = (void*)&veil_build_extraheader;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -323,23 +275,20 @@ bool register_hex_algo( algo_gate_t* gate )
|
||||
bool register_x21s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_8way;
|
||||
gate->prehash = (void*)&x16r_8way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x21s_8way;
|
||||
gate->hash = (void*)&x21s_8way_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
|
||||
#elif defined (X16R_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->prehash = (void*)&x16r_4way_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x21s_4way;
|
||||
gate->hash = (void*)&x21s_4way_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->prehash = (void*)&x16r_prehash;
|
||||
gate->scanhash = (void*)&scanhash_x21s;
|
||||
gate->hash = (void*)&x21s_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_thread_init;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
|
||||
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -21,7 +21,6 @@
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
@@ -58,11 +57,13 @@
|
||||
|
||||
#define X16R_8WAY 1
|
||||
#define X16RV2_8WAY 1
|
||||
#define X16RT_8WAY 1
|
||||
#define X21S_8WAY 1
|
||||
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#define X16RV2_4WAY 1
|
||||
#define X16RT_4WAY 1
|
||||
#define X21S_4WAY 1
|
||||
#define X16R_4WAY 1
|
||||
|
||||
@@ -88,29 +89,23 @@ enum x16r_Algo {
|
||||
X16R_HASH_FUNC_COUNT
|
||||
};
|
||||
|
||||
extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
|
||||
|
||||
//extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
|
||||
extern char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
|
||||
|
||||
|
||||
extern void (*x16r_gate_get_hash_order) ( const struct work *, char * );
|
||||
|
||||
// x16r, x16rv2
|
||||
void x16r_get_hash_order( const struct work *, char * );
|
||||
// x16s, x21s
|
||||
void x16s_get_hash_order( const struct work *, char * );
|
||||
// x16rt
|
||||
void x16rt_get_hash_order( const struct work *, char * );
|
||||
extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
|
||||
void x16r_getAlgoString( const uint8_t *prevblock, char *output );
|
||||
void x16s_getAlgoString( const uint8_t *prevblock, char *output );
|
||||
void x16rt_getAlgoString( const uint32_t *timeHash, char *output );
|
||||
|
||||
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash );
|
||||
|
||||
bool register_x16r_algo( algo_gate_t* gate );
|
||||
bool register_x16rv2_algo( algo_gate_t* gate );
|
||||
bool register_x16s_algo( algo_gate_t* gate );
|
||||
bool register_x16rt_algo( algo_gate_t* gate );
|
||||
bool register_hex_algo( algo_gate_t* gate );
|
||||
bool register_x21s_algo( algo_gate_t* gate );
|
||||
bool register_hex__algo( algo_gate_t* gate );
|
||||
bool register_x21s__algo( algo_gate_t* gate );
|
||||
|
||||
// x16r, x16s, x16rt
|
||||
// x16r, x16s
|
||||
#if defined(X16R_8WAY)
|
||||
|
||||
union _x16r_8way_context_overlay
|
||||
@@ -141,15 +136,15 @@ union _x16r_8way_context_overlay
|
||||
|
||||
typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
|
||||
|
||||
extern x16r_8way_context_overlay x16r_ctx;
|
||||
extern uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
|
||||
extern __thread x16r_8way_context_overlay x16r_ctx;
|
||||
|
||||
void x16r_8way_do_prehash( void *, const void * );
|
||||
int x16r_8way_prehash( struct work * );
|
||||
int x16r_8way_hash_generic( void *, const void *, const int );
|
||||
int x16r_8way_hash( void *, const void *, const int );
|
||||
void x16r_8way_prehash( void *, void * );
|
||||
int x16r_8way_hash_generic( void *, const void *, int );
|
||||
int x16r_8way_hash( void *, const void *, int );
|
||||
int scanhash_x16r_8way( struct work *, uint32_t ,
|
||||
uint64_t *, struct thr_info * );
|
||||
extern __thread x16r_8way_context_overlay x16r_ctx;
|
||||
|
||||
|
||||
#elif defined(X16R_4WAY)
|
||||
|
||||
@@ -182,15 +177,14 @@ union _x16r_4way_context_overlay
|
||||
|
||||
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
|
||||
|
||||
extern x16r_4way_context_overlay x16r_ctx;
|
||||
extern uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
|
||||
extern __thread x16r_4way_context_overlay x16r_ctx;
|
||||
|
||||
void x16r_4way_do_prehash( void *, const void * );
|
||||
int x16r_4way_prehash( struct work * );
|
||||
int x16r_4way_hash_generic( void *, const void *, const int );
|
||||
int x16r_4way_hash( void *, const void *, const int );
|
||||
void x16r_4way_prehash( void *, void * );
|
||||
int x16r_4way_hash_generic( void *, const void *, int );
|
||||
int x16r_4way_hash( void *, const void *, int );
|
||||
int scanhash_x16r_4way( struct work *, uint32_t,
|
||||
uint64_t *, struct thr_info * );
|
||||
extern __thread x16r_4way_context_overlay x16r_ctx;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -223,113 +217,80 @@ union _x16r_context_overlay
|
||||
|
||||
typedef union _x16r_context_overlay x16r_context_overlay;
|
||||
|
||||
extern x16r_context_overlay x16_ctx;
|
||||
extern uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
|
||||
extern __thread x16r_context_overlay x16_ctx;
|
||||
|
||||
void x16r_do_prehash( const void * );
|
||||
int x16r_prehash( const struct work * );
|
||||
int x16r_hash_generic( void *, const void *, const int );
|
||||
int x16r_hash( void *, const void *, const int );
|
||||
void x16r_prehash( void *, void * );
|
||||
int x16r_hash_generic( void *, const void *, int );
|
||||
int x16r_hash( void *, const void *, int );
|
||||
int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );
|
||||
|
||||
// x16Rv2
|
||||
#if defined(X16RV2_8WAY)
|
||||
|
||||
union _x16rv2_8way_context_overlay
|
||||
{
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cubehashParam cube;
|
||||
simd_4way_context simd;
|
||||
hamsi512_8way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
shabal512_8way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_8way_context sha512;
|
||||
sph_tiger_context tiger;
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_context groestl;
|
||||
shavite512_4way_context shavite;
|
||||
echo_4way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
shavite512_context shavite;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
|
||||
extern x16rv2_8way_context_overlay x16rv2_ctx;
|
||||
|
||||
int x16rv2_8way_prehash( struct work * );
|
||||
int x16rv2_8way_hash( void *state, const void *input, const int thrid );
|
||||
//int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int x16rv2_8way_hash( void *state, const void *input, int thrid );
|
||||
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(X16RV2_4WAY)
|
||||
|
||||
union _x16rv2_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
shavite512_2way_context shavite;
|
||||
echo_2way_context echo;
|
||||
int x16rv2_4way_hash( void *state, const void *input, int thrid );
|
||||
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
shavite512_context shavite;
|
||||
hashState_echo echo;
|
||||
|
||||
int x16rv2_hash( void *state, const void *input, int thr_id );
|
||||
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cubehashParam cube;
|
||||
simd_2way_context simd;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
shabal512_4way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_4way_context sha512;
|
||||
sph_tiger_context tiger;
|
||||
};
|
||||
|
||||
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
|
||||
extern x16rv2_4way_context_overlay x16rv2_ctx;
|
||||
// x16rt, veil
|
||||
#if defined(X16R_8WAY)
|
||||
|
||||
int x16rv2_4way_hash( void *state, const void *input, const int thrid );
|
||||
int x16rv2_4way_prehash( struct work * );
|
||||
//void x16rt_8way_hash( void *state, const void *input );
|
||||
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(X16R_4WAY)
|
||||
|
||||
//void x16rt_4way_hash( void *state, const void *input );
|
||||
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
int x16rv2_hash( void *state, const void *input, const int thr_id );
|
||||
int x16rv2_prehash( const struct work * );
|
||||
//void x16rt_hash( void *state, const void *input );
|
||||
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
// x21s
|
||||
#if defined(X16R_8WAY)
|
||||
|
||||
int x21s_8way_hash( void *state, const void *input, const int thrid );
|
||||
int x21s_8way_hash( void *state, const void *input, int thrid );
|
||||
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool x21s_8way_thread_init();
|
||||
|
||||
#elif defined(X16R_4WAY)
|
||||
|
||||
int x21s_4way_hash( void *state, const void *input, const int thrid );
|
||||
int x21s_4way_hash( void *state, const void *input, int thrid );
|
||||
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool x21s_4way_thread_init();
|
||||
|
||||
#else
|
||||
|
||||
int x21s_hash( void *state, const void *input, const int thr_id );
|
||||
int x21s_hash( void *state, const void *input, int thr_id );
|
||||
int scanhash_x21s( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool x21s_thread_init();
|
||||
|
||||
#endif
|
||||
|
||||
//void hex_hash( void *state, const void *input );
|
||||
int scanhash_hex( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
|
@@ -10,7 +10,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
void x16r_do_prehash( const void *edata )
|
||||
void x16r_prehash( void *edata, void *pdata )
|
||||
{
|
||||
const char elem = x16r_hash_order[0];
|
||||
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||
@@ -48,7 +48,7 @@ void x16r_do_prehash( const void *edata )
|
||||
}
|
||||
}
|
||||
|
||||
int x16r_hash_generic( void* output, const void* input, const int thrid )
|
||||
int x16r_hash_generic( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[16];
|
||||
x16r_context_overlay ctx;
|
||||
@@ -192,15 +192,7 @@ int x16r_hash_generic( void* output, const void* input, const int thrid )
|
||||
return true;
|
||||
}
|
||||
|
||||
int x16r_prehash( const struct work *work )
|
||||
{
|
||||
mm128_bswap32_80( x16r_edata, work->data );
|
||||
x16r_gate_get_hash_order( work, x16r_hash_order );
|
||||
x16r_do_prehash( x16r_edata );
|
||||
return 1;
|
||||
}
|
||||
|
||||
int x16r_hash( void* output, const void* input, const int thrid )
|
||||
int x16r_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint8_t hash[64] __attribute__ ((aligned (64)));
|
||||
if ( !x16r_hash_generic( hash, input, thrid ) )
|
||||
@@ -213,8 +205,8 @@ int x16r_hash( void* output, const void* input, const int thrid )
|
||||
int scanhash_x16r( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(32) hash32[8];
|
||||
uint32_t _ALIGN(32) edata[20];
|
||||
uint32_t _ALIGN(128) hash32[8];
|
||||
uint32_t _ALIGN(128) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -224,14 +216,24 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( edata, x16r_edata, sizeof edata );
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
if ( s_ntime != pdata[17] )
|
||||
{
|
||||
uint32_t ntime = swab32(pdata[17]);
|
||||
x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
x16r_prehash( edata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
edata[19] = nonce;
|
||||
if ( algo_gate.hash( hash32, edata, thr_id ) )
|
||||
if ( x16r_hash( hash32, edata, thr_id ) )
|
||||
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( nonce );
|
||||
|
113
algo/x16/x16rt-4way.c
Normal file
113
algo/x16/x16rt-4way.c
Normal file
@@ -0,0 +1,113 @@
|
||||
#include "x16r-gate.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined (X16R_8WAY)
|
||||
|
||||
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) timeHash[8*8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const int thr_id = mythr->id;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
|
||||
if ( s_ntime != masked_ntime )
|
||||
{
|
||||
x16rt_getTimeHash( masked_ntime, &timeHash );
|
||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||
s_ntime = masked_ntime;
|
||||
if ( !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
|
||||
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
||||
}
|
||||
|
||||
x16r_8way_prehash( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
|
||||
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( x16r_8way_hash( hash, vdata, thr_id ) )
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n+i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (X16R_4WAY)
|
||||
|
||||
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) timeHash[4*8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
|
||||
if ( s_ntime != masked_ntime )
|
||||
{
|
||||
x16rt_getTimeHash( masked_ntime, &timeHash );
|
||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||
s_ntime = masked_ntime;
|
||||
if ( !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
|
||||
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
||||
}
|
||||
|
||||
x16r_4way_prehash( vdata, pdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( x16r_4way_hash( hash, vdata, thr_id ) )
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n+i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( ( n < last_nonce ) && !(*restart) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
53
algo/x16/x16rt.c
Normal file
53
algo/x16/x16rt.c
Normal file
@@ -0,0 +1,53 @@
|
||||
#include "x16r-gate.h"
|
||||
|
||||
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
|
||||
|
||||
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash32[8];
|
||||
uint32_t _ALIGN(128) edata[20];
|
||||
uint32_t _ALIGN(64) timeHash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const int thr_id = mythr->id;
|
||||
uint32_t nonce = first_nonce;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
|
||||
if ( s_ntime != masked_ntime )
|
||||
{
|
||||
x16rt_getTimeHash( masked_ntime, &timeHash );
|
||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||
s_ntime = masked_ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
|
||||
x16r_hash_order, swab32( pdata[17] ), timeHash );
|
||||
}
|
||||
|
||||
x16r_prehash( edata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
edata[19] = nonce;
|
||||
if ( x16r_hash( hash32, edata, thr_id ) )
|
||||
if ( valid_hash( hash32, ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = bswap_32( nonce );
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !(*restart) );
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // !defined(X16R_8WAY) && !defined(X16R_4WAY)
|
||||
|
@@ -12,73 +12,37 @@
|
||||
|
||||
#if defined (X16RV2_8WAY)
|
||||
|
||||
void x16rv2_8way_do_prehash( void *vdata, void *pdata )
|
||||
union _x16rv2_8way_context_overlay
|
||||
{
|
||||
uint32_t vdata32[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cubehashParam cube;
|
||||
simd_4way_context simd;
|
||||
hamsi512_8way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
shabal512_8way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_8way_context sha512;
|
||||
sph_tiger_context tiger;
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_context groestl;
|
||||
shavite512_4way_context shavite;
|
||||
echo_4way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
shavite512_context shavite;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
const char elem = x16r_hash_order[0];
|
||||
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||
typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
|
||||
static __thread x16rv2_8way_context_overlay x16rv2_ctx;
|
||||
|
||||
switch ( algo )
|
||||
{
|
||||
case JH:
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
jh512_8way_init( &x16rv2_ctx.jh );
|
||||
jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
|
||||
break;
|
||||
case KECCAK:
|
||||
case LUFFA:
|
||||
case SHA_512:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
sph_tiger_init( &x16rv2_ctx.tiger );
|
||||
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
|
||||
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
case SKEIN:
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
skein512_8way_init( &x16rv2_ctx.skein );
|
||||
skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
|
||||
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
case HAMSI:
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
hamsi512_8way_init( &x16rv2_ctx.hamsi );
|
||||
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
|
||||
break;
|
||||
case SHABAL:
|
||||
mm256_bswap32_intrlv80_8x32( vdata32, pdata );
|
||||
shabal512_8way_init( &x16rv2_ctx.shabal );
|
||||
shabal512_8way_update( &x16rv2_ctx.shabal, vdata32, 64 );
|
||||
rintrlv_8x32_8x64( vdata, vdata32, 640 );
|
||||
break;
|
||||
case WHIRLPOOL:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
sph_whirlpool_init( &x16rv2_ctx.whirlpool );
|
||||
sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
|
||||
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
default:
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
}
|
||||
}
|
||||
|
||||
int x16rv2_8way_prehash( struct work *work )
|
||||
{
|
||||
x16r_gate_get_hash_order( work, x16r_hash_order );
|
||||
x16rv2_8way_do_prehash( x16r_8way_vdata, work->data );
|
||||
return 1;
|
||||
}
|
||||
|
||||
int x16rv2_8way_hash( void* output, const void* input, const int thrid )
|
||||
int x16rv2_8way_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
|
||||
uint32_t hash0[24] __attribute__ ((aligned (32)));
|
||||
@@ -593,28 +557,50 @@ int x16rv2_8way_hash( void* output, const void* input, const int thrid )
|
||||
return 1;
|
||||
}
|
||||
|
||||
#elif defined (X16RV2_4WAY)
|
||||
|
||||
// Pad the 24 bytes tiger hash to 64 bytes
|
||||
inline void padtiger512( uint32_t* hash )
|
||||
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
|
||||
}
|
||||
|
||||
void x16rv2_4way_do_prehash( void *vdata, void *pdata )
|
||||
{
|
||||
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
uint32_t bedata1[2] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const int thr_id = mythr->id;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
bedata1[0] = bswap_32( pdata[1] );
|
||||
bedata1[1] = bswap_32( pdata[2] );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
const uint32_t ntime = bswap_32( pdata[17] );
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
// Do midstate prehash on hash functions with block size <= 64 bytes.
|
||||
const char elem = x16r_hash_order[0];
|
||||
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||
|
||||
switch ( algo )
|
||||
{
|
||||
case JH:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
jh512_4way_init( &x16rv2_ctx.jh );
|
||||
jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
jh512_8way_init( &x16rv2_ctx.jh );
|
||||
jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
|
||||
break;
|
||||
case KECCAK:
|
||||
case LUFFA:
|
||||
@@ -622,45 +608,100 @@ void x16rv2_4way_do_prehash( void *vdata, void *pdata )
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
sph_tiger_init( &x16rv2_ctx.tiger );
|
||||
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
|
||||
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
case SKEIN:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
skein512_8way_init( &x16rv2_ctx.skein );
|
||||
skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
|
||||
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
case HAMSI:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
hamsi512_4way_init( &x16rv2_ctx.hamsi );
|
||||
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
hamsi512_8way_init( &x16rv2_ctx.hamsi );
|
||||
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
|
||||
break;
|
||||
case SHABAL:
|
||||
mm128_bswap32_intrlv80_4x32( vdata32, pdata );
|
||||
shabal512_4way_init( &x16rv2_ctx.shabal );
|
||||
shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
|
||||
rintrlv_4x32_4x64( vdata, vdata32, 640 );
|
||||
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
|
||||
shabal512_8way_init( &x16rv2_ctx.shabal );
|
||||
shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 );
|
||||
rintrlv_8x32_8x64( vdata, vdata2, 640 );
|
||||
break;
|
||||
case WHIRLPOOL:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
sph_whirlpool_init( &x16rv2_ctx.whirlpool );
|
||||
sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
|
||||
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
default:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
}
|
||||
}
|
||||
|
||||
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
|
||||
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( x16rv2_8way_hash( hash, vdata, thr_id ) )
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n+i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int x16rv2_4way_prehash( struct work *work )
|
||||
#elif defined (X16RV2_4WAY)
|
||||
|
||||
union _x16rv2_4way_context_overlay
|
||||
{
|
||||
x16r_gate_get_hash_order( work, x16r_hash_order );
|
||||
x16rv2_4way_do_prehash( x16r_4way_vdata, work->data );
|
||||
return 1;
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
shavite512_2way_context shavite;
|
||||
echo_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
shavite512_context shavite;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cubehashParam cube;
|
||||
simd_2way_context simd;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
shabal512_4way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_4way_context sha512;
|
||||
sph_tiger_context tiger;
|
||||
};
|
||||
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
|
||||
|
||||
static __thread x16rv2_4way_context_overlay x16rv2_ctx;
|
||||
|
||||
// Pad the 24 bytes tiger hash to 64 bytes
|
||||
inline void padtiger512( uint32_t* hash )
|
||||
{
|
||||
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
|
||||
}
|
||||
|
||||
int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||
@@ -1007,4 +1048,107 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t edata[20];
|
||||
uint32_t bedata1[2];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0fff;
|
||||
|
||||
bedata1[0] = bswap_32( pdata[1] );
|
||||
bedata1[1] = bswap_32( pdata[2] );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
const uint32_t ntime = bswap_32(pdata[17]);
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
// Do midstate prehash on hash functions with block size <= 64 bytes.
|
||||
const char elem = x16r_hash_order[0];
|
||||
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||
switch ( algo )
|
||||
{
|
||||
case JH:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
jh512_4way_init( &x16rv2_ctx.jh );
|
||||
jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
|
||||
break;
|
||||
case KECCAK:
|
||||
case LUFFA:
|
||||
case SHA_512:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
sph_tiger_init( &x16rv2_ctx.tiger );
|
||||
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
|
||||
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
case SKEIN:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
|
||||
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
case HAMSI:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
hamsi512_4way_init( &x16rv2_ctx.hamsi );
|
||||
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
|
||||
break;
|
||||
case SHABAL:
|
||||
mm128_bswap32_intrlv80_4x32( vdata32, pdata );
|
||||
shabal512_4way_init( &x16rv2_ctx.shabal );
|
||||
shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
|
||||
rintrlv_4x32_4x64( vdata, vdata32, 640 );
|
||||
break;
|
||||
case WHIRLPOOL:
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
sph_whirlpool_init( &x16rv2_ctx.whirlpool );
|
||||
sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
|
||||
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
default:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
}
|
||||
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
|
||||
do
|
||||
{
|
||||
if ( x16rv2_4way_hash( hash, vdata, thr_id ) )
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n+i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -43,16 +43,9 @@ inline void padtiger512(uint32_t* hash) {
|
||||
for (int i = (24/4); i < (64/4); i++) hash[i] = 0;
|
||||
}
|
||||
|
||||
// no prehash
|
||||
int x16rv2_prehash( const struct work *work )
|
||||
int x16rv2_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
x16r_gate_get_hash_order( work, x16r_hash_order );
|
||||
return 1;
|
||||
}
|
||||
|
||||
int x16rv2_hash( void* output, const void* input, const int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(32) hash[16];
|
||||
uint32_t _ALIGN(128) hash[16];
|
||||
x16rv2_context_overlay ctx;
|
||||
void *in = (void*) input;
|
||||
int size = 80;
|
||||
@@ -177,4 +170,52 @@ int x16rv2_hash( void* output, const void* input, const int thrid )
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash32[8];
|
||||
uint32_t _ALIGN(128) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const int thr_id = mythr->id;
|
||||
uint32_t nonce = first_nonce;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
|
||||
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
|
||||
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
|
||||
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
|
||||
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
if ( s_ntime != pdata[17] )
|
||||
{
|
||||
uint32_t ntime = swab32(pdata[17]);
|
||||
x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_DEBUG, "hash order %s (%08x)",
|
||||
x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
do
|
||||
{
|
||||
edata[19] = nonce;
|
||||
if ( x16rv2_hash( hash32, edata, thr_id ) )
|
||||
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( nonce );
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !(*restart) );
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -30,7 +30,7 @@ union _x21s_8way_context_overlay
|
||||
|
||||
typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;
|
||||
|
||||
int x21s_8way_hash( void* output, const void* input, const int thrid )
|
||||
int x21s_8way_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
|
||||
uint8_t shash[64*8] __attribute__ ((aligned (64)));
|
||||
@@ -129,6 +129,66 @@ int x21s_8way_hash( void* output, const void* input, const int thrid )
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &hash[7<<3];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t bedata1[2] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
bedata1[0] = bswap_32( pdata[1] );
|
||||
bedata1[1] = bswap_32( pdata[2] );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
uint32_t ntime = bswap_32( pdata[17] );
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
x16r_8way_prehash( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
|
||||
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( x21s_8way_hash( hash, vdata, thr_id ) )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool x21s_8way_thread_init()
|
||||
{
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
|
||||
@@ -155,7 +215,7 @@ union _x21s_4way_context_overlay
|
||||
|
||||
typedef union _x21s_4way_context_overlay x21s_4way_context_overlay;
|
||||
|
||||
int x21s_4way_hash( void* output, const void* input, const int thrid )
|
||||
int x21s_4way_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
|
||||
uint8_t shash[64*4] __attribute__ ((aligned (64)));
|
||||
@@ -231,6 +291,58 @@ int x21s_4way_hash( void* output, const void* input, const int thrid )
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[16*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t bedata1[2] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
bedata1[0] = bswap_32( pdata[1] );
|
||||
bedata1[1] = bswap_32( pdata[2] );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
uint32_t ntime = bswap_32( pdata[17] );
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
x16r_4way_prehash( vdata, pdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( x21s_4way_hash( hash, vdata, thr_id ) )
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n+i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool x21s_4way_thread_init()
|
||||
{
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
|
||||
|
@@ -27,7 +27,7 @@ union _x21s_context_overlay
|
||||
};
|
||||
typedef union _x21s_context_overlay x21s_context_overlay;
|
||||
|
||||
int x21s_hash( void* output, const void* input, const int thrid )
|
||||
int x21s_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[16];
|
||||
x21s_context_overlay ctx;
|
||||
@@ -57,6 +57,50 @@ int x21s_hash( void* output, const void* input, const int thrid )
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_x21s( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash32[8];
|
||||
uint32_t _ALIGN(128) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const int thr_id = mythr->id;
|
||||
uint32_t nonce = first_nonce;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
if ( s_ntime != pdata[17] )
|
||||
{
|
||||
uint32_t ntime = swab32(pdata[17]);
|
||||
x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
x16r_prehash( edata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
edata[19] = nonce;
|
||||
if ( x21s_hash( hash32, edata, thr_id ) )
|
||||
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( nonce );
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !(*restart) );
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool x21s_thread_init()
|
||||
{
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
|
||||
|
@@ -31,26 +31,8 @@
|
||||
|
||||
yespower_params_t yespower_params;
|
||||
|
||||
// master g_work
|
||||
sha256_context yespower_sha256_prehash_ctx;
|
||||
uint32_t _ALIGN(64) yespower_endiandata[20];
|
||||
|
||||
// local work
|
||||
__thread sha256_context sha256_prehash_ctx;
|
||||
|
||||
|
||||
int yespower_sha256_prehash( struct work *work )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
|
||||
for ( int k = 0; k < 19; k++ )
|
||||
be32enc( &yespower_endiandata[k], pdata[k] );
|
||||
|
||||
sha256_ctx_init( &yespower_sha256_prehash_ctx );
|
||||
sha256_update( &yespower_sha256_prehash_ctx, yespower_endiandata, 64 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
// YESPOWER
|
||||
|
||||
int yespower_hash( const char *input, char *output, uint32_t len, int thrid )
|
||||
@@ -71,15 +53,14 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
|
||||
// pthread_rwlock_rdlock( &g_work_lock );
|
||||
|
||||
memcpy( endiandata, yespower_endiandata, sizeof endiandata );
|
||||
memcpy( &sha256_prehash_ctx, &yespower_sha256_prehash_ctx, sizeof sha256_prehash_ctx );
|
||||
|
||||
// pthread_rwlock_unlock( &g_work_lock );
|
||||
|
||||
for ( int k = 0; k < 19; k++ )
|
||||
be32enc( &endiandata[k], pdata[k] );
|
||||
endiandata[19] = n;
|
||||
|
||||
// do sha256 prehash
|
||||
sha256_ctx_init( &sha256_prehash_ctx );
|
||||
sha256_update( &sha256_prehash_ctx, endiandata, 64 );
|
||||
|
||||
do {
|
||||
if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) )
|
||||
if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
|
||||
@@ -159,7 +140,6 @@ bool register_yespower_algo( algo_gate_t* gate )
|
||||
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
gate->prehash = (void*)&yespower_sha256_prehash;
|
||||
gate->hash = (void*)&yespower_hash;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
@@ -174,7 +154,6 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
|
||||
yespower_params.perslen = 0;
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
gate->prehash = (void*)&yespower_sha256_prehash;
|
||||
gate->hash = (void*)&yespower_hash;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
@@ -186,7 +165,6 @@ bool register_yescrypt_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
gate->prehash = (void*)&yespower_sha256_prehash;
|
||||
yespower_params.version = YESPOWER_0_5;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
@@ -220,7 +198,6 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
gate->prehash = (void*)&yespower_sha256_prehash;
|
||||
yespower_params.version = YESPOWER_0_5;
|
||||
yespower_params.N = 2048;
|
||||
yespower_params.r = 8;
|
||||
@@ -234,7 +211,6 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
gate->prehash = (void*)&yespower_sha256_prehash;
|
||||
yespower_params.version = YESPOWER_0_5;
|
||||
yespower_params.N = 4096;
|
||||
yespower_params.r = 16;
|
||||
@@ -248,7 +224,6 @@ bool register_yescryptr32_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yespower;
|
||||
gate->prehash = (void*)&yespower_sha256_prehash;
|
||||
yespower_params.version = YESPOWER_0_5;
|
||||
yespower_params.N = 4096;
|
||||
yespower_params.r = 32;
|
||||
|
@@ -80,8 +80,6 @@ extern yespower_params_t yespower_params;
|
||||
|
||||
extern __thread sha256_context sha256_prehash_ctx;
|
||||
|
||||
int yespower_sha256_prehash( struct work *work );
|
||||
|
||||
/**
|
||||
* yespower_init_local(local):
|
||||
* Initialize the thread-local (RAM) data structure. Actual memory allocation
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.3.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.3.1.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
|
||||
@@ -608,8 +608,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.21.3'
|
||||
PACKAGE_STRING='cpuminer-opt 3.21.3'
|
||||
PACKAGE_VERSION='3.21.3.1'
|
||||
PACKAGE_STRING='cpuminer-opt 3.21.3.1'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.21.3 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.21.3.1 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1432,7 +1432,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.21.3:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.21.3.1:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1538,7 +1538,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.21.3
|
||||
cpuminer-opt configure 3.21.3.1
|
||||
generated by GNU Autoconf 2.71
|
||||
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.21.3, which was
|
||||
It was created by cpuminer-opt $as_me 3.21.3.1, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -3593,7 +3593,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.21.3'
|
||||
VERSION='3.21.3.1'
|
||||
|
||||
|
||||
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
|
||||
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.21.3, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.21.3.1, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.21.3
|
||||
cpuminer-opt config.status 3.21.3.1
|
||||
configured by $0, generated by GNU Autoconf 2.71,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.21.3])
|
||||
AC_INIT([cpuminer-opt], [3.21.3.1])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
77
cpu-miner.c
77
cpu-miner.c
@@ -37,7 +37,6 @@
|
||||
#include <curl/curl.h>
|
||||
#include <jansson.h>
|
||||
#include <openssl/sha.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "sysinfos.c"
|
||||
#include "algo/sha/sha256d.h"
|
||||
|
||||
@@ -318,9 +317,8 @@ static void affine_to_cpu( struct thr_info *thr )
|
||||
if ( !ok )
|
||||
{
|
||||
last_error = GetLastError();
|
||||
if ( !thread )
|
||||
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
|
||||
last_error, thread );
|
||||
applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
|
||||
thread, last_error );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1727,9 +1725,9 @@ static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
|
||||
struct work *ret_work;
|
||||
int failures = 0;
|
||||
|
||||
ret_work = (struct work*) _mm_malloc( sizeof(*ret_work), 32 );
|
||||
if ( !ret_work ) return false;
|
||||
memset( ret_work, 0, sizeof(*ret_work) );
|
||||
ret_work = (struct work*) calloc( 1, sizeof(*ret_work) );
|
||||
if ( !ret_work )
|
||||
return false;
|
||||
|
||||
/* obtain new work from bitcoin via JSON-RPC */
|
||||
while ( !get_upstream_work( curl, ret_work ) )
|
||||
@@ -1738,23 +1736,22 @@ static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
|
||||
{
|
||||
applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
|
||||
free( ret_work );
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* pause, then restart work-request loop */
|
||||
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
|
||||
opt_fail_pause );
|
||||
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
|
||||
opt_fail_pause );
|
||||
sleep( opt_fail_pause );
|
||||
}
|
||||
|
||||
/* send work to requesting thread */
|
||||
if ( !tq_push(wc->thr->q, ret_work ) )
|
||||
free( ret_work );
|
||||
free( ret_work );
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static bool workio_submit_work(struct workio_cmd *wc, CURL *curl)
|
||||
{
|
||||
int failures = 0;
|
||||
@@ -1973,15 +1970,15 @@ static bool wanna_mine(int thr_id)
|
||||
float temp = cpu_temp(0);
|
||||
if (temp > opt_max_temp)
|
||||
{
|
||||
if ( !thr_id && !conditional_state[thr_id] && !opt_quiet )
|
||||
applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
|
||||
state = false;
|
||||
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
|
||||
applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp);
|
||||
state = false;
|
||||
}
|
||||
}
|
||||
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
|
||||
{
|
||||
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
|
||||
applog(LOG_NOTICE, "network diff too high, waiting...");
|
||||
applog(LOG_INFO, "network diff too high, waiting...");
|
||||
state = false;
|
||||
}
|
||||
if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate)
|
||||
@@ -1990,14 +1987,12 @@ static bool wanna_mine(int thr_id)
|
||||
{
|
||||
char rate[32];
|
||||
format_hashrate(opt_max_rate, rate);
|
||||
applog(LOG_NOTICE, "network hashrate too high (%s), waiting...", rate);
|
||||
applog(LOG_INFO, "network hashrate too high, waiting %s...", rate);
|
||||
}
|
||||
state = false;
|
||||
}
|
||||
|
||||
if ( conditional_state[thr_id] && state && !thr_id && !opt_quiet )
|
||||
applog(LOG_NOTICE, "...resuming" );
|
||||
conditional_state[thr_id] = (uint8_t) !state;
|
||||
if (thr_id < MAX_CPUS)
|
||||
conditional_state[thr_id] = (uint8_t) !state;
|
||||
return state;
|
||||
}
|
||||
|
||||
@@ -2122,10 +2117,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
t++ );
|
||||
|
||||
g_work_time = time(NULL);
|
||||
|
||||
// Do midstate prehash
|
||||
algo_gate.prehash( g_work );
|
||||
|
||||
restart_threads();
|
||||
|
||||
pthread_mutex_unlock( &sctx->work_lock );
|
||||
@@ -2149,7 +2140,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
else if ( g_work->job_id && new_job )
|
||||
applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
|
||||
sctx->block_height, net_diff, g_work->job_id );
|
||||
else if ( opt_debug )
|
||||
else if ( !opt_quiet )
|
||||
{
|
||||
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
|
||||
g_work->xnonce2_len );
|
||||
@@ -2344,9 +2335,6 @@ static void *miner_thread( void *userdata )
|
||||
goto out;
|
||||
}
|
||||
g_work_time = time(NULL);
|
||||
|
||||
// do midstate prehash
|
||||
algo_gate.prehash( &g_work );
|
||||
restart_threads();
|
||||
}
|
||||
|
||||
@@ -2366,14 +2354,6 @@ static void *miner_thread( void *userdata )
|
||||
if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
|
||||
continue;
|
||||
|
||||
// conditional mining
|
||||
if ( unlikely( !wanna_mine( thr_id ) ) )
|
||||
{
|
||||
restart_threads();
|
||||
sleep(5);
|
||||
continue;
|
||||
}
|
||||
|
||||
// opt_scantime expressed in hashes
|
||||
max64 = opt_scantime * thr_hashrates[thr_id];
|
||||
|
||||
@@ -2520,6 +2500,14 @@ static void *miner_thread( void *userdata )
|
||||
}
|
||||
}
|
||||
} // benchmark
|
||||
|
||||
// conditional mining
|
||||
if ( unlikely( !wanna_mine( thr_id ) ) )
|
||||
{
|
||||
sleep(5);
|
||||
continue;
|
||||
}
|
||||
|
||||
} // miner_thread loop
|
||||
|
||||
out:
|
||||
@@ -3694,7 +3682,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
#if defined(WIN32)
|
||||
|
||||
// Get the number of cpus, display after parsing command line
|
||||
// Are Windows CPU Groups supported?
|
||||
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
|
||||
num_cpus = 0;
|
||||
num_cpugroups = GetActiveProcessorGroupCount();
|
||||
@@ -3703,8 +3691,8 @@ int main(int argc, char *argv[])
|
||||
int cpus = GetActiveProcessorCount( i );
|
||||
num_cpus += cpus;
|
||||
|
||||
// if (opt_debug)
|
||||
// applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
|
||||
if (opt_debug)
|
||||
applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
|
||||
}
|
||||
|
||||
#else
|
||||
@@ -3721,7 +3709,7 @@ int main(int argc, char *argv[])
|
||||
sysctl(req, 2, &num_cpus, &len, NULL, 0);
|
||||
#else
|
||||
num_cpus = 1;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if ( num_cpus < 1 )
|
||||
num_cpus = 1;
|
||||
@@ -3873,11 +3861,6 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
|
||||
if ( !opt_quiet )
|
||||
applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
|
||||
#endif
|
||||
|
||||
if ( opt_affinity && num_cpus > max_cpus )
|
||||
{
|
||||
applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
|
||||
@@ -3889,7 +3872,7 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
|
||||
{
|
||||
while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
|
||||
while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;
|
||||
thread_affinity_map[ thr ] = cpu % num_cpus;
|
||||
}
|
||||
if ( !opt_quiet )
|
||||
|
4
miner.h
4
miner.h
@@ -392,8 +392,8 @@ float cpu_temp( int core );
|
||||
|
||||
struct work
|
||||
{
|
||||
uint32_t data[48] __attribute__ ((aligned (64)));
|
||||
uint32_t target[8] __attribute__ ((aligned (32)));
|
||||
uint32_t target[8] __attribute__ ((aligned (64)));
|
||||
uint32_t data[48] __attribute__ ((aligned (64)));
|
||||
double targetdiff;
|
||||
double sharediff;
|
||||
double stratum_diff;
|
||||
|
@@ -470,7 +470,7 @@ static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1,
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
static inline void mm128_bswap32_80( void *d, const void *s )
|
||||
static inline void mm128_bswap32_80( void *d, void *s )
|
||||
{
|
||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
|
||||
@@ -482,7 +482,7 @@ static inline void mm128_bswap32_80( void *d, const void *s )
|
||||
|
||||
#else
|
||||
|
||||
static inline void mm128_bswap32_80( void *d, const void *s )
|
||||
static inline void mm128_bswap32_80( void *d, void *s )
|
||||
{
|
||||
( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] );
|
||||
( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] );
|
||||
|
@@ -385,7 +385,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm128_rol_var_32( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
/* Not used
|
||||
//
|
||||
// Limited 2 input shuffle, combines shuffle with blend. The destination low
|
||||
// half is always taken from v1, and the high half from v2.
|
||||
#define mm128_shuffle2_64( v1, v2, c ) \
|
||||
@@ -395,7 +395,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm128_shuffle2_32( v1, v2, c ) \
|
||||
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
|
||||
_mm_castsi128_ps( v2 ), c ) );
|
||||
*/
|
||||
|
||||
//
|
||||
// Rotate vector elements accross all lanes
|
||||
@@ -407,7 +406,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
|
||||
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
|
||||
|
||||
/* Not used
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
// Rotate right by c bytes, no SSE2 equivalent.
|
||||
@@ -415,7 +413,6 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
{ return _mm_alignr_epi8( v, v, c ); }
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
|
||||
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
|
||||
@@ -558,25 +555,68 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
v2 = _mm_xor_si128( v1, v2 ); \
|
||||
v1 = _mm_xor_si128( v1, v2 );
|
||||
|
||||
// Concatenate { hi, lo }, rotate right by c elements and return low 128 bits.
|
||||
|
||||
// alignr for 32 & 64 bit elements is only available with AVX512 but
|
||||
// emulated here. Shift argument is not needed, it's always 1.
|
||||
// Behaviour is otherwise consistent with Intel alignr intrinsics.
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
// _mm_alignr_epi32 & _mm_alignr_epi64 are only available with AVX512VL but
|
||||
// are emulated here using _mm_alignr_epi8. There are no fast equivalents for
|
||||
// 256 bit vectors, though there is no for this functionality.
|
||||
|
||||
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
|
||||
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
|
||||
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
|
||||
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_alignr_64( hi, lo, c ) \
|
||||
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
|
||||
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \
|
||||
_mm_srli_si128( v2, 8 ) )
|
||||
|
||||
#define mm128_alignr_32( hi, lo, c ) \
|
||||
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
|
||||
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 4 ), \
|
||||
_mm_srli_si128( v2, 4 ) )
|
||||
|
||||
#endif
|
||||
|
||||
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
|
||||
// vrol & vror are deprecated and do not exist for larger vectors.
|
||||
// Their only use is by lyra2 blake2b when AVX2 is not available and is
|
||||
// grandfathered.
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define mm128_vror256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
||||
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
|
||||
v2 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define mm128_vror256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
|
||||
_mm_slli_si128( v2, 8 ) ); \
|
||||
v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
|
||||
_mm_slli_si128( v1, 8 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm128_vrol256_64( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
|
||||
_mm_srli_si128( v2, 8 ) ); \
|
||||
v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
|
||||
_mm_srli_si128( v1, 8 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
#endif // SSE4.1 else SSE2
|
||||
|
||||
#endif // __SSE2__
|
||||
#endif // SIMD_128_H__
|
||||
|
@@ -239,8 +239,8 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
|
||||
// Mask making
|
||||
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
|
||||
// Create a 64 or 32 bit integer mask from MSB of 64 or 32 bit elements.
|
||||
// Effectively a sign test: if (mask[n]) then -1 else 0.
|
||||
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
|
||||
// Effectively a sign test.
|
||||
|
||||
#define mm256_movmask_64( v ) \
|
||||
_mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
|
||||
@@ -348,7 +348,7 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
|
||||
_mm256_srli_epi16( v, 16-(c) ) )
|
||||
|
||||
// Deprecated. Obsolete sm3, the only user, is grandfathered.
|
||||
// Deprecated.
|
||||
#define mm256_rol_var_32( v, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
|
||||
_mm256_srli_epi32( v, 32-(c) ) )
|
||||
@@ -391,7 +391,6 @@ static inline __m256i mm256_shufll_32( const __m256i v )
|
||||
//
|
||||
// Rotate elements within each 128 bit lane of 256 bit vector.
|
||||
|
||||
/* Not used
|
||||
// Limited 2 input shuffle
|
||||
#define mm256_shuffle2_64( v1, v2, c ) \
|
||||
_mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
|
||||
@@ -400,7 +399,6 @@ static inline __m256i mm256_shufll_32( const __m256i v )
|
||||
#define mm256_shuffle2_32( v1, v2, c ) \
|
||||
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
|
||||
_mm256_castsi256_ps( v2 ), c ) );
|
||||
*/
|
||||
|
||||
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
|
||||
#define mm256_shuflr128_64 mm256_swap128_64
|
||||
@@ -513,8 +511,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
} while(0)
|
||||
|
||||
// swap 256 bit vectors in place.
|
||||
// Deprecated, Shabal is the only user and it should be modified to reorder
|
||||
// instructions.
|
||||
// This should be avoided, it's more efficient to switch references.
|
||||
#define mm256_swap512_256( v1, v2 ) \
|
||||
v1 = _mm256_xor_si256( v1, v2 ); \
|
||||
v2 = _mm256_xor_si256( v1, v2 ); \
|
||||
|
@@ -409,20 +409,19 @@ static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
|
||||
static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
{ return _mm512_alignr_epi32( v, v, n ); }
|
||||
|
||||
/* Not used
|
||||
#define mm512_shuflr_16( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x0000001F001E001D, 0x001C001B001A0019, \
|
||||
0x0018001700160015, 0x0014001300120011, \
|
||||
0x0010000F000E000D, 0x000C000B000A0009, \
|
||||
0x0008000700060005, 0x0004000300020001 ), v )
|
||||
0X0018001700160015, 0X0014001300120011, \
|
||||
0X0010000F000E000D, 0X000C000B000A0009, \
|
||||
0X0008000700060005, 0X0004000300020001 ), v )
|
||||
|
||||
#define mm512_shufll_16( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x001E001D001C001B, 0x001A001900180017, \
|
||||
0x0016001500140013, 0x001200110010000F, \
|
||||
0x000E000D000C000B, 0x000A000900080007, \
|
||||
0x0006000500040003, 0x000200010000001F ), v )
|
||||
0X0016001500140013, 0X001200110010000F, \
|
||||
0X000E000D000C000B, 0X000A000900080007, \
|
||||
0X0006000500040003, 0X000200010000001F ), v )
|
||||
|
||||
#define mm512_shuflr_8( v ) \
|
||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||
@@ -437,7 +436,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
0x2E2D2C2B2A292827, 0x262524232221201F, \
|
||||
0x1E1D1C1B1A191817, 0x161514131211100F, \
|
||||
0x0E0D0C0B0A090807, 0x060504030201003F ) )
|
||||
*/
|
||||
|
||||
// 256 bit lanes used only by lyra2, move these there
|
||||
// Rotate elements within 256 bit lanes of 512 bit vector.
|
||||
@@ -451,7 +449,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
|
||||
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||
|
||||
/* Not used
|
||||
/*
|
||||
// Rotate 256 bit lanes by one 32 bit element
|
||||
#define mm512_shuflr256_32( v ) \
|
||||
_mm512_permutexvar_epi32( m512_const_64( \
|
||||
@@ -498,7 +496,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
//
|
||||
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
|
||||
|
||||
/* Not used
|
||||
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
|
||||
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
|
||||
// destination elements must come from a specific source arg.
|
||||
@@ -509,10 +506,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
#define mm512_shuffle2_32( v1, v2, c ) \
|
||||
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
|
||||
_mm512_castsi512_ps( v2 ), c ) );
|
||||
*/
|
||||
|
||||
// These hard coded shuffles exist for consistency with AVX2 & SSE2 where
|
||||
// efficient generic versions don't exist.
|
||||
// Swap 64 bits in each 128 bit lane
|
||||
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
|
||||
#define mm512_shuflr128_64 mm512_swap128_64
|
||||
@@ -522,11 +516,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
|
||||
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
|
||||
|
||||
/* Not used
|
||||
// Rotate right 128 bit lanes by c bytes, efficient generic version of above.
|
||||
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
|
||||
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
||||
{ return _mm512_alignr_epi8( v, v, c ); }
|
||||
*/
|
||||
|
||||
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
|
||||
// can be done with ror & rol. Defined only for convenience and consistency
|
||||
|
@@ -17,7 +17,6 @@ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
|
||||
# used by GCC
|
||||
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
|
||||
# Support for Windows 7 CPU groups, AES sometimes not included in -march
|
||||
# Disabled due to CPU group incompatibilities between Intel and AMD CPU.
|
||||
export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
|
||||
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
|
||||
|
||||
@@ -46,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
|
||||
./clean-all.sh || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-march=icelake-client $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
|
||||
@@ -54,7 +53,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
|
||||
# AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx512.exe
|
||||
@@ -62,7 +61,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
|
||||
# AVX2 SHA VAES: Intel Alderlake, AMD Zen3
|
||||
make clean || echo done
|
||||
rm -f config.status
|
||||
CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
|
||||
@@ -70,7 +69,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
|
||||
# AVX2 AES SHA: AMD Zen1
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-march=znver1 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx2-sha.exe
|
||||
@@ -78,7 +77,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
|
||||
# AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx2.exe
|
||||
|
Reference in New Issue
Block a user