diff --git a/Makefile.am b/Makefile.am index 630b294..4254e99 100644 --- a/Makefile.am +++ b/Makefile.am @@ -264,8 +264,6 @@ cpuminer_SOURCES = \ algo/x16/x16r-4way.c \ algo/x16/x16rv2.c \ algo/x16/x16rv2-4way.c \ - algo/x16/x16rt.c \ - algo/x16/x16rt-4way.c \ algo/x16/hex.c \ algo/x16/x21s-4way.c \ algo/x16/x21s.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index e03a31a..f852e1a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,18 @@ If not what makes it happen or not happen? Change Log ---------- +v3.22.3 + +#392 #379 #389 Fixed misaligned address segfault solo mining. +#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2. +#392 Fixed conditional mining. +#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries, + Windows binaries no longer support CPU groups, + Windows binaries support CPUs with up to 64 threads. +Midstate prehash is now centralized, done only once instead of by every thread +for selected algos. +Small optimizations to serialized vectoring. + v3.22.2 Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2. diff --git a/algo-gate-api.c b/algo-gate-api.c index 73ec286..5fc2e61 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -253,6 +253,7 @@ void init_algo_gate( algo_gate_t* gate ) gate->miner_thread_init = (void*)&return_true; gate->scanhash = (void*)&scanhash_generic; gate->hash = (void*)&null_hash; + gate->prehash = (void*)&return_true; gate->get_new_work = (void*)&std_get_new_work; gate->work_decode = (void*)&std_le_work_decode; gate->decode_extra_data = (void*)&do_nothing; diff --git a/algo-gate-api.h b/algo-gate-api.h index 0710802..c9b7520 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -119,7 +119,7 @@ typedef struct // to be registered with the gate. int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* ); -int ( *hash ) ( void*, const void*, int ); +int ( *hash ) ( void*, const void*, const int ); //optional, safe to use default in most cases @@ -127,6 +127,9 @@ int ( *hash ) ( void*, const void*, int ); // other initialization specific to miner threads. bool ( *miner_thread_init ) ( int ); +// Perform prehash after receiving new work +int ( *prehash ) ( struct work* ); + // Get thread local copy of blockheader with unique nonce. 
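The new prehash gate member runs once per work update on behalf of all miner threads, with return_true as the default so algorithms that don't register one are unaffected. A minimal sketch of the intended call pattern, assuming a hypothetical on_new_work() helper; the actual dispatch point in cpu-miner.c is not part of this diff:

/* Hypothetical call site: invoke the registered prehash once, on the
   thread that accepted the new work, while holding the write side of
   g_work_lock so that scanhash readers see a consistent midstate. */
static void on_new_work( struct work *g_work )
{
   pthread_rwlock_wrlock( &g_work_lock );
   algo_gate.prehash( g_work );    // once per work, not once per thread
   pthread_rwlock_unlock( &g_work_lock );
}

Scanhash functions take the read side of the same lock to copy the shared midstate, as the skein, lyra2z, allium and x16r hunks below show.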
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* ); diff --git a/algo/blake/blake2s-4way.c b/algo/blake/blake2s-4way.c index ea8b266..9e9a187 100644 --- a/algo/blake/blake2s-4way.c +++ b/algo/blake/blake2s-4way.c @@ -1,5 +1,6 @@ #include "blake2s-gate.h" #include "blake2s-hash-4way.h" +//#include "sph-blake2s.h" #include #include @@ -7,6 +8,43 @@ static __thread blake2s_16way_state blake2s_16w_ctx; +/* +static blake2s_16way_state blake2s_16w_ctx; +static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64))); +*/ +/* +int blake2s_16way_prehash( struct work *work ) +{ + uint32_t edata[20] __attribute__ ((aligned (64))); + blake2s_state ctx; + mm128_bswap32_80( edata, work->data ); + blake2s_init( &ctx, BLAKE2S_OUTBYTES ); + ctx.buflen = ctx.t[0] = 64; + blake2s_compress( &ctx, (const uint8_t*)edata ); + + blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES ); + intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, + ctx.h, ctx.h, ctx.h, ctx.h, + ctx.h, ctx.h, ctx.h, ctx.h, + ctx.h, ctx.h, ctx.h, ctx.h, 256 ); + intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata, + edata, edata, edata, edata, + edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + blake2s_16w_ctx.t[0] = 64; + return 1; +} +*/ +/* +int blake2s_16way_prehash( struct work *work ) +{ + mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data ); + blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES ); + blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 ); + return 1; +} +*/ + void blake2s_16way_hash( void *output, const void *input ) { blake2s_16way_state ctx; @@ -30,10 +68,40 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; int thr_id = mythr->id; +/* +// pthread_rwlock_rdlock( &g_work_lock ); + memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 ); +// casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 ); +// casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 ); +// casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 ); + +// pthread_rwlock_unlock( &g_work_lock ); +*/ +/* + uint32_t edata[20] __attribute__ ((aligned (64))); + blake2s_state ctx; + mm128_bswap32_80( edata, pdata ); + blake2s_init( &ctx, BLAKE2S_OUTBYTES ); + ctx.buflen = ctx.t[0] = 64; + blake2s_compress( &ctx, (const uint8_t*)edata ); + + blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES ); + intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, + ctx.h, ctx.h, ctx.h, ctx.h, + ctx.h, ctx.h, ctx.h, ctx.h, + ctx.h, ctx.h, ctx.h, ctx.h, 256 ); + intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata, + edata, edata, edata, edata, + edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + blake2s_16w_ctx.t[0] = 64; +*/ + mm512_bswap32_intrlv80_16x32( vdata, pdata ); blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES ); blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 ); + do { *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, @@ -63,6 +131,36 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, static __thread blake2s_8way_state blake2s_8w_ctx; +/* +static blake2s_8way_state blake2s_8w_ctx; +static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32))); + +int blake2s_8way_prehash( struct work *work ) +{ + uint32_t edata[20] __attribute__ ((aligned (64))); + blake2s_state ctx; + mm128_bswap32_80( edata, work->data ); + blake2s_init( &ctx, BLAKE2S_OUTBYTES ); + 
ctx.buflen = ctx.t[0] = 64; + blake2s_compress( &ctx, (const uint8_t*)edata ); + + blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES ); + + for ( int i = 0; i < 8; i++ ) + casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] ); + + casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] ); + casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] ); + casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] ); + +// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, +// ctx.h, ctx.h, ctx.h, ctx.h, 256 ); +// intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata, +// edata, edata, edata, edata, 640 ); + blake2s_8w_ctx.t[0] = 64; +} +*/ + void blake2s_8way_hash( void *output, const void *input ) { blake2s_8way_state ctx; @@ -86,10 +184,41 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; int thr_id = mythr->id; +/* +// pthread_rwlock_rdlock( &g_work_lock ); + memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 ); +// pthread_rwlock_unlock( &g_work_lock ); +*/ +/* + uint32_t edata[20] __attribute__ ((aligned (64))); + blake2s_state ctx; + mm128_bswap32_80( edata, pdata ); + blake2s_init( &ctx, BLAKE2S_OUTBYTES ); + ctx.buflen = ctx.t[0] = 64; + blake2s_compress( &ctx, (const uint8_t*)edata ); + + blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES ); + for ( int i = 0; i < 8; i++ ) + casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] ); + + casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] ); + casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] ); + casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] ); + + +// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, +// ctx.h, ctx.h, ctx.h, ctx.h, 256 ); +// intrlv_8x32( vdata, edata, edata, edata, edata, +// edata, edata, edata, edata, 640 ); + + blake2s_8w_ctx.t[0] = 64; +*/ + mm256_bswap32_intrlv80_8x32( vdata, pdata ); blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES ); blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 ); + do { *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) ); @@ -117,7 +246,25 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, #elif defined(BLAKE2S_4WAY) static __thread blake2s_4way_state blake2s_4w_ctx; +/* +static blake2s_4way_state blake2s_4w_ctx; +static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32))); +int blake2s_4way_prehash( struct work *work ) +{ + uint32_t edata[20] __attribute__ ((aligned (64))); + blake2s_state ctx; + mm128_bswap32_80( edata, work->data ); + blake2s_init( &ctx, BLAKE2S_OUTBYTES ); + ctx.buflen = ctx.t[0] = 64; + blake2s_compress( &ctx, (const uint8_t*)edata ); + + blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES ); + intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 ); + intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 ); + blake2s_4w_ctx.t[0] = 64; +} +*/ void blake2s_4way_hash( void *output, const void *input ) { blake2s_4way_state ctx; @@ -140,11 +287,15 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, __m128i *noncev = (__m128i*)vdata + 19; // aligned uint32_t n = first_nonce; int thr_id = mythr->id; - +/* + pthread_rwlock_rdlock( &g_work_lock ); + memcpy( vdata, blake2s_4way_vdata, sizeof vdata ); + pthread_rwlock_unlock( &g_work_lock ); +*/ mm128_bswap32_intrlv80_4x32( vdata, pdata ); blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES ); blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 ); - + do { *noncev = 
mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); pdata[19] = n; diff --git a/algo/blake/blake2s-gate.c b/algo/blake/blake2s-gate.c index 505c4d5..95b86f6 100644 --- a/algo/blake/blake2s-gate.c +++ b/algo/blake/blake2s-gate.c @@ -5,13 +5,15 @@ bool register_blake2s_algo( algo_gate_t* gate ) #if defined(BLAKE2S_16WAY) gate->scanhash = (void*)&scanhash_blake2s_16way; gate->hash = (void*)&blake2s_16way_hash; +// gate->prehash = (void*)&blake2s_16way_prehash; #elif defined(BLAKE2S_8WAY) -//#if defined(BLAKE2S_8WAY) gate->scanhash = (void*)&scanhash_blake2s_8way; gate->hash = (void*)&blake2s_8way_hash; +// gate->prehash = (void*)&blake2s_8way_prehash; #elif defined(BLAKE2S_4WAY) gate->scanhash = (void*)&scanhash_blake2s_4way; gate->hash = (void*)&blake2s_4way_hash; +// gate->prehash = (void*)&blake2s_4way_prehash; #else gate->scanhash = (void*)&scanhash_blake2s; gate->hash = (void*)&blake2s_hash; diff --git a/algo/blake/blake2s-gate.h b/algo/blake/blake2s-gate.h index 4a7942c..62136b3 100644 --- a/algo/blake/blake2s-gate.h +++ b/algo/blake/blake2s-gate.h @@ -23,18 +23,22 @@ bool register_blake2s_algo( algo_gate_t* gate ); void blake2s_16way_hash( void *state, const void *input ); int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +int blake2s_16way_prehash( struct work * ); #elif defined (BLAKE2S_8WAY) void blake2s_8way_hash( void *state, const void *input ); int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +int blake2s_8way_prehash( struct work * ); #elif defined (BLAKE2S_4WAY) void blake2s_4way_hash( void *state, const void *input ); int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +int blake2s_4way_prehash( struct work * ); + #else void blake2s_hash( void *state, const void *input ); diff --git a/algo/blake/blake2s-hash-4way.c b/algo/blake/blake2s-hash-4way.c index a69e501..7ab9f43 100644 --- a/algo/blake/blake2s-hash-4way.c +++ b/algo/blake/blake2s-hash-4way.c @@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block ) #define G4W( sigma0, sigma1, a, b, c, d ) \ do { \ - uint8_t s0 = sigma0; \ - uint8_t s1 = sigma1; \ + const uint8_t s0 = sigma0; \ + const uint8_t s1 = sigma1; \ a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \ d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi32( c, d ); \ @@ -120,7 +120,7 @@ do { \ #define ROUND4W(r) \ do { \ - uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \ + const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \ G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \ G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \ G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \ @@ -317,8 +317,8 @@ do { \ #define G8W( sigma0, sigma1, a, b, c, d) \ do { \ - uint8_t s0 = sigma0; \ - uint8_t s1 = sigma1; \ + const uint8_t s0 = sigma0; \ + const uint8_t s1 = sigma1; \ a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \ d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi32( c, d ); \ @@ -331,7 +331,7 @@ do { \ #define ROUND8W(r) \ do { \ - uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \ + const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \ G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \ G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \ G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \ @@ -529,8 +529,8 @@ int blake2s_16way_compress( 
blake2s_16way_state *S, const __m512i *block ) #define G16W( sigma0, sigma1, a, b, c, d) \ do { \ - uint8_t s0 = sigma0; \ - uint8_t s1 = sigma1; \ + const uint8_t s0 = sigma0; \ + const uint8_t s1 = sigma1; \ a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \ d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \ c = _mm512_add_epi32( c, d ); \ @@ -543,7 +543,7 @@ do { \ #define ROUND16W(r) \ do { \ - uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \ + const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \ G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \ G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \ G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \ diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h index fc86c4f..0f26b11 100644 --- a/algo/blake/blake2s-hash-4way.h +++ b/algo/blake/blake2s-hash-4way.h @@ -20,6 +20,7 @@ #include #include +//#include "sph-blake2s.h" #if defined(_MSC_VER) #include @@ -33,7 +34,7 @@ #if defined(__cplusplus) extern "C" { #endif - +/* enum blake2s_constant { BLAKE2S_BLOCKBYTES = 64, @@ -42,6 +43,13 @@ enum blake2s_constant BLAKE2S_SALTBYTES = 8, BLAKE2S_PERSONALBYTES = 8 }; +*/ + +#define BLAKE2S_BLOCKBYTES 64 +#define BLAKE2S_OUTBYTES 32 +#define BLAKE2S_KEYBYTES 32 +#define BLAKE2S_SALTBYTES 8 +#define BLAKE2S_PERSONALBYTES 8 #pragma pack(push, 1) typedef struct __blake2s_nway_param diff --git a/algo/blake/blake2s.c b/algo/blake/blake2s.c index 86d4f77..6cc538a 100644 --- a/algo/blake/blake2s.c +++ b/algo/blake/blake2s.c @@ -8,8 +8,6 @@ #include "sph-blake2s.h" static __thread blake2s_state blake2s_ctx; -//static __thread blake2s_state s_ctx; -#define MIDLEN 76 void blake2s_hash( void *output, const void *input ) { @@ -19,37 +17,27 @@ void blake2s_hash( void *output, const void *input ) memcpy( &ctx, &blake2s_ctx, sizeof ctx ); blake2s_update( &ctx, input+64, 16 ); -// blake2s_init(&ctx, BLAKE2S_OUTBYTES); -// blake2s_update(&ctx, input, 80); blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES ); memcpy(output, hash, 32); } -/* -static void blake2s_hash_end(uint32_t *output, const uint32_t *input) -{ - s_ctx.buflen = MIDLEN; - memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN); - blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN); - blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES); -} -*/ + int scanhash_blake2s( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t _ALIGN(64) hash64[8]; uint32_t _ALIGN(64) endiandata[20]; - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - swab32_array( endiandata, pdata, 20 ); + swab32_array( endiandata, pdata, 20 ); // midstate blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES ); @@ -58,11 +46,12 @@ int scanhash_blake2s( struct work *work, do { be32enc(&endiandata[19], n); blake2s_hash( hash64, endiandata ); - if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return true; - } + if (hash64[7] <= Htarg ) + if ( fulltest(hash64, ptarget) && !opt_benchmark ) + { + pdata[19] = n; + submit_solution( work, hash64, mythr ); + } n++; } while (n < max_nonce && !work_restart[thr_id].restart); diff --git a/algo/blake/sph-blake2s.c b/algo/blake/sph-blake2s.c index 0ebe547..2bd2651 100644 --- 
a/algo/blake/sph-blake2s.c +++ b/algo/blake/sph-blake2s.c @@ -17,6 +17,7 @@ #include "algo/sha/sph_types.h" #include "sph-blake2s.h" +#include "simd-utils.h" static const uint32_t blake2s_IV[8] = { @@ -225,6 +226,71 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] v[13] = S->t[1] ^ blake2s_IV[5]; v[14] = S->f[0] ^ blake2s_IV[6]; v[15] = S->f[1] ^ blake2s_IV[7]; + +#if 0 +//#if defined(__SSE2__) // always true + The only application for this is to do a prehash for the blake2s algorithm. SSE2 also supports 4 way parallel hashing so that is preferred in most cases. Testing has found that using this serial SIMD code for prehash is slower than doing a parallel hash. A parallel hash has more instructions and uses more data. The serial hash uses fewer instructions and data and only needs to interleave the final hash into parallel streams. This has shown negligible improvement on other algos, notably blake256, which is almost identical. Considering the low frequency of prehash, no statistically valid change was expected. It was simply better on paper. + +Furthermore, simply defining this macro has an additional negative effect on blake2s as a whole. There are no references to this macro and blake2s-4way does not include it in any header files; it's just another unused macro which should have no effect beyond the preprocessor. But just being visible to the compiler changes things in a dramatic way. + +These 2 things combined reduced the hash rate for blake2s by more than 5% when using serial SIMD for the blake2s prehash over 16way parallel prehash. 16way parallel hashing was used in the high frequency nonce loop in both cases. Considering the prehash represents 50% of the algorithm and is done once, vs the high frequency second half that is done mega, maybe giga, times more, it's hard to imagine that big of an effect in either direction.
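For context, the prehash in question amounts to hashing the first 64-byte block of the 80-byte header once and broadcasting the resulting midstate into every SIMD lane, so the per-nonce loop starts at the second block. A scalar sketch built from this file's own primitives; the lane count N_LANES and the destination layout are illustrative, not taken from the patch:

/* Hash block 0 once, serially, then replicate the eight 32-bit state
   words across N_LANES interleaved lanes. Only block 1, which carries
   the nonce, is then hashed per lane inside the scan loop. */
#define N_LANES 16   /* illustrative */
static void blake2s_prehash_broadcast( uint32_t lane_h[8][N_LANES],
                                       const uint32_t hdr[20] )
{
   blake2s_state ctx;
   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
   ctx.buflen = ctx.t[0] = 64;                     // first 64 header bytes
   blake2s_compress( &ctx, (const uint8_t*)hdr );  // done once
   for ( int i = 0; i < 8; i++ )
      for ( int l = 0; l < N_LANES; l++ )
         lane_h[i][l] = ctx.h[i];                  // broadcast the midstate
}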
+ +#define ROUND( r ) \ +{ \ + __m128i *V = (__m128i*)v; \ + const uint8_t *sigma = blake2s_sigma[r]; \ + V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \ + _mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \ + m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \ + V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \ + V[2] = _mm_add_epi32( V[2], V[3] ); \ + V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \ + V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \ + _mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \ + m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \ + V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \ + V[2] = _mm_add_epi32( V[2], V[3] ); \ + V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \ + V[3] = mm128_shufll_32( V[3] ); \ + V[2] = mm128_swap_64( V[2] ); \ + V[1] = mm128_shuflr_32( V[1] ); \ + V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \ + _mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \ + m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \ + V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \ + V[2] = _mm_add_epi32( V[2], V[3] ); \ + V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \ + V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \ + _mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \ + m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \ + V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \ + V[2] = _mm_add_epi32( V[2], V[3] ); \ + V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \ + V[3] = mm128_shuflr_32( V[3] ); \ + V[2] = mm128_swap_64( V[2] ); \ + V[1] = mm128_shufll_32( V[1] ); \ +} + +#else + #define G(r,i,a,b,c,d) \ do { \ a = a + b + m[blake2s_sigma[r][2*i+0]]; \ @@ -247,7 +313,10 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ } while(0) - ROUND( 0 ); + +#endif + + ROUND( 0 ); ROUND( 1 ); ROUND( 2 ); ROUND( 3 ); diff --git a/algo/blake/sph-blake2s.h b/algo/blake/sph-blake2s.h index eb66b7a..8f2bf1e 100644 --- a/algo/blake/sph-blake2s.h +++ b/algo/blake/sph-blake2s.h @@ -91,6 +91,7 @@ static inline void secure_zero_memory(void *v, size_t n) extern "C" { #endif +/* enum blake2s_constant { BLAKE2S_BLOCKBYTES = 64, @@ -99,6 +100,13 @@ extern "C" { BLAKE2S_SALTBYTES = 8, BLAKE2S_PERSONALBYTES = 8 }; +*/ + +#define BLAKE2S_BLOCKBYTES 64 +#define BLAKE2S_OUTBYTES 32 +#define BLAKE2S_KEYBYTES 32 +#define BLAKE2S_SALTBYTES 8 +#define BLAKE2S_PERSONALBYTES 8 #pragma pack(push, 1) typedef struct __blake2s_param diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index 7dcb825..3bcb6e1 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -103,16 +103,16 @@ const uint8_t *sigmaR = sigma[R]; \ BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \ BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \ - V2 = mm128_alignr_64( V[3], V[2] ); \ - V3 = mm128_alignr_64( V[2], V[3] ); \ - V6 = mm128_alignr_64( V[6], V[7] ); \ - V7 = mm128_alignr_64( V[7], V[6] ); \ + V2 = mm128_alignr_64( V[3], V[2], 1 ); \ + V3 = mm128_alignr_64( V[2], V[3], 1 ); \ + V6 = mm128_alignr_64( V[6], V[7], 1 ); \ + V7 = mm128_alignr_64( V[7], V[6], 1 ); \ BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \ BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \ - V[2] = mm128_alignr_64( V2, V3 ); \ - V[3] = mm128_alignr_64( V3, V2 ); \ - V[6] = mm128_alignr_64( V7, V6 ); \ - V[7] = mm128_alignr_64( V6, V7 ); \ + V[2] = mm128_alignr_64( V2, V3, 1 ); \ + V[3] = mm128_alignr_64( V3, V2, 1 ); \ + V[6] = mm128_alignr_64( V7, V6, 1 ); \ + V[7] = mm128_alignr_64( V6, V7, 1 ); \ } 
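The sph_blake2b.c hunk above gives mm128_alignr_64 an explicit element-count argument instead of a hard-coded one-lane shift. The macro itself is defined in simd-utils.h, which this diff does not show; presumably it reduces to a byte-wise palignr, roughly as sketched here (an assumption, not a quote):

/* Assumed shape of the helper: concatenate hi:lo as a 256-bit value,
   shift right by c 64-bit lanes and keep the low 128 bits (SSSE3
   palignr). With c == 1 the paired calls in BLAKE2B_ROUND rotate
   64-bit lanes across a register pair for the diagonalization step. */
#define mm128_alignr_64( hi, lo, c )  _mm_alignr_epi8( hi, lo, (c)*8 )

Under that reading, mm128_alignr_64( V[3], V[2], 1 ) returns a vector whose low lane is the high lane of V[2] and whose high lane is the low lane of V[3], the same result the previous two-argument form hard-coded.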
#else diff --git a/algo/groestl/myr-groestl.c b/algo/groestl/myr-groestl.c index 4f17c64..d9b6c1b 100644 --- a/algo/groestl/myr-groestl.c +++ b/algo/groestl/myr-groestl.c @@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce, be32enc(&endiandata[19], nonce); myriad_hash(hash, endiandata); - if (hash[7] <= Htarg && fulltest(hash, ptarget)) + if (hash[7] <= Htarg ) + if ( fulltest(hash, ptarget) && !opt_benchmark ) { pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; + submit_solution( work, hash, mythr ); } nonce++; diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index 6a90780..069e652 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -19,26 +19,34 @@ */ #include -#include #include "simd-utils.h" #include "luffa_for_sse2.h" -#if defined(__SSE4_1__) +#if defined(__AVX512VL__) + +#define MULT2( a0, a1 ) \ +{ \ + __m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \ + a0 = _mm_alignr_epi32( a1, b, 1 ); \ + a1 = _mm_alignr_epi32( b, a1, 1 ); \ +} + +#elif defined(__SSE4_1__) #define MULT2( a0, a1 ) do \ { \ - __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \ - a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ - a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ + __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \ + a0 = _mm_alignr_epi8( a1, b, 4 ); \ + a1 = _mm_alignr_epi8( b, a1, 4 ); \ } while(0) #else #define MULT2( a0, a1 ) do \ { \ - __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \ + __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \ a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ - a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ + a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ } while(0) #endif diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 118deef..bfbf8b7 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -24,6 +24,45 @@ typedef union { #endif } allium_16way_ctx_holder; +static uint32_t allium_16way_midstate_vars[16*16] __attribute__ ((aligned (64))); +static __m512i allium_16way_block0_hash[8] __attribute__ ((aligned (64))); +static __m512i allium_16way_block_buf[16] __attribute__ ((aligned (64))); + +int allium_16way_prehash( struct work *work ) +{ + uint32_t phash[8] __attribute__ ((aligned (32))) = + { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + }; + uint32_t *pdata = work->data; + + // Prehash first block. + blake256_transform_le( phash, pdata, 512, 0 ); + + // Interleave hash for second block prehash. + allium_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] ); + allium_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] ); + allium_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] ); + allium_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] ); + allium_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] ); + allium_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] ); + allium_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] ); + allium_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] ); + + // Build vectored second block, interleave 12 of last 16 bytes of data, + // excluding the nonce. 
+ allium_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] ); + allium_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] ); + allium_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] ); + + // Partially prehash second block without touching nonces in block_buf[3]. + blake256_16way_round0_prehash_le( allium_16way_midstate_vars, + allium_16way_block0_hash, allium_16way_block_buf ); + + return 1; +} + static void allium_16way_hash( void *state, const void *midstate_vars, const void *midhash, const void *block ) { @@ -200,11 +239,6 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, uint32_t midstate_vars[16*16] __attribute__ ((aligned (64))); __m512i block0_hash[8] __attribute__ ((aligned (64))); __m512i block_buf[16] __attribute__ ((aligned (64))); - uint32_t phash[8] __attribute__ ((aligned (32))) = - { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 - }; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -216,31 +250,19 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - // Prehash first block. - blake256_transform_le( phash, pdata, 512, 0 ); + pthread_rwlock_rdlock( &g_work_lock ); - // Interleave hash for second block prehash. - block0_hash[0] = _mm512_set1_epi32( phash[0] ); - block0_hash[1] = _mm512_set1_epi32( phash[1] ); - block0_hash[2] = _mm512_set1_epi32( phash[2] ); - block0_hash[3] = _mm512_set1_epi32( phash[3] ); - block0_hash[4] = _mm512_set1_epi32( phash[4] ); - block0_hash[5] = _mm512_set1_epi32( phash[5] ); - block0_hash[6] = _mm512_set1_epi32( phash[6] ); - block0_hash[7] = _mm512_set1_epi32( phash[7] ); + memcpy( midstate_vars, allium_16way_midstate_vars, sizeof midstate_vars ); + memcpy( block0_hash, allium_16way_block0_hash, sizeof block0_hash ); + memcpy( block_buf, allium_16way_block_buf, sizeof block_buf ); - // Build vectored second block, interleave last 16 bytes of data using - unique nonces. - block_buf[ 0] = _mm512_set1_epi32( pdata[16] ); - block_buf[ 1] = _mm512_set1_epi32( pdata[17] ); - block_buf[ 2] = _mm512_set1_epi32( pdata[18] ); - block_buf[ 3] = + pthread_rwlock_unlock( &g_work_lock ); + + // fill in the nonces + block_buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ); - - // Partialy prehash second block without touching nonces in block_buf[3].
- blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); - + do { allium_16way_hash( hash, midstate_vars, block0_hash, block_buf ); @@ -271,6 +293,44 @@ typedef union { #endif } allium_8way_ctx_holder; +static uint32_t allium_8way_midstate_vars[16*8] __attribute__ ((aligned (64))); +static __m256i allium_8way_block0_hash[8] __attribute__ ((aligned (64))); +static __m256i allium_8way_block_buf[16] __attribute__ ((aligned (64))); + +int allium_8way_prehash ( struct work *work ) +{ + uint32_t phash[8] __attribute__ ((aligned (32))) = + { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + }; + uint32_t *pdata = work->data; + + // Prehash first block + blake256_transform_le( phash, pdata, 512, 0 ); + + allium_8way_block0_hash[0] = _mm256_set1_epi32( phash[0] ); + allium_8way_block0_hash[1] = _mm256_set1_epi32( phash[1] ); + allium_8way_block0_hash[2] = _mm256_set1_epi32( phash[2] ); + allium_8way_block0_hash[3] = _mm256_set1_epi32( phash[3] ); + allium_8way_block0_hash[4] = _mm256_set1_epi32( phash[4] ); + allium_8way_block0_hash[5] = _mm256_set1_epi32( phash[5] ); + allium_8way_block0_hash[6] = _mm256_set1_epi32( phash[6] ); + allium_8way_block0_hash[7] = _mm256_set1_epi32( phash[7] ); + + // Build vectored second block, interleave 12 of the last 16 bytes, + excluding the nonces. + allium_8way_block_buf[ 0] = _mm256_set1_epi32( pdata[16] ); + allium_8way_block_buf[ 1] = _mm256_set1_epi32( pdata[17] ); + allium_8way_block_buf[ 2] = _mm256_set1_epi32( pdata[18] ); + + // Partially prehash second block without touching nonces + blake256_8way_round0_prehash_le( allium_8way_midstate_vars, + allium_8way_block0_hash, allium_8way_block_buf ); + + return 1; +} + static void allium_8way_hash( void *hash, const void *midstate_vars, const void *midhash, const void *block ) { @@ -386,11 +446,6 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, uint32_t midstate_vars[16*8] __attribute__ ((aligned (64))); __m256i block0_hash[8] __attribute__ ((aligned (64))); __m256i block_buf[16] __attribute__ ((aligned (64))); - uint32_t phash[8] __attribute__ ((aligned (32))) = - { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 - }; uint32_t *pdata = work->data; uint64_t *ptarget = (uint64_t*)work->target; const uint32_t first_nonce = pdata[19]; @@ -400,29 +455,17 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; const __m256i eight = m256_const1_32( 8 ); - // Prehash first block - blake256_transform_le( phash, pdata, 512, 0 ); + pthread_rwlock_rdlock( &g_work_lock ); - block0_hash[0] = _mm256_set1_epi32( phash[0] ); - block0_hash[1] = _mm256_set1_epi32( phash[1] ); - block0_hash[2] = _mm256_set1_epi32( phash[2] ); - block0_hash[3] = _mm256_set1_epi32( phash[3] ); - block0_hash[4] = _mm256_set1_epi32( phash[4] ); - block0_hash[5] = _mm256_set1_epi32( phash[5] ); - block0_hash[6] = _mm256_set1_epi32( phash[6] ); - block0_hash[7] = _mm256_set1_epi32( phash[7] ); + memcpy( midstate_vars, allium_8way_midstate_vars, sizeof midstate_vars ); + memcpy( block0_hash, allium_8way_block0_hash, sizeof block0_hash ); + memcpy( block_buf, allium_8way_block_buf, sizeof block_buf ); - // Build vectored second block, interleave last 16 bytes of data using - unique nonces.
- block_buf[ 0] = _mm256_set1_epi32( pdata[16] ); - block_buf[ 1] = _mm256_set1_epi32( pdata[17] ); - block_buf[ 2] = _mm256_set1_epi32( pdata[18] ); + pthread_rwlock_unlock( &g_work_lock ); + block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ); - - // Partialy prehash second block without touching nonces - blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); - + do { allium_8way_hash( hash, midstate_vars, block0_hash, block_buf ); @@ -438,6 +481,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, n += 8; block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight ); } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index 9ec505b..4f0a0b5 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -131,10 +131,12 @@ bool register_lyra2z_algo( algo_gate_t* gate ) { #if defined(LYRA2Z_16WAY) gate->miner_thread_init = (void*)&lyra2z_16way_thread_init; + gate->prehash = (void*)&lyra2z_16way_prehash; gate->scanhash = (void*)&scanhash_lyra2z_16way; // gate->hash = (void*)&lyra2z_16way_hash; #elif defined(LYRA2Z_8WAY) gate->miner_thread_init = (void*)&lyra2z_8way_thread_init; + gate->prehash = (void*)&lyra2z_8way_prehash; gate->scanhash = (void*)&scanhash_lyra2z_8way; // gate->hash = (void*)&lyra2z_8way_hash; #elif defined(LYRA2Z_4WAY) @@ -175,8 +177,10 @@ bool register_lyra2h_algo( algo_gate_t* gate ) bool register_allium_algo( algo_gate_t* gate ) { #if defined (ALLIUM_16WAY) + gate->prehash = (void*)&allium_16way_prehash; gate->scanhash = (void*)&scanhash_allium_16way; #elif defined (ALLIUM_8WAY) + gate->prehash = (void*)&allium_8way_prehash; gate->scanhash = (void*)&scanhash_allium_8way; #else gate->miner_thread_init = (void*)&init_allium_ctx; diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index 124b0c9..9e1b9c6 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -5,7 +5,6 @@ #include #include "lyra2.h" - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define LYRA2REV3_16WAY 1 #elif defined(__AVX2__) @@ -102,6 +101,7 @@ bool init_lyra2rev2_ctx(); //void lyra2z_16way_hash( void *state, const void *input ); int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +int lyra2z_16way_prehash ( struct work *work ); bool lyra2z_16way_thread_init(); #elif defined(LYRA2Z_8WAY) @@ -110,6 +110,7 @@ bool lyra2z_16way_thread_init(); int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); bool lyra2z_8way_thread_init(); +int lyra2z_8way_prehash ( struct work *work ); #elif defined(LYRA2Z_4WAY) @@ -165,11 +166,13 @@ bool register_allium_algo( algo_gate_t* gate ); int scanhash_allium_16way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +int allium_16way_prehash ( struct work *work ); #elif defined(ALLIUM_8WAY) int scanhash_allium_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +int allium_8way_prehash ( struct work *work ); #else diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index b50b071..c72744a 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -14,6 +14,44 @@ bool lyra2z_16way_thread_init() return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) ); } 
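The lyra2z prehash below follows the same shape as the allium version above: file-scope statics are filled once per work by the prehash callback, and every scanhash call copies them under the read side of g_work_lock before writing its own per-lane nonces. Condensed to its essentials (a sketch; the s_* names stand in for the per-algo statics):

/* Reader side of the handoff, as it appears at the top of each
   scanhash. Only block_buf[3], the nonce word, differs per thread,
   so it is filled in after the shared state has been copied out. */
static void load_shared_prehash( uint32_t midstate_vars[16*16],
                                 __m512i block0_hash[8],
                                 __m512i block_buf[16], uint32_t n )
{
   pthread_rwlock_rdlock( &g_work_lock );
   memcpy( midstate_vars, s_midstate_vars, 16*16*sizeof(uint32_t) );
   memcpy( block0_hash, s_block0_hash, 8*sizeof(__m512i) );
   memcpy( block_buf, s_block_buf, 16*sizeof(__m512i) );
   pthread_rwlock_unlock( &g_work_lock );
   block_buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12,
                                    n+11, n+10, n+ 9, n+ 8,
                                    n+ 7, n+ 6, n+ 5, n+ 4,
                                    n+ 3, n+ 2, n+ 1, n );  // per-thread nonces
}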
+static uint32_t lyra2z_16way_midstate_vars[16*16] __attribute__ ((aligned (64))); +static __m512i lyra2z_16way_block0_hash[8] __attribute__ ((aligned (64))); +static __m512i lyra2z_16way_block_buf[16] __attribute__ ((aligned (64))); + +int lyra2z_16way_prehash ( struct work *work ) +{ + uint32_t phash[8] __attribute__ ((aligned (32))) = + { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + }; + uint32_t *pdata = work->data; + + // Prehash first block + blake256_transform_le( phash, pdata, 512, 0 ); + + lyra2z_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] ); + lyra2z_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] ); + lyra2z_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] ); + lyra2z_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] ); + lyra2z_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] ); + lyra2z_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] ); + lyra2z_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] ); + lyra2z_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] ); + + // Build vectored second block, interleave 12 of last 16 bytes of data + excluding the nonce. + lyra2z_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] ); + lyra2z_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] ); + lyra2z_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] ); + + // Partially prehash second block without touching nonces in block_buf[3]. + blake256_16way_round0_prehash_le( lyra2z_16way_midstate_vars, + lyra2z_16way_block0_hash, lyra2z_16way_block_buf ); + + return 1; +} + static void lyra2z_16way_hash( void *state, const void *midstate_vars, const void *midhash, const void *block ) { @@ -91,11 +129,6 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce, uint32_t midstate_vars[16*16] __attribute__ ((aligned (64))); __m512i block0_hash[8] __attribute__ ((aligned (64))); __m512i block_buf[16] __attribute__ ((aligned (64))); - uint32_t phash[8] __attribute__ ((aligned (64))) = - { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 - }; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -107,30 +140,18 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce, if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - // Prehash first block - blake256_transform_le( phash, pdata, 512, 0 ); + pthread_rwlock_rdlock( &g_work_lock ); - block0_hash[0] = _mm512_set1_epi32( phash[0] ); - block0_hash[1] = _mm512_set1_epi32( phash[1] ); - block0_hash[2] = _mm512_set1_epi32( phash[2] ); - block0_hash[3] = _mm512_set1_epi32( phash[3] ); - block0_hash[4] = _mm512_set1_epi32( phash[4] ); - block0_hash[5] = _mm512_set1_epi32( phash[5] ); - block0_hash[6] = _mm512_set1_epi32( phash[6] ); - block0_hash[7] = _mm512_set1_epi32( phash[7] ); + memcpy( midstate_vars, lyra2z_16way_midstate_vars, sizeof midstate_vars ); + memcpy( block0_hash, lyra2z_16way_block0_hash, sizeof block0_hash ); + memcpy( block_buf, lyra2z_16way_block_buf, sizeof block_buf ); - // Build vectored second block, interleave last 16 bytes of data using - unique nonces.
- block_buf[ 0] = _mm512_set1_epi32( pdata[16] ); - block_buf[ 1] = _mm512_set1_epi32( pdata[17] ); - block_buf[ 2] = _mm512_set1_epi32( pdata[18] ); + pthread_rwlock_unlock( &g_work_lock ); + block_buf[ 3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ); - // Partialy prehash second block without touching nonces in block_buf[3]. - blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); - do { lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf ); @@ -157,6 +178,44 @@ bool lyra2z_8way_thread_init() return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) ); } +static uint32_t lyra2z_8way_midstate_vars[16*8] __attribute__ ((aligned (64))); +static __m256i lyra2z_8way_block0_hash[8] __attribute__ ((aligned (64))); +static __m256i lyra2z_8way_block_buf[16] __attribute__ ((aligned (64))); + +int lyra2z_8way_prehash ( struct work *work ) +{ + uint32_t phash[8] __attribute__ ((aligned (32))) = + { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + }; + uint32_t *pdata = work->data; + + // Prehash first block + blake256_transform_le( phash, pdata, 512, 0 ); + + lyra2z_8way_block0_hash[0] = _mm256_set1_epi32( phash[0] ); + lyra2z_8way_block0_hash[1] = _mm256_set1_epi32( phash[1] ); + lyra2z_8way_block0_hash[2] = _mm256_set1_epi32( phash[2] ); + lyra2z_8way_block0_hash[3] = _mm256_set1_epi32( phash[3] ); + lyra2z_8way_block0_hash[4] = _mm256_set1_epi32( phash[4] ); + lyra2z_8way_block0_hash[5] = _mm256_set1_epi32( phash[5] ); + lyra2z_8way_block0_hash[6] = _mm256_set1_epi32( phash[6] ); + lyra2z_8way_block0_hash[7] = _mm256_set1_epi32( phash[7] ); + + // Build vectored second block, interleave 12 of last 16 bytes of data + excluding the nonces.
+ lyra2z_8way_block_buf[ 0] = _mm256_set1_epi32( pdata[16] ); + lyra2z_8way_block_buf[ 1] = _mm256_set1_epi32( pdata[17] ); + lyra2z_8way_block_buf[ 2] = _mm256_set1_epi32( pdata[18] ); + + // Partially prehash second block without touching nonces + blake256_8way_round0_prehash_le( lyra2z_8way_midstate_vars, + lyra2z_8way_block0_hash, lyra2z_8way_block_buf ); + + return 1; +} + static void lyra2z_8way_hash( void *state, const void *midstate_vars, const void *midhash, const void *block ) { @@ -201,11 +260,6 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce, uint32_t midstate_vars[16*8] __attribute__ ((aligned (64))); __m256i block0_hash[8] __attribute__ ((aligned (64))); __m256i block_buf[16] __attribute__ ((aligned (64))); - uint32_t phash[8] __attribute__ ((aligned (32))) = - { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 - }; uint32_t *pdata = work->data; uint64_t *ptarget = (uint64_t*)work->target; const uint32_t first_nonce = pdata[19]; @@ -215,23 +269,14 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; const __m256i eight = m256_const1_32( 8 ); - // Prehash first block - blake256_transform_le( phash, pdata, 512, 0 ); + pthread_rwlock_rdlock( &g_work_lock ); - block0_hash[0] = _mm256_set1_epi32( phash[0] ); - block0_hash[1] = _mm256_set1_epi32( phash[1] ); - block0_hash[2] = _mm256_set1_epi32( phash[2] ); - block0_hash[3] = _mm256_set1_epi32( phash[3] ); - block0_hash[4] = _mm256_set1_epi32( phash[4] ); - block0_hash[5] = _mm256_set1_epi32( phash[5] ); - block0_hash[6] = _mm256_set1_epi32( phash[6] ); - block0_hash[7] = _mm256_set1_epi32( phash[7] ); + memcpy( midstate_vars, lyra2z_8way_midstate_vars, sizeof midstate_vars ); + memcpy( block0_hash, lyra2z_8way_block0_hash, sizeof block0_hash ); + memcpy( block_buf, lyra2z_8way_block_buf, sizeof block_buf ); - // Build vectored second block, interleave last 16 bytes of data using - unique nonces.
- block_buf[ 0] = _mm256_set1_epi32( pdata[16] ); - block_buf[ 1] = _mm256_set1_epi32( pdata[17] ); - block_buf[ 2] = _mm256_set1_epi32( pdata[18] ); + pthread_rwlock_unlock( &g_work_lock ); + block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 34df0cc..636cbf2 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -146,14 +146,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ b = mm128_ror_64( _mm_xor_si128( b, c ), 63 ); #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ +{ \ + __m128i t; \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_vrol256_64( s6, s7 ); \ - mm128_vror256_64( s2, s3 ); \ + t = mm128_alignr_64( s7, s6, 1 ); \ + s6 = mm128_alignr_64( s6, s7, 1 ); \ + s7 = t; \ + t = mm128_alignr_64( s2, s3, 1 ); \ + s2 = mm128_alignr_64( s3, s2, 1 ); \ + s3 = t; \ G_2X64( s0, s2, s5, s6 ); \ G_2X64( s1, s3, s4, s7 ); \ - mm128_vror256_64( s6, s7 ); \ - mm128_vrol256_64( s2, s3 ); + t = mm128_alignr_64( s6, s7, 1 ); \ + s6 = mm128_alignr_64( s7, s6, 1 ); \ + s7 = t; \ + t = mm128_alignr_64( s3, s2, 1 ); \ + s2 = mm128_alignr_64( s2, s3, 1 ); \ + s3 = t; \ +} #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index 5a7cdbd..4d3961f 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -7,8 +7,16 @@ #if defined (SKEIN_8WAY) -static __thread skein512_8way_context skein512_8way_ctx +static skein512_8way_context skein512_8way_ctx __attribute__ ((aligned (64))); +static uint32_t skein_8way_vdata[20*8] __attribute__ ((aligned (64))); + +int skein_8way_prehash( struct work *work ) +{ + mm512_bswap32_intrlv80_8x64( skein_8way_vdata, work->data ); + skein512_8way_prehash64( &skein512_8way_ctx, skein_8way_vdata ); + return 1; +} void skeinhash_8way( void *state, const void *input ) { @@ -29,25 +37,27 @@ void skeinhash_8way( void *state, const void *input ) int scanhash_skein_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*8] __attribute__ ((aligned (128))); - uint32_t hash[8*8] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash_d7 = &(hash[7*8]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t targ_d7 = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 8; - uint32_t n = first_nonce; - __m512i *noncev = (__m512i*)vdata + 9; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; + uint32_t vdata[20*8] __attribute__ ((aligned (128))); + uint32_t hash[8*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash_d7 = &(hash[7*8]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t targ_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m512i *noncev = (__m512i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + pthread_rwlock_rdlock( &g_work_lock ); + memcpy( vdata, skein_8way_vdata, sizeof vdata ); + pthread_rwlock_unlock( &g_work_lock ); - mm512_bswap32_intrlv80_8x64( vdata, pdata ); *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); - skein512_8way_prehash64( 
&skein512_8way_ctx, vdata ); do { skeinhash_8way( hash, vdata ); @@ -74,8 +84,16 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce, #elif defined (SKEIN_4WAY) -static __thread skein512_4way_context skein512_4way_ctx +static skein512_4way_context skein512_4way_ctx __attribute__ ((aligned (64))); +static uint32_t skein_4way_vdata[20*4] __attribute__ ((aligned (64))); + +int skein_4way_prehash( struct work *work ) +{ + mm256_bswap32_intrlv80_4x64( skein_4way_vdata, work->data ); + skein512_4way_prehash64( &skein512_4way_ctx, skein_4way_vdata ); + return 1; +} void skeinhash_4way( void *state, const void *input ) { @@ -118,23 +136,24 @@ void skeinhash_4way( void *state, const void *input ) int scanhash_skein_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash_d7 = &(hash[7<<2]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t targ_d7 = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 4; - uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 9; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_prehash64( &skein512_4way_ctx, vdata ); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t hash[8*4] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash_d7 = &(hash[7<<2]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t targ_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t n = first_nonce; + __m256i *noncev = (__m256i*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + pthread_rwlock_rdlock( &g_work_lock ); + memcpy( vdata, skein_4way_vdata, sizeof vdata ); + pthread_rwlock_unlock( &g_work_lock ); + *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do diff --git a/algo/skein/skein-gate.c b/algo/skein/skein-gate.c index 7adeac9..c169f52 100644 --- a/algo/skein/skein-gate.c +++ b/algo/skein/skein-gate.c @@ -7,10 +7,12 @@ bool register_skein_algo( algo_gate_t* gate ) #if defined (SKEIN_8WAY) gate->optimizations = AVX2_OPT | AVX512_OPT; gate->scanhash = (void*)&scanhash_skein_8way; + gate->prehash = (void*)&skein_8way_prehash; gate->hash = (void*)&skeinhash_8way; #elif defined (SKEIN_4WAY) gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_skein_4way; + gate->prehash = (void*)&skein_4way_prehash; gate->hash = (void*)&skeinhash_4way; #else gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; @@ -25,10 +27,12 @@ bool register_skein2_algo( algo_gate_t* gate ) gate->optimizations = AVX2_OPT | AVX512_OPT; #if defined (SKEIN_8WAY) gate->scanhash = (void*)&scanhash_skein2_8way; - gate->hash = (void*)&skein2hash_8way; +// gate->hash = (void*)&skein2hash_8way; + gate->prehash = (void*)&skein2_8way_prehash; #elif defined (SKEIN_4WAY) gate->scanhash = (void*)&scanhash_skein2_4way; - gate->hash = (void*)&skein2hash_4way; +// gate->hash = (void*)&skein2hash_4way; + gate->prehash = (void*)&skein2_4way_prehash; #else gate->scanhash = (void*)&scanhash_skein2; gate->hash = (void*)&skein2hash; diff --git a/algo/skein/skein-gate.h 
b/algo/skein/skein-gate.h index eba535e..71d2501 100644 --- a/algo/skein/skein-gate.h +++ b/algo/skein/skein-gate.h @@ -14,20 +14,24 @@ void skeinhash_8way( void *output, const void *input ); int scanhash_skein_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +int skein_8way_prehash( struct work * ); void skein2hash_8way( void *output, const void *input ); int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, uint64_t* hashes_done, struct thr_info *mythr ); +int skein2_8way_prehash( struct work * ); #elif defined(SKEIN_4WAY) void skeinhash_4way( void *output, const void *input ); int scanhash_skein_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +int skein_4way_prehash( struct work * ); void skein2hash_4way( void *output, const void *input ); int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, uint64_t* hashes_done, struct thr_info *mythr ); +int skein2_4way_prehash( struct work * ); #else diff --git a/algo/skein/skein.c b/algo/skein/skein.c index be9bb82..6fde710 100644 --- a/algo/skein/skein.c +++ b/algo/skein/skein.c @@ -31,18 +31,19 @@ int scanhash_skein( struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; swab32_array( endiandata, pdata, 20 ); do { be32enc(&endiandata[19], n); skeinhash(hash64, endiandata); - if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return true; - } + if (hash64[7] <= Htarg ) + if ( fulltest(hash64, ptarget) && !opt_benchmark ) + { + pdata[19] = n; + submit_solution( work, hash64, mythr ); + } n++; } while (n < max_nonce && !work_restart[thr_id].restart); diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c index 2eb0deb..088c6d6 100644 --- a/algo/skein/skein2-4way.c +++ b/algo/skein/skein2-4way.c @@ -5,9 +5,17 @@ #if defined(SKEIN_8WAY) - static __thread skein512_8way_context skein512_8way_ctx - __attribute__ ((aligned (64))); +static skein512_8way_context skein512_8way_ctx __attribute__ ((aligned (64))); +static uint32_t skein2_8way_vdata[20*8] __attribute__ ((aligned (64))); +int skein2_8way_prehash( struct work *work ) +{ + mm512_bswap32_intrlv80_8x64( skein2_8way_vdata, work->data ); + skein512_8way_prehash64( &skein512_8way_ctx, skein2_8way_vdata ); + return 1; +} + +/* not used void skein2hash_8way( void *output, const void *input ) { uint64_t hash[16*8] __attribute__ ((aligned (128))); @@ -17,6 +25,7 @@ void skein2hash_8way( void *output, const void *input ) skein512_8way_final16( &ctx, hash, input + (64*8) ); skein512_8way_full( &ctx, output, hash, 64 ); } +*/ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -36,11 +45,14 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; skein512_8way_context ctx; - mm512_bswap32_intrlv80_8x64( vdata, pdata ); + pthread_rwlock_rdlock( &g_work_lock ); + memcpy( vdata, skein2_8way_vdata, sizeof vdata ); + memcpy( &ctx, &skein512_8way_ctx, sizeof ctx ); + pthread_rwlock_unlock( &g_work_lock ); + *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); - skein512_8way_prehash64( &ctx, vdata ); do { skein512_8way_final16( &ctx, hash, vdata + (16*8) ); @@ -67,10 +79,18 @@ int 
scanhash_skein2_8way( struct work *work, uint32_t max_nonce, } #elif defined(SKEIN_4WAY) + +static skein512_4way_context skein512_4way_ctx __attribute__ ((aligned (64))); +static uint32_t skein2_4way_vdata[20*4] __attribute__ ((aligned (64))); + +int skein2_4way_prehash( struct work *work ) +{ + mm256_bswap32_intrlv80_4x64( skein2_4way_vdata, work->data ); + skein512_4way_prehash64( &skein512_4way_ctx, skein2_4way_vdata ); + return 1; +} -static __thread skein512_4way_context skein512_4way_ctx - __attribute__ ((aligned (64))); - +/* not used void skein2hash_4way( void *output, const void *input ) { skein512_4way_context ctx; @@ -80,6 +100,7 @@ void skein2hash_4way( void *output, const void *input ) skein512_4way_final16( &ctx, hash, input + (64*4) ); skein512_4way_full( &ctx, output, hash, 64 ); } +*/ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -99,8 +120,11 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; skein512_4way_context ctx; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_prehash64( &ctx, vdata ); + pthread_rwlock_rdlock( &g_work_lock ); + memcpy( vdata, skein2_4way_vdata, sizeof vdata ); + memcpy( &ctx, &skein512_4way_ctx, sizeof ctx ); + pthread_rwlock_unlock( &g_work_lock ); + *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do diff --git a/algo/skein/skein2.c b/algo/skein/skein2.c index cc58290..66a870d 100644 --- a/algo/skein/skein2.c +++ b/algo/skein/skein2.c @@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input) sph_skein512_close(&ctx_skein, hash); memcpy(output, hash, 32); - } int scanhash_skein2( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t hash64[8] __attribute__ ((aligned (64))); uint32_t endiandata[20] __attribute__ ((aligned (64))); const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; - swab32_array( endiandata, pdata, 20 ); + swab32_array( endiandata, pdata, 20 ); do { be32enc(&endiandata[19], n); skein2hash(hash64, endiandata); - if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return true; - } + if (hash64[7] <= Htarg ) + if ( fulltest(hash64, ptarget) && !opt_benchmark ) + { + pdata[19] = n; + submit_solution( work, hash64, mythr ); + } n++; } while (n < max_nonce && !work_restart[thr_id].restart); diff --git a/algo/x16/hex.c b/algo/x16/hex.c index 5d064d2..ac9c5e0 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -25,7 +25,7 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output) static __thread x16r_context_overlay hex_ctx; -int hex_hash( void* output, const void* input, int thrid ) +int hex_hash( void* output, const void* input, const int thrid ) { uint32_t _ALIGN(128) hash[16]; x16r_context_overlay ctx; diff --git a/algo/x16/minotaur.c b/algo/x16/minotaur.c index 1216441..a229eb7 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -72,7 +72,7 @@ struct TortureGarden // Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index static int get_hash( void *output, const void *input, TortureGarden *garden, - unsigned int algo, 
int thr_id ) + unsigned int algo, const int thr_id ) { unsigned char hash[64] __attribute__ ((aligned (64))); int rc = 1; @@ -233,7 +233,7 @@ bool initialize_torture_garden() } // Produce a 32-byte hash from 80-byte input data -int minotaur_hash( void *output, const void *input, int thr_id ) +int minotaur_hash( void *output, const void *input, const int thr_id ) { unsigned char hash[64] __attribute__ ((aligned (64))); int rc = 1; diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 7ce4546..b94778d 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -19,7 +19,7 @@ // Perform midstate prehash of hash functions with block size <= 72 bytes, // 76 bytes for hash functions that operate on 32 bit data. -void x16r_8way_prehash( void *vdata, void *pdata ) +void x16r_8way_do_prehash( void *vdata, const void *pdata ) { uint32_t vdata2[20*8] __attribute__ ((aligned (64))); uint32_t edata[20] __attribute__ ((aligned (64))); @@ -106,11 +106,18 @@ void x16r_8way_prehash( void *vdata, void *pdata ) } } +int x16r_8way_prehash( struct work *work ) +{ + x16r_gate_get_hash_order( work, x16r_hash_order ); + x16r_8way_do_prehash( x16r_8way_vdata, work->data ); + return 1; +} + // Perform the full x16r hash and returns 512 bit intermediate hash. // Called by wrapper hash function to optionally continue hashing and // convert to final hash. -int x16r_8way_hash_generic( void* output, const void* input, int thrid ) +int x16r_8way_hash_generic( void* output, const void* input, const int thrid ) { uint32_t vhash[20*8] __attribute__ ((aligned (128))); uint32_t hash0[20] __attribute__ ((aligned (16))); @@ -471,7 +478,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) // x16-r,-s,-rt wrapper called directly by scanhash to repackage 512 bit // hash to 256 bit final hash. 
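With the hash order resolved once in the prehash callback, the per-thread s_ntime caching that used to live in each scanhash is removed below. The gate registration hunk that wires these callbacks together is not part of this excerpt; presumably it follows the pattern of the skein and lyra2 gates above, condensed roughly as:

/* Presumed wiring in register_x16r_algo (not shown in this diff).
   x16r_gate_get_hash_order is the function pointer that replaces
   x16_r_s_getAlgoString in x16r-gate.c below. */
#if defined (X16R_8WAY)
   gate->scanhash = (void*)&scanhash_x16r_8way;
   gate->hash     = (void*)&x16r_8way_hash;
   gate->prehash  = (void*)&x16r_8way_prehash;
#elif defined (X16R_4WAY)
   gate->scanhash = (void*)&scanhash_x16r_4way;
   gate->hash     = (void*)&x16r_4way_hash;
   gate->prehash  = (void*)&x16r_4way_prehash;
#endif
   x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;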
-int x16r_8way_hash( void* output, const void* input, int thrid ) +int x16r_8way_hash( void* output, const void* input, const int thrid ) { uint8_t hash[64*8] __attribute__ ((aligned (128))); if ( !x16r_8way_hash_generic( hash, input, thrid ) ) @@ -495,7 +502,6 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, { uint32_t hash[16*8] __attribute__ ((aligned (128))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t bedata1[2]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -508,27 +514,16 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x0cff; - bedata1[0] = bswap_32( pdata[1] ); - bedata1[1] = bswap_32( pdata[2] ); + pthread_rwlock_rdlock( &g_work_lock ); + memcpy( vdata, x16r_8way_vdata, sizeof vdata ); + pthread_rwlock_unlock( &g_work_lock ); - static __thread uint32_t s_ntime = UINT32_MAX; - const uint32_t ntime = bswap_32( pdata[17] ); - if ( s_ntime != ntime ) - { - x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); - s_ntime = ntime; - - if ( opt_debug && !thr_id ) - applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime ); - } - - x16r_8way_prehash( vdata, pdata ); *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - if( x16r_8way_hash( hash, vdata, thr_id ) ); + if( algo_gate.hash( hash, vdata, thr_id ) ); for ( int i = 0; i < 8; i++ ) if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { @@ -546,7 +541,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, #elif defined (X16R_4WAY) -void x16r_4way_prehash( void *vdata, void *pdata ) +void x16r_4way_do_prehash( void *vdata, const void *pdata ) { uint32_t vdata2[20*4] __attribute__ ((aligned (64))); uint32_t edata[20] __attribute__ ((aligned (64))); @@ -627,7 +622,14 @@ void x16r_4way_prehash( void *vdata, void *pdata ) } } -int x16r_4way_hash_generic( void* output, const void* input, int thrid ) +int x16r_4way_prehash( struct work *work ) +{ + x16r_gate_get_hash_order( work, x16r_hash_order ); + x16r_4way_do_prehash( x16r_4way_vdata, work->data ); + return 1; +} + +int x16r_4way_hash_generic( void* output, const void* input, const int thrid ) { uint32_t vhash[20*4] __attribute__ ((aligned (128))); uint32_t hash0[20] __attribute__ ((aligned (32))); @@ -635,13 +637,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) uint32_t hash2[20] __attribute__ ((aligned (32))); uint32_t hash3[20] __attribute__ ((aligned (32))); x16r_4way_context_overlay ctx; - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; void *in3 = (void*) hash3; int size = 80; + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 ); for ( int i = 0; i < 16; i++ ) @@ -905,7 +908,7 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) return 1; } -int x16r_4way_hash( void* output, const void* input, int thrid ) +int x16r_4way_hash( void* output, const void* input, const int thrid ) { uint8_t hash[64*4] __attribute__ ((aligned (64))); if ( !x16r_4way_hash_generic( hash, input, thrid ) ) @@ -924,7 +927,6 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, { uint32_t hash[16*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t bedata1[2]; uint32_t *pdata = work->data; uint32_t *ptarget = 
work->target; const uint32_t first_nonce = pdata[19]; @@ -937,25 +939,15 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x0cff; - bedata1[0] = bswap_32( pdata[1] ); - bedata1[1] = bswap_32( pdata[2] ); + pthread_rwlock_rdlock( &g_work_lock ); + memcpy( vdata, x16r_4way_vdata, sizeof vdata ); + pthread_rwlock_unlock( &g_work_lock ); - static __thread uint32_t s_ntime = UINT32_MAX; - const uint32_t ntime = bswap_32( pdata[17] ); - if ( s_ntime != ntime ) - { - x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime ); - } - - x16r_4way_prehash( vdata, pdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - if ( x16r_4way_hash( hash, vdata, thr_id ) ); + if ( algo_gate.hash( hash, vdata, thr_id ) ); for ( int i = 0; i < 4; i++ ) if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index c498ff7..a91a4ec 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -1,26 +1,44 @@ #include "x16r-gate.h" #include "algo/sha/sha256d.h" -__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 }; +char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = {0}; -void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL; +void (*x16r_gate_get_hash_order) ( const struct work *, char * ) = NULL; #if defined (X16R_8WAY) -__thread x16r_8way_context_overlay x16r_ctx; +x16r_8way_context_overlay x16r_ctx; +uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64))); #elif defined (X16R_4WAY) -__thread x16r_4way_context_overlay x16r_ctx; +x16r_4way_context_overlay x16r_ctx; +uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64))); + #endif -__thread x16r_context_overlay x16_ctx; +#if defined (X16RV2_8WAY) +x16rv2_8way_context_overlay x16rv2_ctx; -void x16r_getAlgoString( const uint8_t* prevblock, char *output ) +#elif defined (X16RV2_4WAY) + +x16rv2_4way_context_overlay x16rv2_ctx; + +#endif + +x16r_context_overlay x16_ctx; +uint32_t x16r_edata[24] __attribute__ ((aligned (32))); + +void x16r_get_hash_order( const struct work *work, char *hash_order ) { - char *sptr = output; + char *sptr = hash_order; + const uint32_t *pdata = work->data; + uint8_t prevblock[16]; + ((uint32_t*)prevblock)[0] = bswap_32( pdata[1] ); + ((uint32_t*)prevblock)[1] = bswap_32( pdata[2] ); + for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ ) { uint8_t b = (15 - j) >> 1; // 16 first ascii hex chars (lsb in uint256) @@ -32,38 +50,51 @@ void x16r_getAlgoString( const uint8_t* prevblock, char *output ) sptr++; } *sptr = '\0'; -} -void x16s_getAlgoString( const uint8_t* prevblock, char *output ) + if ( !opt_quiet ) + applog( LOG_INFO, "Hash order %s", x16r_hash_order ); +} + +void x16s_get_hash_order( const struct work *work, char *hash_order ) { - strcpy( output, "0123456789ABCDEF" ); + const uint32_t *pdata = work->data; + uint8_t prevblock[16]; + ((uint32_t*)prevblock)[0] = bswap_32( pdata[1] ); + ((uint32_t*)prevblock)[1] = bswap_32( pdata[2] ); + strcpy( hash_order, "0123456789ABCDEF" ); for ( int i = 0; i < 16; i++ ) { uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed uint8_t algoDigit = (i & 1) ? 
prevblock[b] & 0xF : prevblock[b] >> 4; int offset = algoDigit; // insert the nth character at the front - char oldVal = output[offset]; + char oldVal = hash_order[ offset ]; for( int j = offset; j-- > 0; ) - output[j+1] = output[j]; - output[0] = oldVal; + hash_order[ j+1 ] = hash_order[ j ]; + hash_order[ 0 ] = oldVal; } + + if ( !opt_quiet ) + applog( LOG_INFO, "Hash order %s", x16r_hash_order ); } bool register_x16r_algo( algo_gate_t* gate ) { #if defined (X16R_8WAY) gate->scanhash = (void*)&scanhash_x16r_8way; + gate->prehash = (void*)&x16r_8way_prehash; gate->hash = (void*)&x16r_8way_hash; #elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16r_4way; + gate->prehash = (void*)&x16r_4way_prehash; gate->hash = (void*)&x16r_4way_hash; #else gate->scanhash = (void*)&scanhash_x16r; + gate->prehash = (void*)&x16r_prehash; gate->hash = (void*)&x16r_hash; #endif gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; - x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; + x16r_gate_get_hash_order = (void*)&x16r_get_hash_order; opt_target_factor = 256.0; return true; }; @@ -71,17 +102,20 @@ bool register_x16r_algo( algo_gate_t* gate ) bool register_x16rv2_algo( algo_gate_t* gate ) { #if defined (X16RV2_8WAY) - gate->scanhash = (void*)&scanhash_x16rv2_8way; + gate->scanhash = (void*)&scanhash_x16r_8way; + gate->prehash = (void*)&x16rv2_8way_prehash; gate->hash = (void*)&x16rv2_8way_hash; #elif defined (X16RV2_4WAY) - gate->scanhash = (void*)&scanhash_x16rv2_4way; + gate->scanhash = (void*)&scanhash_x16r_4way; + gate->prehash = (void*)&x16rv2_4way_prehash; gate->hash = (void*)&x16rv2_4way_hash; #else - gate->scanhash = (void*)&scanhash_x16rv2; + gate->scanhash = (void*)&scanhash_x16r; + gate->prehash = (void*)&x16rv2_prehash; gate->hash = (void*)&x16rv2_hash; #endif gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; - x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; + x16r_gate_get_hash_order = (void*)&x16r_get_hash_order; opt_target_factor = 256.0; return true; }; @@ -90,16 +124,19 @@ bool register_x16s_algo( algo_gate_t* gate ) { #if defined (X16R_8WAY) gate->scanhash = (void*)&scanhash_x16r_8way; + gate->prehash = (void*)&x16r_8way_prehash; gate->hash = (void*)&x16r_8way_hash; #elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16r_4way; + gate->prehash = (void*)&x16r_4way_prehash; gate->hash = (void*)&x16r_4way_hash; #else gate->scanhash = (void*)&scanhash_x16r; + gate->prehash = (void*)&x16r_prehash; gate->hash = (void*)&x16r_hash; #endif gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; - x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; + x16r_gate_get_hash_order = (void*)&x16s_get_hash_order; opt_target_factor = 256.0; return true; }; @@ -108,30 +145,33 @@ bool register_x16s_algo( algo_gate_t* gate ) // // X16RT +void x16rt_get_hash_order( const struct work * work, char * hash_order ) +{ + uint32_t _ALIGN(64) timehash[8*8]; + const uint32_t ntime = bswap_32( work->data[17] ); + const int32_t masked_ntime = ntime & 0xffffff80; + uint8_t* data = (uint8_t*)timehash; + char *sptr = hash_order; -void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash ) -{ - int32_t maskedTime = timeStamp & 0xffffff80; - sha256d( (unsigned char*)timeHash, (const unsigned char*)( &maskedTime ), - sizeof( maskedTime ) ); -} + sha256d( (unsigned char*)timehash, (const unsigned char*)( &masked_ntime ), + sizeof( masked_ntime ) ); -void x16rt_getAlgoString( const uint32_t *timeHash, char *output) -{ - char 
*sptr = output;
-   uint8_t* data = (uint8_t*)timeHash;
-
-   for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) {
+   for ( uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
+   {
       uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
       uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
-      if (algoDigit >= 10)
-         sprintf(sptr, "%c", 'A' + (algoDigit - 10));
+      if ( algoDigit >= 10 )
+         sprintf( sptr, "%c", 'A' + (algoDigit - 10) );
       else
-         sprintf(sptr, "%u", (uint32_t) algoDigit);
+         sprintf( sptr, "%u", (uint32_t) algoDigit );
       sptr++;
    }
    *sptr = '\0';
+
+   if ( !opt_quiet )
+      applog( LOG_INFO, "Hash order %s, ntime %08x, time hash %08x",
+              hash_order, ntime, timehash[0] );
 }

 void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -222,15 +262,19 @@ bool register_x16rt_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
-   gate->scanhash  = (void*)&scanhash_x16rt_8way;
+   gate->scanhash  = (void*)&scanhash_x16r_8way;
+   gate->prehash   = (void*)&x16r_8way_prehash;
    gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
-   gate->scanhash  = (void*)&scanhash_x16rt_4way;
+   gate->scanhash  = (void*)&scanhash_x16r_4way;
+   gate->prehash   = (void*)&x16r_4way_prehash;
    gate->hash      = (void*)&x16r_4way_hash;
 #else
-   gate->scanhash  = (void*)&scanhash_x16rt;
+   gate->scanhash  = (void*)&scanhash_x16r;
+   gate->prehash   = (void*)&x16r_prehash;
    gate->hash      = (void*)&x16r_hash;
 #endif
+   x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
    opt_target_factor = 256.0;
    return true;
@@ -239,16 +283,20 @@ bool register_x16rt_algo( algo_gate_t* gate )

 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
-   gate->scanhash  = (void*)&scanhash_x16rt_8way;
+   gate->scanhash  = (void*)&scanhash_x16r_8way;
+   gate->prehash   = (void*)&x16r_8way_prehash;
    gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
-   gate->scanhash  = (void*)&scanhash_x16rt_4way;
+   gate->scanhash  = (void*)&scanhash_x16r_4way;
+   gate->prehash   = (void*)&x16r_4way_prehash;
    gate->hash      = (void*)&x16r_4way_hash;
 #else
-   gate->scanhash  = (void*)&scanhash_x16rt;
+   gate->scanhash  = (void*)&scanhash_x16r;
+   gate->prehash   = (void*)&x16r_prehash;
    gate->hash      = (void*)&x16r_hash;
 #endif
    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+   x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
    gate->build_extraheader = (void*)&veil_build_extraheader;
    opt_target_factor = 256.0;
    return true;
@@ -275,20 +323,23 @@ bool register_hex_algo( algo_gate_t* gate )

 bool register_x21s_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
-   gate->scanhash          = (void*)&scanhash_x21s_8way;
+   gate->scanhash          = (void*)&scanhash_x16r_8way;
+   gate->prehash           = (void*)&x16r_8way_prehash;
    gate->hash              = (void*)&x21s_8way_hash;
    gate->miner_thread_init = (void*)&x21s_8way_thread_init;
 #elif defined (X16R_4WAY)
-   gate->scanhash          = (void*)&scanhash_x21s_4way;
+   gate->scanhash          = (void*)&scanhash_x16r_4way;
+   gate->prehash           = (void*)&x16r_4way_prehash;
    gate->hash              = (void*)&x21s_4way_hash;
    gate->miner_thread_init = (void*)&x21s_4way_thread_init;
 #else
-   gate->scanhash          = (void*)&scanhash_x21s;
+   gate->scanhash          = (void*)&scanhash_x16r;
+   gate->prehash           = (void*)&x16r_prehash;
    gate->hash              = (void*)&x21s_hash;
    gate->miner_thread_init = (void*)&x21s_thread_init;
 #endif
    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-   x16_r_s_getAlgoString =
(void*)&x16s_getAlgoString; + x16r_gate_get_hash_order = (void*)&x16s_get_hash_order; opt_target_factor = 256.0; return true; }; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 76ca5e7..2a42720 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -21,6 +21,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sph_sha2.h" +#include "algo/tiger/sph_tiger.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" @@ -57,13 +58,11 @@ #define X16R_8WAY 1 #define X16RV2_8WAY 1 - #define X16RT_8WAY 1 #define X21S_8WAY 1 #elif defined(__AVX2__) && defined(__AES__) #define X16RV2_4WAY 1 - #define X16RT_4WAY 1 #define X21S_4WAY 1 #define X16R_4WAY 1 @@ -89,23 +88,29 @@ enum x16r_Algo { X16R_HASH_FUNC_COUNT }; -extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ]; -extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ); -void x16r_getAlgoString( const uint8_t *prevblock, char *output ); -void x16s_getAlgoString( const uint8_t *prevblock, char *output ); -void x16rt_getAlgoString( const uint32_t *timeHash, char *output ); +//extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ]; +extern char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ]; + + +extern void (*x16r_gate_get_hash_order) ( const struct work *, char * ); + +// x16r, x16rv2 +void x16r_get_hash_order( const struct work *, char * ); +// x16s, x21s +void x16s_get_hash_order( const struct work *, char * ); +// x16rt +void x16rt_get_hash_order( const struct work *, char * ); -void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash ); bool register_x16r_algo( algo_gate_t* gate ); bool register_x16rv2_algo( algo_gate_t* gate ); bool register_x16s_algo( algo_gate_t* gate ); bool register_x16rt_algo( algo_gate_t* gate ); -bool register_hex__algo( algo_gate_t* gate ); -bool register_x21s__algo( algo_gate_t* gate ); +bool register_hex_algo( algo_gate_t* gate ); +bool register_x21s_algo( algo_gate_t* gate ); -// x16r, x16s +// x16r, x16s, x16rt #if defined(X16R_8WAY) union _x16r_8way_context_overlay @@ -136,15 +141,15 @@ union _x16r_8way_context_overlay typedef union _x16r_8way_context_overlay x16r_8way_context_overlay; -extern __thread x16r_8way_context_overlay x16r_ctx; +extern x16r_8way_context_overlay x16r_ctx; +extern uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64))); -void x16r_8way_prehash( void *, void * ); -int x16r_8way_hash_generic( void *, const void *, int ); -int x16r_8way_hash( void *, const void *, int ); +void x16r_8way_do_prehash( void *, const void * ); +int x16r_8way_prehash( struct work * ); +int x16r_8way_hash_generic( void *, const void *, const int ); +int x16r_8way_hash( void *, const void *, const int ); int scanhash_x16r_8way( struct work *, uint32_t , uint64_t *, struct thr_info * ); -extern __thread x16r_8way_context_overlay x16r_ctx; - #elif defined(X16R_4WAY) @@ -177,14 +182,15 @@ union _x16r_4way_context_overlay typedef union _x16r_4way_context_overlay x16r_4way_context_overlay; -extern __thread x16r_4way_context_overlay x16r_ctx; +extern x16r_4way_context_overlay x16r_ctx; +extern uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64))); -void x16r_4way_prehash( void *, void * ); -int x16r_4way_hash_generic( void *, const void *, int ); -int x16r_4way_hash( void *, const void *, int ); +void x16r_4way_do_prehash( void *, const void * ); +int x16r_4way_prehash( struct work * ); +int x16r_4way_hash_generic( void *, const void *, const int ); +int x16r_4way_hash( void *, const void *, const int 
); int scanhash_x16r_4way( struct work *, uint32_t, uint64_t *, struct thr_info * ); -extern __thread x16r_4way_context_overlay x16r_ctx; #endif @@ -217,80 +223,113 @@ union _x16r_context_overlay typedef union _x16r_context_overlay x16r_context_overlay; -extern __thread x16r_context_overlay x16_ctx; +extern x16r_context_overlay x16_ctx; +extern uint32_t x16r_edata[24] __attribute__ ((aligned (32))); -void x16r_prehash( void *, void * ); -int x16r_hash_generic( void *, const void *, int ); -int x16r_hash( void *, const void *, int ); +void x16r_do_prehash( const void * ); +int x16r_prehash( const struct work * ); +int x16r_hash_generic( void *, const void *, const int ); +int x16r_hash( void *, const void *, const int ); int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * ); // x16Rv2 #if defined(X16RV2_8WAY) -int x16rv2_8way_hash( void *state, const void *input, int thrid ); -int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); +union _x16rv2_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cubehashParam cube; + simd_4way_context simd; + hamsi512_8way_context hamsi; + hashState_fugue fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + sph_tiger_context tiger; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + shavite512_context shavite; + hashState_echo echo; +#endif +} __attribute__ ((aligned (64))); + +typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay; +extern x16rv2_8way_context_overlay x16rv2_ctx; + +int x16rv2_8way_prehash( struct work * ); +int x16rv2_8way_hash( void *state, const void *input, const int thrid ); +//int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, +// uint64_t *hashes_done, struct thr_info *mythr ); #elif defined(X16RV2_4WAY) -int x16rv2_4way_hash( void *state, const void *input, int thrid ); -int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - +union _x16rv2_4way_context_overlay +{ + blake512_4way_context blake; + bmw512_4way_context bmw; +#if defined(__VAES__) + groestl512_2way_context groestl; + shavite512_2way_context shavite; + echo_2way_context echo; #else - -int x16rv2_hash( void *state, const void *input, int thr_id ); -int scanhash_x16rv2( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - + hashState_groestl groestl; + shavite512_context shavite; + hashState_echo echo; #endif + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + luffa_2way_context luffa; + cubehashParam cube; + simd_2way_context simd; + hamsi512_4way_context hamsi; + hashState_fugue fugue; + shabal512_4way_context shabal; + sph_whirlpool_context whirlpool; + sha512_4way_context sha512; + sph_tiger_context tiger; +}; -// x16rt, veil -#if defined(X16R_8WAY) +typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay; +extern x16rv2_4way_context_overlay x16rv2_ctx; -//void x16rt_8way_hash( void *state, const void *input ); -int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -#elif defined(X16R_4WAY) - -//void x16rt_4way_hash( void *state, const void 
*input );
-int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
+int x16rv2_4way_hash( void *state, const void *input, const int thrid );
+int x16rv2_4way_prehash( struct work * );

 #else

-//void x16rt_hash( void *state, const void *input );
-int scanhash_x16rt( struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done, struct thr_info *mythr );
+int x16rv2_hash( void *state, const void *input, const int thr_id );
+int x16rv2_prehash( const struct work * );

 #endif

 // x21s

 #if defined(X16R_8WAY)

-int x21s_8way_hash( void *state, const void *input, int thrid );
-int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
+int x21s_8way_hash( void *state, const void *input, const int thrid );
 bool x21s_8way_thread_init();

 #elif defined(X16R_4WAY)

-int x21s_4way_hash( void *state, const void *input, int thrid );
-int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
+int x21s_4way_hash( void *state, const void *input, const int thrid );
 bool x21s_4way_thread_init();

 #else

-int x21s_hash( void *state, const void *input, int thr_id );
-int scanhash_x21s( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
+int x21s_hash( void *state, const void *input, const int thr_id );
 bool x21s_thread_init();

 #endif

-//void hex_hash( void *state, const void *input );
 int scanhash_hex( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

diff --git a/algo/x16/x16r.c b/algo/x16/x16r.c
index f9ad45e..cdfdbaf 100644
--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -10,7 +10,7 @@
 #include
 #include

-void x16r_prehash( void *edata, void *pdata )
+void x16r_do_prehash( const void *edata )
 {
    const char elem = x16r_hash_order[0];
    const uint8_t algo = elem >= 'A' ?
elem - 'A' + 10 : elem - '0'; @@ -48,7 +48,7 @@ void x16r_prehash( void *edata, void *pdata ) } } -int x16r_hash_generic( void* output, const void* input, int thrid ) +int x16r_hash_generic( void* output, const void* input, const int thrid ) { uint32_t _ALIGN(128) hash[16]; x16r_context_overlay ctx; @@ -192,7 +192,15 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) return true; } -int x16r_hash( void* output, const void* input, int thrid ) +int x16r_prehash( const struct work *work ) +{ + mm128_bswap32_80( x16r_edata, work->data ); + x16r_gate_get_hash_order( work, x16r_hash_order ); + x16r_do_prehash( x16r_edata ); + return 1; +} + +int x16r_hash( void* output, const void* input, const int thrid ) { uint8_t hash[64] __attribute__ ((aligned (64))); if ( !x16r_hash_generic( hash, input, thrid ) ) @@ -205,8 +213,8 @@ int x16r_hash( void* output, const void* input, int thrid ) int scanhash_x16r( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) edata[20]; + uint32_t _ALIGN(32) hash32[8]; + uint32_t _ALIGN(32) edata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -216,24 +224,14 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x0cff; - mm128_bswap32_80( edata, pdata ); - - static __thread uint32_t s_ntime = UINT32_MAX; - if ( s_ntime != pdata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime ); - } - - x16r_prehash( edata, pdata ); + pthread_rwlock_rdlock( &g_work_lock ); + memcpy( edata, x16r_edata, sizeof edata ); + pthread_rwlock_unlock( &g_work_lock ); do { edata[19] = nonce; - if ( x16r_hash( hash32, edata, thr_id ) ) + if ( algo_gate.hash( hash32, edata, thr_id ) ) if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) { pdata[19] = bswap_32( nonce ); diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c deleted file mode 100644 index 50c691e..0000000 --- a/algo/x16/x16rt-4way.c +++ /dev/null @@ -1,113 +0,0 @@ -#include "x16r-gate.h" -#include -#include -#include - -#if defined (X16R_8WAY) - -int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[16*8] __attribute__ ((aligned (128))); - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t _ALIGN(64) timeHash[8*8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 8; - uint32_t n = first_nonce; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - const int thr_id = mythr->id; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - const bool bench = opt_benchmark; - - if ( bench ) ptarget[7] = 0x0cff; - - static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80; - if ( s_ntime != masked_ntime ) - { - x16rt_getTimeHash( masked_ntime, &timeHash ); - x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); - s_ntime = masked_ntime; - if ( !thr_id ) - applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x", - x16r_hash_order, bswap_32( pdata[17] ), timeHash ); - } - - x16r_8way_prehash( vdata, pdata ); - *noncev = mm512_intrlv_blend_32( 
_mm512_set_epi32( - n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); - do - { - if ( x16r_8way_hash( hash, vdata, thr_id ) ) - for ( int i = 0; i < 8; i++ ) - if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n+i ); - submit_solution( work, hash+(i<<3), mythr ); - } - *noncev = _mm512_add_epi32( *noncev, - m512_const1_64( 0x0000000800000000 ) ); - n += 8; - } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} - -#elif defined (X16R_4WAY) - -int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t _ALIGN(64) timeHash[4*8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 4; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - volatile uint8_t *restart = &(work_restart[thr_id].restart); - const bool bench = opt_benchmark; - - if ( bench ) ptarget[7] = 0x0cff; - - static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80; - if ( s_ntime != masked_ntime ) - { - x16rt_getTimeHash( masked_ntime, &timeHash ); - x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); - s_ntime = masked_ntime; - if ( !thr_id ) - applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x", - x16r_hash_order, bswap_32( pdata[17] ), timeHash ); - } - - x16r_4way_prehash( vdata, pdata ); - *noncev = mm256_intrlv_blend_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); - do - { - if ( x16r_4way_hash( hash, vdata, thr_id ) ) - for ( int i = 0; i < 4; i++ ) - if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n+i ); - submit_solution( work, hash+(i<<3), mythr ); - } - *noncev = _mm256_add_epi32( *noncev, - m256_const1_64( 0x0000000400000000 ) ); - n += 4; - } while ( ( n < last_nonce ) && !(*restart) ); - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} - -#endif diff --git a/algo/x16/x16rt.c b/algo/x16/x16rt.c deleted file mode 100644 index 7ff8dc5..0000000 --- a/algo/x16/x16rt.c +++ /dev/null @@ -1,53 +0,0 @@ -#include "x16r-gate.h" - -#if !defined(X16R_8WAY) && !defined(X16R_4WAY) - -int scanhash_x16rt( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) edata[20]; - uint32_t _ALIGN(64) timeHash[8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const int thr_id = mythr->id; - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - const bool bench = opt_benchmark; - if ( bench ) ptarget[7] = 0x0cff; - - mm128_bswap32_80( edata, pdata ); - - static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80; - if ( s_ntime != masked_ntime ) - { - x16rt_getTimeHash( masked_ntime, &timeHash ); - x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); - s_ntime = masked_ntime; - if ( opt_debug && !thr_id ) - applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - x16r_hash_order, swab32( pdata[17] ), timeHash ); - } - - x16r_prehash( edata, pdata ); - - do - { - edata[19] = 
nonce; - if ( x16r_hash( hash32, edata, thr_id ) ) - if ( valid_hash( hash32, ptarget ) && !bench ) - { - pdata[19] = bswap_32( nonce ); - submit_solution( work, hash32, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 0; -} - -#endif // !defined(X16R_8WAY) && !defined(X16R_4WAY) - diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index 6077e20..d07d395 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -12,37 +12,73 @@ #if defined (X16RV2_8WAY) -union _x16rv2_8way_context_overlay +void x16rv2_8way_do_prehash( void *vdata, void *pdata ) { - blake512_8way_context blake; - bmw512_8way_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; - luffa_4way_context luffa; - cubehashParam cube; - simd_4way_context simd; - hamsi512_8way_context hamsi; - hashState_fugue fugue; - shabal512_8way_context shabal; - sph_whirlpool_context whirlpool; - sha512_8way_context sha512; - sph_tiger_context tiger; -#if defined(__VAES__) - groestl512_4way_context groestl; - shavite512_4way_context shavite; - echo_4way_context echo; -#else - hashState_groestl groestl; - shavite512_context shavite; - hashState_echo echo; -#endif -} __attribute__ ((aligned (64))); + uint32_t vdata32[20*8] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); -typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay; -static __thread x16rv2_8way_context_overlay x16rv2_ctx; + const char elem = x16r_hash_order[0]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; -int x16rv2_8way_hash( void* output, const void* input, int thrid ) + switch ( algo ) + { + case JH: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + jh512_8way_init( &x16rv2_ctx.jh ); + jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 ); + break; + case KECCAK: + case LUFFA: + case SHA_512: + mm128_bswap32_80( edata, pdata ); + sph_tiger_init( &x16rv2_ctx.tiger ); + sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case SKEIN: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_init( &x16rv2_ctx.skein ); + skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + hamsi512_8way_init( &x16rv2_ctx.hamsi ); + hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm256_bswap32_intrlv80_8x32( vdata32, pdata ); + shabal512_8way_init( &x16rv2_ctx.shabal ); + shabal512_8way_update( &x16rv2_ctx.shabal, vdata32, 64 ); + rintrlv_8x32_8x64( vdata, vdata32, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16rv2_ctx.whirlpool ); + sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + default: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + } +} + +int x16rv2_8way_prehash( struct work *work ) +{ + x16r_gate_get_hash_order( work, x16r_hash_order ); + x16rv2_8way_do_prehash( x16r_8way_vdata, work->data ); + return 1; +} + +int x16rv2_8way_hash( void* output, const void* input, const int 
thrid ) { uint32_t vhash[24*8] __attribute__ ((aligned (128))); uint32_t hash0[24] __attribute__ ((aligned (32))); @@ -557,50 +593,28 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid ) return 1; } -int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) +#elif defined (X16RV2_4WAY) + +// Pad the 24 bytes tiger hash to 64 bytes +inline void padtiger512( uint32_t* hash ) { - uint32_t hash[16*8] __attribute__ ((aligned (128))); - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t vdata2[20*8] __attribute__ ((aligned (64))); + for ( int i = 6; i < 16; i++ ) hash[i] = 0; +} + +void x16rv2_4way_do_prehash( void *vdata, void *pdata ) +{ + uint32_t vdata32[20*4] __attribute__ ((aligned (64))); uint32_t edata[20] __attribute__ ((aligned (64))); - uint32_t bedata1[2] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 8; - uint32_t n = first_nonce; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - const int thr_id = mythr->id; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - const bool bench = opt_benchmark; - if ( bench ) ptarget[7] = 0x0cff; - - mm512_bswap32_intrlv80_8x64( vdata, pdata ); - - bedata1[0] = bswap_32( pdata[1] ); - bedata1[1] = bswap_32( pdata[2] ); - - static __thread uint32_t s_ntime = UINT32_MAX; - const uint32_t ntime = bswap_32( pdata[17] ); - if ( s_ntime != ntime ) - { - x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); - } - - // Do midstate prehash on hash functions with block size <= 64 bytes. const char elem = x16r_hash_order[0]; const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + switch ( algo ) { case JH: - mm512_bswap32_intrlv80_8x64( vdata, pdata ); - jh512_8way_init( &x16rv2_ctx.jh ); - jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 ); + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + jh512_4way_init( &x16rv2_ctx.jh ); + jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 ); break; case KECCAK: case LUFFA: @@ -608,100 +622,45 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, mm128_bswap32_80( edata, pdata ); sph_tiger_init( &x16rv2_ctx.tiger ); sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); - intrlv_8x64( vdata, edata, edata, edata, edata, - edata, edata, edata, edata, 640 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); break; case SKEIN: - mm512_bswap32_intrlv80_8x64( vdata, pdata ); - skein512_8way_init( &x16rv2_ctx.skein ); - skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 ); + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_prehash64( &x16r_ctx.skein, vdata ); break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); - intrlv_8x64( vdata, edata, edata, edata, edata, - edata, edata, edata, edata, 640 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); break; case HAMSI: - mm512_bswap32_intrlv80_8x64( vdata, pdata ); - hamsi512_8way_init( &x16rv2_ctx.hamsi ); - hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 ); + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + hamsi512_4way_init( &x16rv2_ctx.hamsi ); + hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 ); break; case SHABAL: - mm256_bswap32_intrlv80_8x32( vdata2, pdata ); - shabal512_8way_init( &x16rv2_ctx.shabal ); - shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 ); - rintrlv_8x32_8x64( vdata, vdata2, 640 ); + mm128_bswap32_intrlv80_4x32( vdata32, pdata ); + shabal512_4way_init( &x16rv2_ctx.shabal ); + shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 ); + rintrlv_4x32_4x64( vdata, vdata32, 640 ); break; case WHIRLPOOL: mm128_bswap32_80( edata, pdata ); sph_whirlpool_init( &x16rv2_ctx.whirlpool ); sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); - intrlv_8x64( vdata, edata, edata, edata, edata, - edata, edata, edata, edata, 640 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); break; default: - mm512_bswap32_intrlv80_8x64( vdata, pdata ); + mm256_bswap32_intrlv80_4x64( vdata, pdata ); } - - *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( - n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); - do - { - if ( x16rv2_8way_hash( hash, vdata, thr_id ) ) - for ( int i = 0; i < 8; i++ ) - if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n+i ); - submit_solution( work, hash+(i<<3), mythr ); - } - *noncev = _mm512_add_epi32( *noncev, - m512_const1_64( 0x0000000800000000 ) ); - n += 8; - } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} +} -#elif defined (X16RV2_4WAY) - -union _x16rv2_4way_context_overlay +int x16rv2_4way_prehash( struct work *work ) { - blake512_4way_context blake; - bmw512_4way_context bmw; -#if defined(__VAES__) - groestl512_2way_context groestl; - shavite512_2way_context shavite; - echo_2way_context echo; -#else - hashState_groestl groestl; - shavite512_context shavite; - hashState_echo echo; -#endif - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - luffa_2way_context luffa; - cubehashParam cube; - simd_2way_context 
simd; - hamsi512_4way_context hamsi; - hashState_fugue fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - sph_tiger_context tiger; -}; -typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay; - -static __thread x16rv2_4way_context_overlay x16rv2_ctx; - -// Pad the 24 bytes tiger hash to 64 bytes -inline void padtiger512( uint32_t* hash ) -{ - for ( int i = 6; i < 16; i++ ) hash[i] = 0; + x16r_gate_get_hash_order( work, x16r_hash_order ); + x16rv2_4way_do_prehash( x16r_4way_vdata, work->data ); + return 1; } int x16rv2_4way_hash( void* output, const void* input, int thrid ) @@ -1048,107 +1007,4 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) return 1; } -int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t vdata32[20*4] __attribute__ ((aligned (64))); - uint32_t edata[20]; - uint32_t bedata1[2]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 4; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - __m256i *noncev = (__m256i*)vdata + 9; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - const bool bench = opt_benchmark; - - if ( bench ) ptarget[7] = 0x0fff; - - bedata1[0] = bswap_32( pdata[1] ); - bedata1[1] = bswap_32( pdata[2] ); - - static __thread uint32_t s_ntime = UINT32_MAX; - const uint32_t ntime = bswap_32(pdata[17]); - if ( s_ntime != ntime ) - { - x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); - } - - // Do midstate prehash on hash functions with block size <= 64 bytes. - const char elem = x16r_hash_order[0]; - const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; - switch ( algo ) - { - case JH: - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - jh512_4way_init( &x16rv2_ctx.jh ); - jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 ); - break; - case KECCAK: - case LUFFA: - case SHA_512: - mm128_bswap32_80( edata, pdata ); - sph_tiger_init( &x16rv2_ctx.tiger ); - sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); - intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); - break; - case SKEIN: - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_prehash64( &x16r_ctx.skein, vdata ); - break; - case CUBEHASH: - mm128_bswap32_80( edata, pdata ); - cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); - intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); - break; - case HAMSI: - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - hamsi512_4way_init( &x16rv2_ctx.hamsi ); - hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 ); - break; - case SHABAL: - mm128_bswap32_intrlv80_4x32( vdata32, pdata ); - shabal512_4way_init( &x16rv2_ctx.shabal ); - shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 ); - rintrlv_4x32_4x64( vdata, vdata32, 640 ); - break; - case WHIRLPOOL: - mm128_bswap32_80( edata, pdata ); - sph_whirlpool_init( &x16rv2_ctx.whirlpool ); - sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); - intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); - break; - default: - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - } - - *noncev = mm256_intrlv_blend_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); - - do - { - if ( x16rv2_4way_hash( hash, vdata, thr_id ) ) - for ( int i = 0; i < 4; i++ ) - if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n+i ); - submit_solution( work, hash+(i<<3), mythr ); - } - *noncev = _mm256_add_epi32( *noncev, - m256_const1_64( 0x0000000400000000 ) ); - n += 4; - } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} - #endif diff --git a/algo/x16/x16rv2.c b/algo/x16/x16rv2.c index 4173afc..aa5673f 100644 --- a/algo/x16/x16rv2.c +++ b/algo/x16/x16rv2.c @@ -43,9 +43,16 @@ inline void padtiger512(uint32_t* hash) { for (int i = (24/4); i < (64/4); i++) hash[i] = 0; } -int x16rv2_hash( void* output, const void* input, int thrid ) +// no prehash +int x16rv2_prehash( const struct work *work ) { - uint32_t _ALIGN(128) hash[16]; + x16r_gate_get_hash_order( work, x16r_hash_order ); + return 1; +} + +int x16rv2_hash( void* output, const void* input, const int thrid ) +{ + uint32_t _ALIGN(32) hash[16]; x16rv2_context_overlay ctx; void *in = (void*) input; int size = 80; @@ -170,52 +177,4 @@ int x16rv2_hash( void* output, const void* input, int thrid ) return 1; } -int scanhash_x16rv2( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) edata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const int thr_id = mythr->id; - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - const bool bench = opt_benchmark; - - casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( edata, 4 ) = 
mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - static __thread uint32_t s_ntime = UINT32_MAX; - if ( s_ntime != pdata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", - x16r_hash_order, ntime ); - } - - if ( bench ) ptarget[7] = 0x0cff; - - do - { - edata[19] = nonce; - if ( x16rv2_hash( hash32, edata, thr_id ) ) - if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( nonce ); - submit_solution( work, hash32, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 0; -} - #endif diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 2f27116..b605e31 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -30,7 +30,7 @@ union _x21s_8way_context_overlay typedef union _x21s_8way_context_overlay x21s_8way_context_overlay; -int x21s_8way_hash( void* output, const void* input, int thrid ) +int x21s_8way_hash( void* output, const void* input, const int thrid ) { uint32_t vhash[16*8] __attribute__ ((aligned (128))); uint8_t shash[64*8] __attribute__ ((aligned (64))); @@ -129,66 +129,6 @@ int x21s_8way_hash( void* output, const void* input, int thrid ) return 1; } -int scanhash_x21s_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[16*8] __attribute__ ((aligned (128))); - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &hash[7<<3]; - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t bedata1[2] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - const uint32_t last_nonce = max_nonce - 16; - const int thr_id = mythr->id; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - volatile uint8_t *restart = &(work_restart[thr_id].restart); - const bool bench = opt_benchmark; - - if ( bench ) ptarget[7] = 0x0cff; - - bedata1[0] = bswap_32( pdata[1] ); - bedata1[1] = bswap_32( pdata[2] ); - - static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t ntime = bswap_32( pdata[17] ); - if ( s_ntime != ntime ) - { - x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); - } - - x16r_8way_prehash( vdata, pdata ); - *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( - n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); - do - { - if ( x21s_8way_hash( hash, vdata, thr_id ) ) - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash7[lane] <= Htarg ) ) - { - extr_lane_8x32( lane_hash, hash, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n + lane ); - submit_solution( work, lane_hash, mythr ); - } - } - *noncev = _mm512_add_epi32( *noncev, - m512_const1_64( 0x0000000800000000 ) ); - n += 8; - } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} - bool x21s_8way_thread_init() { const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols @@ -215,7 +155,7 @@ union _x21s_4way_context_overlay typedef union _x21s_4way_context_overlay x21s_4way_context_overlay; -int x21s_4way_hash( void* 
output, const void* input, int thrid ) +int x21s_4way_hash( void* output, const void* input, const int thrid ) { uint32_t vhash[16*4] __attribute__ ((aligned (64))); uint8_t shash[64*4] __attribute__ ((aligned (64))); @@ -291,58 +231,6 @@ int x21s_4way_hash( void* output, const void* input, int thrid ) return 1; } -int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr) -{ - uint32_t hash[16*4] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t bedata1[2] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 4; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - if ( bench ) ptarget[7] = 0x0cff; - - bedata1[0] = bswap_32( pdata[1] ); - bedata1[1] = bswap_32( pdata[2] ); - - static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t ntime = bswap_32( pdata[17] ); - if ( s_ntime != ntime ) - { - x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime ); - } - - x16r_4way_prehash( vdata, pdata ); - *noncev = mm256_intrlv_blend_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); - do - { - if ( x21s_4way_hash( hash, vdata, thr_id ) ) - for ( int i = 0; i < 4; i++ ) - if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n+i ); - submit_solution( work, hash+(i<<3), mythr ); - } - *noncev = _mm256_add_epi32( *noncev, - m256_const1_64( 0x0000000400000000 ) ); - n += 4; - } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - pdata[19] = n; - *hashes_done = n - first_nonce; - return 0; -} - bool x21s_4way_thread_init() { const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index 96782e2..5234867 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -27,7 +27,7 @@ union _x21s_context_overlay }; typedef union _x21s_context_overlay x21s_context_overlay; -int x21s_hash( void* output, const void* input, int thrid ) +int x21s_hash( void* output, const void* input, const int thrid ) { uint32_t _ALIGN(128) hash[16]; x21s_context_overlay ctx; @@ -57,50 +57,6 @@ int x21s_hash( void* output, const void* input, int thrid ) return 1; } -int scanhash_x21s( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) edata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const int thr_id = mythr->id; - uint32_t nonce = first_nonce; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - const bool bench = opt_benchmark; - if ( bench ) ptarget[7] = 0x0cff; - - mm128_bswap32_80( edata, pdata ); - - static __thread uint32_t s_ntime = UINT32_MAX; - if ( s_ntime != pdata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order ); - s_ntime = ntime; - if ( opt_debug && !thr_id ) - applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); - } - - x16r_prehash( edata, pdata ); - - do - { - edata[19] = nonce; - if ( x21s_hash( hash32, edata, thr_id ) ) - if ( 
unlikely( valid_hash( hash32, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( nonce ); - submit_solution( work, hash32, mythr ); - } - nonce++; - } while ( nonce < max_nonce && !(*restart) ); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 0; -} - bool x21s_thread_init() { const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index 54d119e..f2da57b 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -31,8 +31,26 @@ yespower_params_t yespower_params; +// master g_work +sha256_context yespower_sha256_prehash_ctx; +uint32_t _ALIGN(64) yespower_endiandata[20]; + +// local work __thread sha256_context sha256_prehash_ctx; + +int yespower_sha256_prehash( struct work *work ) +{ + uint32_t *pdata = work->data; + + for ( int k = 0; k < 19; k++ ) + be32enc( &yespower_endiandata[k], pdata[k] ); + + sha256_ctx_init( &yespower_sha256_prehash_ctx ); + sha256_update( &yespower_sha256_prehash_ctx, yespower_endiandata, 64 ); + + return 1; +} // YESPOWER int yespower_hash( const char *input, char *output, uint32_t len, int thrid ) @@ -53,13 +71,14 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; - for ( int k = 0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); - endiandata[19] = n; +// pthread_rwlock_rdlock( &g_work_lock ); - // do sha256 prehash - sha256_ctx_init( &sha256_prehash_ctx ); - sha256_update( &sha256_prehash_ctx, endiandata, 64 ); + memcpy( endiandata, yespower_endiandata, sizeof endiandata ); + memcpy( &sha256_prehash_ctx, &yespower_sha256_prehash_ctx, sizeof sha256_prehash_ctx ); + +// pthread_rwlock_unlock( &g_work_lock ); + + endiandata[19] = n; do { if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) ) @@ -140,6 +159,7 @@ bool register_yespower_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; + gate->prehash = (void*)&yespower_sha256_prehash; gate->hash = (void*)&yespower_hash; opt_target_factor = 65536.0; return true; @@ -154,6 +174,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate ) yespower_params.perslen = 0; gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; + gate->prehash = (void*)&yespower_sha256_prehash; gate->hash = (void*)&yespower_hash; opt_target_factor = 65536.0; return true; @@ -165,6 +186,7 @@ bool register_yescrypt_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; + gate->prehash = (void*)&yespower_sha256_prehash; yespower_params.version = YESPOWER_0_5; opt_target_factor = 65536.0; @@ -198,6 +220,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; + gate->prehash = (void*)&yespower_sha256_prehash; yespower_params.version = YESPOWER_0_5; yespower_params.N = 2048; yespower_params.r = 8; @@ -211,6 +234,7 @@ bool register_yescryptr16_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; + gate->prehash = (void*)&yespower_sha256_prehash; yespower_params.version = YESPOWER_0_5; yespower_params.N = 4096; yespower_params.r = 16; @@ -224,6 +248,7 @@ bool register_yescryptr32_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; + gate->prehash = (void*)&yespower_sha256_prehash; 
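/*
 * Why a single SHA-256 prehash pays off for the whole yespower family:
 * the 80-byte block header spans two SHA-256 blocks and the nonce lives
 * in the second one, so the compression of the first 64 bytes is the
 * same for every nonce tried. The hook registered above computes that
 * midstate once per new work; each miner thread then clones the saved
 * context instead of re-hashing those 64 bytes. A simplified sketch of
 * the two sides (sha256_context, sha256_ctx_init, sha256_update and
 * be32enc are the miner's own helpers as used in this diff; the framing
 * is illustrative, not the exact code):
 *
 *    // once per new work, in the prehash hook (master copy):
 *    for ( int k = 0; k < 19; k++ )                // every word but the nonce
 *       be32enc( &yespower_endiandata[k], pdata[k] );
 *    sha256_ctx_init( &yespower_sha256_prehash_ctx );
 *    sha256_update( &yespower_sha256_prehash_ctx, yespower_endiandata, 64 );
 *
 *    // per thread, in scanhash_yespower (thread-local copy):
 *    memcpy( &sha256_prehash_ctx, &yespower_sha256_prehash_ctx,
 *            sizeof sha256_prehash_ctx );
 *    endiandata[19] = n;                           // only the nonce varies
 */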
yespower_params.version = YESPOWER_0_5; yespower_params.N = 4096; yespower_params.r = 32; diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index aa19004..c3d21e6 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -80,6 +80,8 @@ extern yespower_params_t yespower_params; extern __thread sha256_context sha256_prehash_ctx; +int yespower_sha256_prehash( struct work *work ); + /** * yespower_init_local(local): * Initialize the thread-local (RAM) data structure. Actual memory allocation diff --git a/configure b/configure index 4618da4..c425551 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.2. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.3. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.21.2' -PACKAGE_STRING='cpuminer-opt 3.21.2' +PACKAGE_VERSION='3.21.3' +PACKAGE_STRING='cpuminer-opt 3.21.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.21.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.21.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.21.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.21.3:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.21.2 +cpuminer-opt configure 3.21.3 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.21.2, which was +It was created by cpuminer-opt $as_me 3.21.3, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.21.2' + VERSION='3.21.3' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.21.2, which was +This file was extended by cpuminer-opt $as_me 3.21.3, which was generated by GNU Autoconf 2.71. 
diff --git a/cpu-miner.c b/cpu-miner.c
index 374123d..8915075 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include "sysinfos.c"
 #include "algo/sha/sha256d.h"
@@ -317,8 +318,9 @@ static void affine_to_cpu( struct thr_info *thr )
    if ( !ok )
    {
       last_error = GetLastError();
-      applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
-              thread, last_error );
+      if ( !thread )
+         applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
+                 last_error, thread );
    }
 }
 
@@ -1725,9 +1727,9 @@
    struct work *ret_work;
    int failures = 0;
 
-   ret_work = (struct work*) calloc( 1, sizeof(*ret_work) );
-   if ( !ret_work )
-      return false;
+   ret_work = (struct work*) _mm_malloc( sizeof(*ret_work), 32 );
+   if ( !ret_work ) return false;
+   memset( ret_work, 0, sizeof(*ret_work) );
 
    /* obtain new work from bitcoin via JSON-RPC */
    while ( !get_upstream_work( curl, ret_work ) )
@@ -1736,22 +1738,23 @@
      {
         applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
        free( ret_work );
-        return false;
+         return false;
      }
 
      /* pause, then restart work-request loop */
-     applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
-             opt_fail_pause );
+      applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
+              opt_fail_pause );
      sleep( opt_fail_pause );
    }
 
    /* send work to requesting thread */
    if ( !tq_push(wc->thr->q, ret_work ) )
-      free( ret_work );
+     free( ret_work );
 
    return true;
 }
 
+
 static bool workio_submit_work(struct workio_cmd *wc, CURL *curl)
 {
    int failures = 0;
@@ -1970,15 +1973,15 @@
       float temp = cpu_temp(0);
       if (temp > opt_max_temp)
       {
-         if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
-            applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp);
-         state = false;
+         if ( !thr_id && !conditional_state[thr_id] && !opt_quiet )
+            applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...",
+                   temp, opt_max_temp );
+         state = false;
       }
    }
    if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
    {
      if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
-         applog(LOG_INFO, "network diff too high, waiting...");
+         applog(LOG_NOTICE, "network diff too high, waiting...");
      state = false;
    }
    if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate)
@@ -1987,12 +1990,14 @@
      {
         char rate[32];
         format_hashrate(opt_max_rate, rate);
-        applog(LOG_INFO, "network hashrate too high, waiting %s...", rate);
+        applog(LOG_NOTICE, "network hashrate too high (%s), waiting...", rate);
      }
      state = false;
    }
-   if (thr_id < MAX_CPUS)
-      conditional_state[thr_id] = (uint8_t) !state;
+
+   if ( conditional_state[thr_id] && state && !thr_id && !opt_quiet )
+      applog(LOG_NOTICE, "...resuming" );
+   conditional_state[thr_id] = (uint8_t) !state;
    return state;
 }
 
@@ -2117,6 +2122,10 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
                 t++ );
 
    g_work_time = time(NULL);
+
+   // Do midstate prehash
+   algo_gate.prehash( g_work );
+
    restart_threads();
    pthread_mutex_unlock( &sctx->work_lock );
 
@@ -2140,7 +2149,7 @@
    else if ( g_work->job_id && new_job )
       applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
               sctx->block_height, net_diff, g_work->job_id );
-   else if ( !opt_quiet )
+   else if ( opt_debug )
    {
       unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
                                              g_work->xnonce2_len );
@@ -2335,6 +2344,9 @@
                goto out;
             }
             g_work_time = time(NULL);
+
+            // do midstate prehash
+            algo_gate.prehash( &g_work );
             restart_threads();
          }
 
@@ -2354,6 +2366,14 @@
       if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
          continue;
 
+      // conditional mining
+      if ( unlikely( !wanna_mine( thr_id ) ) )
+      {
+         restart_threads();
+         sleep(5);
+         continue;
+      }
+
       // opt_scantime expressed in hashes
       max64 = opt_scantime * thr_hashrates[thr_id];
@@ -2500,14 +2520,6 @@
            }
         }
      }   // benchmark
-
-      // conditional mining
-      if ( unlikely( !wanna_mine( thr_id ) ) )
-      {
-         sleep(5);
-         continue;
-      }
-
   }  // miner_thread loop
 
 out:
@@ -3682,7 +3694,7 @@ int main(int argc, char *argv[])
 
 #if defined(WIN32)
 
-// Are Windows CPU Groups supported?
+// Get the number of cpus, display after parsing command line
 #if defined(WINDOWS_CPU_GROUPS_ENABLED)
 
    num_cpus = 0;
   num_cpugroups = GetActiveProcessorGroupCount();
@@ -3691,8 +3703,8 @@
      int cpus = GetActiveProcessorCount( i );
      num_cpus += cpus;
 
-      if (opt_debug)
-         applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
+//      if (opt_debug)
+//         applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
   }
 
 #else
@@ -3709,7 +3721,7 @@
    sysctl(req, 2, &num_cpus, &len, NULL, 0);
 #else
    num_cpus = 1;
-#endif
+#endif
 
    if ( num_cpus < 1 )
      num_cpus = 1;
@@ -3861,6 +3873,11 @@
    }
 #endif
 
+#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
+   if ( !opt_quiet )
+      applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
+#endif
+
    if ( opt_affinity && num_cpus > max_cpus )
    {
       applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
@@ -3872,7 +3889,7 @@
    {
      for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
      {
-         while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;
+         while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
          thread_affinity_map[ thr ] = cpu % num_cpus;
      }
      if ( !opt_quiet )
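The affinity-map hunk above is easy to misread: `cpu & 63` wraps the index into the 64-bit --cpu-affinity mask, while `% num_cpus` wraps it back onto real CPUs, so a sparse mask keeps cycling over the allowed set even with more threads than set bits. A standalone sketch of that loop; the values are made up, and a zero mask (which would spin forever here) is assumed to be rejected during option parsing:

/* Demo of distributing threads over the CPUs set in a 64-bit mask. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint64_t opt_affinity = 0x5;           /* CPUs 0 and 2 allowed */
   const int num_cpus = 8, opt_n_threads = 4;
   int map[4];

   for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
   {
      /* skip CPUs whose mask bit is clear; cpu & 63 wraps the mask */
      while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
      map[thr] = cpu % num_cpus;
      printf( "thread %d -> cpu %d\n", thr, map[thr] );  /* 0 2 0 2 */
   }
   return 0;
}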
diff --git a/miner.h b/miner.h
index 9096796..92002c3 100644
--- a/miner.h
+++ b/miner.h
@@ -392,8 +392,8 @@ float cpu_temp( int core );
 
 struct work
 {
-   uint32_t target[8] __attribute__ ((aligned (64)));
-   uint32_t data[48] __attribute__ ((aligned (64)));
+   uint32_t data[48] __attribute__ ((aligned (64)));
+   uint32_t target[8] __attribute__ ((aligned (32)));
    double targetdiff;
    double sharediff;
    double stratum_diff;
diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h
index 886dd95..7623851 100644
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -470,7 +470,7 @@ static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1,
 
 #if defined(__SSSE3__)
 
-static inline void mm128_bswap32_80( void *d, void *s )
+static inline void mm128_bswap32_80( void *d, const void *s )
 {
   __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
   casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
@@ -482,7 +482,7 @@
 
 #else
 
-static inline void mm128_bswap32_80( void *d, void *s )
+static inline void mm128_bswap32_80( void *d, const void *s )
 {
    ( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] );
    ( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] );
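The SSSE3 path of mm128_bswap32_80 above relies on pshufb: each destination byte selects one source byte, so the control vector 03 02 01 00 | 07 06 05 04 | ... reverses the bytes of every 32-bit word. A standalone demo of one 16-byte step using plain intrinsics rather than the miner's casti_m128i/m128_const_64 wrappers; compile with -mssse3:

/* Byte-swap four 32-bit words at once with _mm_shuffle_epi8. */
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
   const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL,
                                              0x0405060700010203ULL );
   uint32_t in[4]  = { 0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 };
   uint32_t out[4];

   __m128i v = _mm_loadu_si128( (const __m128i*)in );
   _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi8( v, bswap_shuf ) );

   for ( int i = 0; i < 4; i++ )
      printf( "%08x -> %08x\n", in[i], out[i] );   /* 11223344 -> 44332211 */
   return 0;
}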
diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h
index 06d9fc0..e6cb70b 100644
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -385,7 +385,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_var_32( v, c ) \
    _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
 
-//
+/* Not used
 // Limited 2 input shuffle, combines shuffle with blend. The destination low
 // half is always taken from v1, and the high half from v2.
 #define mm128_shuffle2_64( v1, v2, c ) \
@@ -395,6 +395,7 @@
 #define mm128_shuffle2_32( v1, v2, c ) \
    _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
                                      _mm_castsi128_ps( v2 ), c ) );
+*/
 
 //
 // Rotate vector elements accross all lanes
@@ -406,6 +407,7 @@
 #define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
 #define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
 
+/* Not used
 #if defined(__SSSE3__)
 
 // Rotate right by c bytes, no SSE2 equivalent.
@@ -413,6 +415,7 @@
 static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 {  return _mm_alignr_epi8( v, v, c ); }
 #endif
+*/
 
 // Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
 // for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
@@ -555,68 +558,25 @@
    v2 = _mm_xor_si128( v1, v2 ); \
    v1 = _mm_xor_si128( v1, v2 );
 
-
-// alignr for 32 & 64 bit elements is only available with AVX512 but
-// emulated here. Shift argument is not needed, it's always 1.
-// Behaviour is otherwise consistent with Intel alignr intrinsics.
-
+// Concatenate { hi, lo }, rotate right by c elements and return low 128 bits.
 #if defined(__SSSE3__)
 
-#define mm128_alignr_64( v1, v2 )   _mm_alignr_epi8( v1, v2, 8 )
-#define mm128_alignr_32( v1, v2 )   _mm_alignr_epi8( v1, v2, 4 )
+// _mm_alignr_epi32 & _mm_alignr_epi64 are only available with AVX512VL but
+// are emulated here using _mm_alignr_epi8. There are no fast equivalents for
+// 256 bit vectors, though there is no need for this functionality there.
+
+#define mm128_alignr_64( hi, lo, c )   _mm_alignr_epi8( hi, lo, (c)*8 )
+#define mm128_alignr_32( hi, lo, c )   _mm_alignr_epi8( hi, lo, (c)*4 )
 
 #else
 
-#define mm128_alignr_64( v1, v2 )  _mm_or_si128( _mm_slli_si128( v1, 8 ), \
-                                                 _mm_srli_si128( v2, 8 ) )
+#define mm128_alignr_64( hi, lo, c ) \
+   _mm_or_si128( _mm_slli_si128( hi, 16-(c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
 
-#define mm128_alignr_32( v1, v2 )  _mm_or_si128( _mm_slli_si128( v1, 4 ), \
-                                                 _mm_srli_si128( v2, 4 ) )
+#define mm128_alignr_32( hi, lo, c ) \
+   _mm_or_si128( _mm_slli_si128( hi, 16-(c)*4 ), _mm_srli_si128( lo, (c)*4 ) )
 
 #endif
 
-// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
-// vrol & vror are deprecated and do not exist for larger vectors.
-// Their only use is by lyra2 blake2b when AVX2 is not available and is
-// grandfathered.
-
-#if defined(__SSSE3__)
-
-#define mm128_vror256_64( v1, v2 ) \
-do { \
-   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
-          v1 = _mm_alignr_epi8( v2, v1, 8 ); \
-          v2 = t; \
-} while(0)
-
-#define mm128_vrol256_64( v1, v2 ) \
-do { \
-   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
-          v2 = _mm_alignr_epi8( v2, v1, 8 ); \
-          v1 = t; \
-} while(0)
-
-#else  // SSE2
-
-#define mm128_vror256_64( v1, v2 ) \
-do { \
-   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
-                             _mm_slli_si128( v2, 8 ) ); \
-   v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
-                      _mm_slli_si128( v1, 8 ) ); \
-   v1 = t; \
-} while(0)
-
-#define mm128_vrol256_64( v1, v2 ) \
-do { \
-   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
-                             _mm_srli_si128( v2, 8 ) ); \
-   v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
-                      _mm_srli_si128( v1, 8 ) ); \
-   v1 = t; \
-} while(0)
-
-#endif  // SSE4.1 else SSE2
-
 #endif // __SSE2__
 
 #endif // SIMD_128_H__
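The reworked alignr macros above take the rotate count as an explicit third argument: conceptually, concatenate hi:lo into a 256-bit value, shift right by c elements, and keep the low 128 bits. A standalone check of the SSSE3 form; compile with -mssse3, and note c must be a compile-time constant because _mm_alignr_epi8 takes an immediate:

/* Verify mm128_alignr_32( hi, lo, 1 ) yields elements 1,2,3 of lo
 * followed by element 0 of hi. */
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

#define mm128_alignr_32( hi, lo, c )   _mm_alignr_epi8( hi, lo, (c)*4 )

int main(void)
{
   uint32_t a[4] = { 0, 1, 2, 3 };    /* lo */
   uint32_t b[4] = { 4, 5, 6, 7 };    /* hi */
   uint32_t r[4];

   __m128i lo = _mm_loadu_si128( (const __m128i*)a );
   __m128i hi = _mm_loadu_si128( (const __m128i*)b );
   _mm_storeu_si128( (__m128i*)r, mm128_alignr_32( hi, lo, 1 ) );

   printf( "%u %u %u %u\n", r[0], r[1], r[2], r[3] );   /* 1 2 3 4 */
   return 0;
}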
diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h
index 2580d7a..2b215e2 100644
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -239,8 +239,8 @@ static inline __m256i mm256_not( const __m256i v )
 // Mask making
 // Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
-// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
-// Effectively a sign test.
+// Create a 64 or 32 bit integer mask from MSB of 64 or 32 bit elements.
+// Effectively a sign test: if (mask[n]) then -1 else 0.
 
 #define mm256_movmask_64( v ) \
    _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
@@ -348,7 +348,7 @@
    _mm256_or_si256( _mm256_slli_epi16( v, c ), \
                     _mm256_srli_epi16( v, 16-(c) ) )
 
-// Deprecated.
+// Deprecated. Obsolete sm3, the only user, is grandfathered.
 #define mm256_rol_var_32( v, c ) \
    _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                     _mm256_srli_epi32( v, 32-(c) ) )
@@ -391,6 +391,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
 //
 // Rotate elements within each 128 bit lane of 256 bit vector.
 
+/* Not used
 // Limited 2 input shuffle
 #define mm256_shuffle2_64( v1, v2, c ) \
    _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
@@ -399,6 +400,7 @@
 #define mm256_shuffle2_32( v1, v2, c ) \
    _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
                                            _mm256_castsi256_ps( v2 ), c ) );
+*/
 
 #define mm256_swap128_64( v )   _mm256_shuffle_epi32( v, 0x4e )
 #define mm256_shuflr128_64   mm256_swap128_64
@@ -511,7 +513,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 } while(0)
 
 // swap 256 bit vectors in place.
-// This should be avoided, it's more efficient to switch references.
+// Deprecated, Shabal is the only user and it should be modified to reorder
+// instructions.
 #define mm256_swap512_256( v1, v2 ) \
    v1 = _mm256_xor_si256( v1, v2 ); \
    v2 = _mm256_xor_si256( v1, v2 ); \
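mm256_swap512_256, retained above only for Shabal, is the classic three-XOR in-place swap lifted to vectors: it needs no temporary register but costs three dependent operations, which is why the comment recommends reordering instructions instead. The same identity in scalar form:

/* Three-XOR swap: exchanges a and b with no temporary. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint64_t a = 0x1111, b = 0x2222;
   a ^= b;   /* a = a0^b0             */
   b ^= a;   /* b = b0^(a0^b0) = a0   */
   a ^= b;   /* a = (a0^b0)^a0 = b0   */
   printf( "a=%llx b=%llx\n", (unsigned long long)a,
                              (unsigned long long)b );   /* a=2222 b=1111 */
   return 0;
}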
diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h
index cad8300..60c8746 100644
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -409,19 +409,20 @@ static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
 static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 {  return _mm512_alignr_epi32( v, v, n ); }
 
+/* Not used
 #define mm512_shuflr_16( v ) \
    _mm512_permutexvar_epi16( m512_const_64( \
                      0x0000001F001E001D, 0x001C001B001A0019, \
-                     0X0018001700160015, 0X0014001300120011, \
-                     0X0010000F000E000D, 0X000C000B000A0009, \
-                     0X0008000700060005, 0X0004000300020001 ), v )
+                     0x0018001700160015, 0x0014001300120011, \
+                     0x0010000F000E000D, 0x000C000B000A0009, \
+                     0x0008000700060005, 0x0004000300020001 ), v )
 
 #define mm512_shufll_16( v ) \
    _mm512_permutexvar_epi16( m512_const_64( \
                      0x001E001D001C001B, 0x001A001900180017, \
-                     0X0016001500140013, 0X001200110010000F, \
-                     0X000E000D000C000B, 0X000A000900080007, \
-                     0X0006000500040003, 0X000200010000001F ), v )
+                     0x0016001500140013, 0x001200110010000F, \
+                     0x000E000D000C000B, 0x000A000900080007, \
+                     0x0006000500040003, 0x000200010000001F ), v )
 
 #define mm512_shuflr_8( v ) \
    _mm512_shuffle_epi8( v, m512_const_64( \
@@ -436,6 +437,7 @@
                      0x2E2D2C2B2A292827, 0x262524232221201F, \
                      0x1E1D1C1B1A191817, 0x161514131211100F, \
                      0x0E0D0C0B0A090807, 0x060504030201003F ) )
+*/
 
 // 256 bit lanes used only by lyra2, move these there
 // Rotate elements within 256 bit lanes of 512 bit vector.
@@ -449,7 +451,7 @@
 #define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
 #define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
 
-/*
+/* Not used
 // Rotate 256 bit lanes by one 32 bit element
 #define mm512_shuflr256_32( v ) \
    _mm512_permutexvar_epi32( m512_const_64( \
@@ -496,6 +498,7 @@
 //
 // Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 
+/* Not used
 // Limited 2 input, 1 output shuffle, combines shuffle with blend.
 // Like most shuffles it's limited to 128 bit lanes and like some shuffles
 // destination elements must come from a specific source arg.
@@ -506,7 +509,10 @@
 #define mm512_shuffle2_32( v1, v2, c ) \
    _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
                                            _mm512_castsi512_ps( v2 ), c ) );
+*/
 
+// These hard coded shuffles exist for consistency with AVX2 & SSE2 where
+// efficient generic versions don't exist.
 // Swap 64 bits in each 128 bit lane
 #define mm512_swap128_64( v )    _mm512_shuffle_epi32( v, 0x4e )
 #define mm512_shuflr128_64   mm512_swap128_64
@@ -516,9 +522,11 @@
 #define mm512_shuflr128_32( v )   _mm512_shuffle_epi32( v, 0x39 )
 #define mm512_shufll128_32( v )   _mm512_shuffle_epi32( v, 0x93 )
 
-// Rotate right 128 bit lanes by c bytes, versatile and just as fast
+/* Not used
+// Rotate right 128 bit lanes by c bytes, efficient generic version of above.
 static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 {  return _mm512_alignr_epi8( v, v, c ); }
+*/
 
 // Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
 // can be done with ror & rol. Defined only for convenience and consistency
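The hard-coded shuffle immediates retained above encode a permutation two bits per destination element: 0x4e is binary 01'00'11'10, selecting source elements 2,3,0,1, which swaps the 64-bit halves of each 128-bit lane; 0x39 (00'11'10'01) rotates right and 0x93 (10'01'00'11) rotates left by one 32-bit element. An SSE2 demo of the same immediates the 512-bit macros use:

/* Show what shuffle immediates 0x4e, 0x39 and 0x93 do to one lane. */
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

static void show( const char *name, __m128i v )
{
   uint32_t r[4];
   _mm_storeu_si128( (__m128i*)r, v );
   printf( "%s: %u %u %u %u\n", name, r[0], r[1], r[2], r[3] );
}

int main(void)
{
   uint32_t a[4] = { 0, 1, 2, 3 };
   __m128i v = _mm_loadu_si128( (const __m128i*)a );
   show( "0x4e swap_64  ", _mm_shuffle_epi32( v, 0x4e ) );   /* 2 3 0 1 */
   show( "0x39 shuflr_32", _mm_shuffle_epi32( v, 0x39 ) );   /* 1 2 3 0 */
   show( "0x93 shufll_32", _mm_shuffle_epi32( v, 0x93 ) );   /* 3 0 1 2 */
   return 0;
}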
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index 26d1076..1400a38 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -17,6 +17,7 @@ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32" # used by GCC
 export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
 
 # Support for Windows 7 CPU groups, AES sometimes not included in -march
+# Disabled due to CPU group incompatibilities between Intel and AMD CPUs.
 export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
 export DEFAULT_CFLAGS_OLD="-O3 -Wall"
@@ -45,7 +46,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
 ./clean-all.sh || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=icelake-client $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -53,7 +54,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
 
 # AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512.exe
@@ -61,7 +62,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
 
 # AVX2 SHA VAES: Intel Alderlake, AMD Zen3
 make clean || echo done
 rm -f config.status
-CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
+CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
@@ -69,7 +70,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
 
 # AVX2 AES SHA: AMD Zen1
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=znver1 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2-sha.exe
@@ -77,7 +78,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
 
 # AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe