From d6e8d7a46ecfeb8767ea50bf0dee096af07dcce6 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Tue, 18 Jun 2019 13:15:45 -0400 Subject: [PATCH] v3.9.4 --- INSTALL_LINUX | 3 - Makefile.am | 5 +- README.txt | 2 +- RELEASE_NOTES | 10 +- algo/argon2/argon2d/argon2d/core.c | 8 +- algo/bmw/bmw256-hash-4way.c | 57 +-- algo/lyra2/lyra2-gate.c | 16 +- algo/lyra2/lyra2-gate.h | 26 +- algo/lyra2/lyra2rev3-4way.c | 130 +++++- algo/lyra2/phi2-4way.c | 233 +++++++++++ algo/m7m.c | 5 +- algo/pluck.c | 4 +- algo/quark/anime-4way.c | 121 +++--- algo/quark/anime-gate.h | 8 +- algo/quark/anime.c | 17 +- algo/quark/hmq1725-4way.c | 618 +++++++++++++++++++++++++++++ algo/quark/hmq1725-gate.c | 17 + algo/quark/hmq1725-gate.h | 28 ++ algo/{x17 => quark}/hmq1725.c | 16 +- algo/quark/quark-4way.c | 98 +++-- algo/quark/quark-gate.h | 8 +- algo/quark/quark.c | 13 +- algo/qubit/deep-2way.c | 29 +- algo/qubit/deep-gate.h | 8 +- algo/qubit/deep.c | 11 +- algo/qubit/qubit-2way.c | 29 +- algo/qubit/qubit-gate.h | 8 +- algo/qubit/qubit.c | 13 +- algo/ripemd/lbry-4way.c | 174 ++------ algo/ripemd/lbry-gate.h | 15 +- algo/ripemd/lbry.c | 3 +- algo/scrypt.c | 3 +- algo/sha/sha256t-gate.h | 11 +- algo/simd/simd-hash-2way.c | 6 + algo/skein/skein-4way.c | 70 ++-- algo/skein/skein-gate.h | 7 +- algo/skein/skein.c | 7 +- algo/skein/skein2-4way.c | 36 +- algo/skein/skein2-gate.c | 3 +- algo/skein/skein2-gate.h | 4 +- algo/skein/skein2.c | 5 +- algo/x14/axiom.c | 3 +- algo/x14/polytimos-4way.c | 72 ++-- algo/x14/polytimos-gate.c | 1 - algo/x14/polytimos-gate.h | 9 +- algo/x14/polytimos.c | 5 +- algo/x14/veltor-4way.c | 11 +- algo/x14/veltor-gate.h | 4 +- algo/x14/veltor.c | 4 +- algo/x14/x14-4way.c | 33 +- algo/x14/x14-gate.h | 8 +- algo/x14/x14.c | 5 +- algo/x15/x15-4way.c | 35 +- algo/x15/x15-gate.h | 8 +- algo/x15/x15.c | 5 +- algo/x16/x16r-4way.c | 97 ++--- algo/x16/x16r-gate.c | 4 - algo/x16/x16r-gate.h | 6 - algo/x16/x16r.c | 26 +- algo/x17/x17-gate.c | 4 - algo/x17/x17-gate.h | 6 - algo/x17/x17.c | 117 +++--- algo/yespower/yespower-platform.c | 3 +- build-allarch.sh | 24 +- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 16 +- simd-utils.h | 28 +- simd-utils/intrlv-avx2.h | 15 + simd-utils/simd-avx2.h | 39 +- simd-utils/simd-int.h | 27 +- simd-utils/simd-mmx.h | 4 +- simd-utils/simd-sse2.h | 32 ++ simd-utils/simd-types.h | 11 +- winbuild-cross.sh | 2 +- 75 files changed, 1790 insertions(+), 781 deletions(-) create mode 100644 algo/lyra2/phi2-4way.c create mode 100644 algo/quark/hmq1725-4way.c create mode 100644 algo/quark/hmq1725-gate.c create mode 100644 algo/quark/hmq1725-gate.h rename algo/{x17 => quark}/hmq1725.c (97%) diff --git a/INSTALL_LINUX b/INSTALL_LINUX index a9934a3..264b828 100644 --- a/INSTALL_LINUX +++ b/INSTALL_LINUX @@ -42,9 +42,6 @@ openssl 1.1.0e or higher. Add one of the following, depending on the compiler version, to CFLAGS: "-march=native" or "-march=znver1" or "-msha". -Due to poor AVX2 performance on Ryzen users should add -DRYZEN_ to CFLAGS -to override multiway AVX2 on algos with sha256, and use SHA instead. - Additional instructions for static compilation can be found here: https://lxadm.com/Static_compilation_of_cpuminer Static builds should only be considered in a homogeneous HW and SW environment.
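
Editor's note: the CFLAGS advice above feeds the compile-time SIMD dispatch used throughout this patch. -march defines feature macros such as __AVX2__ and __SSE2__, and the gate headers key the lane width off them (see the lyra2-gate.h hunk below). A minimal, illustrative sketch of the pattern — the algorithm name MYALGO is hypothetical, not from this patch:

#if defined(__AVX2__)
  #define MYALGO_8WAY   // eight 32-bit lanes per __m256i
#elif defined(__SSE2__)
  #define MYALGO_4WAY   // four 32-bit lanes per __m128i
#endif

// Hypothetical helper: reports the lane count the build was compiled for.
static inline int myalgo_lanes( void )
{
#if defined(MYALGO_8WAY)
   return 8;
#elif defined(MYALGO_4WAY)
   return 4;
#else
   return 1;             // scalar fallback
#endif
}
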
diff --git a/Makefile.am b/Makefile.am index 69dd9b5..5daee49 100644 --- a/Makefile.am +++ b/Makefile.am @@ -131,6 +131,7 @@ cpuminer_SOURCES = \ algo/lyra2/lyra2h-4way.c \ algo/lyra2/allium-4way.c \ algo/lyra2/allium.c \ + algo/lyra2/phi2-4way.c \ algo/lyra2/phi2.c \ algo/m7m.c \ algo/neoscrypt/neoscrypt.c \ @@ -147,6 +148,9 @@ cpuminer_SOURCES = \ algo/quark/anime-gate.c \ algo/quark/anime.c \ algo/quark/anime-4way.c \ + algo/quark/hmq1725-gate.c \ + algo/quark/hmq1725-4way.c \ + algo/quark/hmq1725.c \ algo/qubit/qubit-gate.c \ algo/qubit/qubit.c \ algo/qubit/qubit-2way.c \ @@ -257,7 +261,6 @@ cpuminer_SOURCES = \ algo/x17/xevan-gate.c \ algo/x17/xevan.c \ algo/x17/xevan-4way.c \ - algo/x17/hmq1725.c \ algo/x17/sonoa-gate.c \ algo/x17/sonoa-4way.c \ algo/x17/sonoa.c \ diff --git a/README.txt b/README.txt index d7bca62..0103099 100644 --- a/README.txt +++ b/README.txt @@ -29,7 +29,7 @@ cpuminer-sse2.exe "-msse2" Core2, Nehalem cpuminer-aes-sse42.exe "-march=westmere" Westmere cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake -cpuminer-zen "-march=znver1 -DRYZEN_" Ryzen +cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper If you like this software feel free to donate: diff --git a/RELEASE_NOTES b/RELEASE_NOTES index b09d59d..d84f172 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -38,9 +38,17 @@ supported. Change Log ---------- +v3.9.4 + +Faster AVX2 for lyra2v3, quark, anime. +Fixed skein AVX2 regression (invalid shares since v3.9.0) and faster. +Faster skein2 with 4way AVX2 enabled. +Automatic SHA override on Ryzen CPUs, no need for -DRYZEN compile flag. +Ongoing restructuring. + v3.9.3.1 -Skippped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3. +Skipped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3. Fixed x16r algo 25% invalid share reject rate. The bug may have also affected other algos. 
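
Editor's note: on the "Automatic SHA override on Ryzen CPUs" changelog item above — the detection code itself is not in the hunks shown here, but the idea is a run-time CPUID query in place of the old -DRYZEN_ compile flag. A hedged sketch of one way to detect SHA support (illustrative only, not the miner's actual implementation; __get_cpuid_count needs GCC >= 7 or clang):

#include <cpuid.h>
#include <stdbool.h>

// CPUID leaf 7, sub-leaf 0: EBX bit 29 reports the SHA extensions.
static bool cpu_has_sha( void )
{
    unsigned int eax, ebx, ecx, edx;
    if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
        return false;
    return ( ebx >> 29 ) & 1;
}
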
diff --git a/algo/argon2/argon2d/argon2d/core.c b/algo/argon2/argon2d/argon2d/core.c index 2a2986a..e222648 100644 --- a/algo/argon2/argon2d/argon2d/core.c +++ b/algo/argon2/argon2d/argon2d/core.c @@ -112,7 +112,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory, void free_memory(const argon2_context *context, uint8_t *memory, size_t num, size_t size) { size_t memory_size = num*size; - clear_internal_memory(memory, memory_size); +// clear_internal_memory(memory, memory_size); if (context->free_cbk) { (context->free_cbk)(memory, memory_size); } else { @@ -137,7 +137,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) { int FLAG_clear_internal_memory = 0; void clear_internal_memory(void *v, size_t n) { if (FLAG_clear_internal_memory && v) { - secure_wipe_memory(v, n); +// secure_wipe_memory(v, n); } } @@ -559,7 +559,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context, context->pwdlen); if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) { - secure_wipe_memory(context->pwd, context->pwdlen); +// secure_wipe_memory(context->pwd, context->pwdlen); context->pwdlen = 0; } } @@ -580,7 +580,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context, context->secretlen); if (context->flags & ARGON2_FLAG_CLEAR_SECRET) { - secure_wipe_memory(context->secret, context->secretlen); +// secure_wipe_memory(context->secret, context->secretlen); context->secretlen = 0; } } diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index be7c5db..efcb5d2 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -537,6 +537,8 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) } } sc->ptr = ptr; + + if ( h1 != sc->H ) memcpy_128( sc->H, h1, 16 ); } @@ -571,6 +573,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, for ( u = 0; u < 16; u ++ ) buf[u] = h2[u]; + compress_small( buf, (__m128i*)final_s, h1 ); for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) @@ -1041,22 +1044,22 @@ static const __m256i final_s8[16] = void bmw256_8way_init( bmw256_8way_context *ctx ) { - ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] ); - ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] ); - ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] ); - ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] ); - ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] ); - ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] ); - ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] ); - ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] ); - ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] ); - ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] ); - ctx->H[10] = _mm256_set1_epi64x( IV256[10] ); - ctx->H[11] = _mm256_set1_epi64x( IV256[11] ); - ctx->H[12] = _mm256_set1_epi64x( IV256[12] ); - ctx->H[13] = _mm256_set1_epi64x( IV256[13] ); - ctx->H[14] = _mm256_set1_epi64x( IV256[14] ); - ctx->H[15] = _mm256_set1_epi64x( IV256[15] ); + ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] ); + ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] ); + ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] ); + ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] ); + ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] ); + ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] ); + ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] ); + ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] ); + ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] ); + ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] ); + ctx->H[10] = _mm256_set1_epi32( IV256[10] ); + ctx->H[11] = _mm256_set1_epi32( IV256[11] ); + ctx->H[12] = _mm256_set1_epi32( IV256[12] ); + ctx->H[13] = _mm256_set1_epi32( IV256[13] ); + 
ctx->H[14] = _mm256_set1_epi32( IV256[14] ); + ctx->H[15] = _mm256_set1_epi32( IV256[15] ); ctx->ptr = 0; ctx->bit_count = 0; @@ -1076,14 +1079,15 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len ) ptr = ctx->ptr; h1 = ctx->H; h2 = htmp; + while ( len > 0 ) { size_t clen; clen = buf_size - ptr; if ( clen > len ) clen = len; - memcpy_256( buf + (ptr>>3), vdata, clen >> 3 ); - vdata = vdata + (clen>>3); + memcpy_256( buf + (ptr>>2), vdata, clen >> 2 ); + vdata = vdata + (clen>>2); len -= clen; ptr += clen; if ( ptr == buf_size ) @@ -1097,6 +1101,7 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len ) } } ctx->ptr = ptr; + if ( h1 != ctx->H ) memcpy_256( ctx->H, h1, 16 ); } @@ -1106,24 +1111,26 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ) __m256i *buf; __m256i h1[16], h2[16], *h; size_t ptr, u, v; -// unsigned z; const int buf_size = 64; // bytes of one lane, compatible with len buf = ctx->buf; ptr = ctx->ptr; - buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 ); - ptr += 8; + buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 ); + ptr += 4; h = ctx->H; - if ( ptr > (buf_size - 8) ) + if ( ptr > (buf_size - 4) ) { - memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 ); compress_small_8way( buf, h, h1 ); ptr = 0; h = h1; } - memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 ); - buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count ); + memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); + buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count ); + buf[ (buf_size - 4) >> 2 ] = m256_zero; + + compress_small_8way( buf, h, h2 ); for ( u = 0; u < 16; u ++ ) diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index ff16704..66e3a25 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -47,7 +47,9 @@ bool lyra2rev3_thread_init() int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; l2v3_wholeMatrix = _mm_malloc( size, 64 ); -#if defined (LYRA2REV3_4WAY) +#if defined (LYRA2REV3_8WAY) + init_lyra2rev3_8way_ctx();; +#elif defined (LYRA2REV3_4WAY) init_lyra2rev3_4way_ctx();; #else init_lyra2rev3_ctx(); @@ -57,7 +59,10 @@ bool lyra2rev3_thread_init() bool register_lyra2rev3_algo( algo_gate_t* gate ) { -#if defined (LYRA2REV3_4WAY) +#if defined (LYRA2REV3_8WAY) + gate->scanhash = (void*)&scanhash_lyra2rev3_8way; + gate->hash = (void*)&lyra2rev3_8way_hash; +#elif defined (LYRA2REV3_4WAY) gate->scanhash = (void*)&scanhash_lyra2rev3_4way; gate->hash = (void*)&lyra2rev3_4way_hash; #else @@ -203,13 +208,18 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) bool register_phi2_algo( algo_gate_t* gate ) { - init_phi2_ctx(); +// init_phi2_ctx(); gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; gate->get_work_data_size = (void*)&phi2_get_work_data_size; gate->decode_extra_data = (void*)&phi2_decode_extra_data; gate->build_extraheader = (void*)&phi2_build_extraheader; gate->set_target = (void*)&alt_set_target; gate->get_max64 = (void*)&get_max64_0xffffLL; +#if defined(PHI2_4WAY) + gate->scanhash = (void*)&scanhash_phi2_4way; +#else + init_phi2_ctx(); gate->scanhash = (void*)&scanhash_phi2; +#endif return true; } diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index adc7eb0..e7f9e56 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -5,7 +5,9 @@ #include #include "lyra2.h" -//#if defined(__AVX2__) +#if defined(__AVX2__) + #define LYRA2REV3_8WAY +#endif #if 
defined(__SSE2__) #define LYRA2REV3_4WAY @@ -14,8 +16,14 @@ extern __thread uint64_t* l2v3_wholeMatrix; bool register_lyra2rev3_algo( algo_gate_t* gate ); +#if defined(LYRA2REV3_8WAY) -#if defined(LYRA2REV3_4WAY) +void lyra2rev3_8way_hash( void *state, const void *input ); +int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +bool init_lyra2rev3_8way_ctx(); + +#elif defined(LYRA2REV3_4WAY) void lyra2rev3_4way_hash( void *state, const void *input ); int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce, @@ -142,15 +150,29 @@ bool init_allium_ctx(); ///////////////////////////////////////// +#if defined(__AVX2__) && defined(__AES__) +// #define PHI2_4WAY +#endif + bool phi2_has_roots; bool register_phi2_algo( algo_gate_t* gate ); +#if defined(PHI2_4WAY) + +void phi2_hash_4way( void *state, const void *input ); +int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +//void init_phi2_ctx(); + +#else void phi2_hash( void *state, const void *input ); int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_phi2_ctx(); +#endif + #endif // LYRA2_GATE_H__ diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 221a31c..0326161 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -1,12 +1,138 @@ #include "lyra2-gate.h" #include -#if defined (LYRA2REV3_4WAY) - #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/cubehash/cubehash_sse2.h" + +#if defined (LYRA2REV3_8WAY) + +typedef struct { + blake256_8way_context blake; + cubehashParam cube; + bmw256_8way_context bmw; +} lyra2v3_8way_ctx_holder; + +static lyra2v3_8way_ctx_holder l2v3_8way_ctx; + +bool init_lyra2rev3_8way_ctx() +{ + blake256_8way_init( &l2v3_8way_ctx.blake ); + cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 ); + bmw256_8way_init( &l2v3_8way_ctx.bmw ); + return true; +} + +void lyra2rev3_8way_hash( void *state, const void *input ) +{ + uint32_t vhash[8*8] __attribute__ ((aligned (64))); + uint32_t hash0[8] __attribute__ ((aligned (64))); + uint32_t hash1[8] __attribute__ ((aligned (32))); + uint32_t hash2[8] __attribute__ ((aligned (32))); + uint32_t hash3[8] __attribute__ ((aligned (32))); + uint32_t hash4[8] __attribute__ ((aligned (32))); + uint32_t hash5[8] __attribute__ ((aligned (32))); + uint32_t hash6[8] __attribute__ ((aligned (32))); + uint32_t hash7[8] __attribute__ ((aligned (32))); + lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64))); + memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) ); + + blake256_8way( &ctx.blake, input, 80 ); + blake256_8way_close( &ctx.blake, vhash ); + + mm256_dintrlv_8x32( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash, 256 ); + + LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 ); + + cubehashUpdateDigest( 
&ctx.cube, (byte*) hash0, (const byte*) hash0, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 ); + + LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 ); + LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 ); + + mm256_intrlv_8x32( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, 256 ); + + bmw256_8way( &ctx.bmw, vhash, 32 ); + bmw256_8way_close( &ctx.bmw, state ); + + } + +int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (64))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t lane_hash[8]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t Htarg = ptarget[7]; + __m256i *noncev = (__m256i*)vdata + 19; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + + if ( opt_benchmark ) + ( (uint32_t*)ptarget )[7] = 0x0000ff; + + mm256_bswap_intrlv80_8x32( vdata, pdata ); + do + { + *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, + n+3, n+2, n+1, n ) ); + + lyra2rev3_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg ) + { + mm256_extract_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); + *hashes_done = n - first_nonce + 1; + return 0; +} + +#endif + +#if defined (LYRA2REV3_4WAY) + + typedef struct { blake256_4way_context blake; cubehashParam cube; diff --git a/algo/lyra2/phi2-4way.c b/algo/lyra2/phi2-4way.c new file mode 100644 index 0000000..321384f --- /dev/null +++ b/algo/lyra2/phi2-4way.c @@ -0,0 +1,233 @@ +/** + * Phi-2 algo Implementation + */ + +#include "lyra2-gate.h" + +#if defined(PHI2_4WAY) + +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/gost/sph_gost.h" +#include "algo/cubehash/cubehash_sse2.h" +#include "algo/echo/aes_ni/hash_api.h" + +typedef struct { + cubehashParam cube; + 
jh512_4way_context jh; + hashState_echo echo; +// hashState_echo echo2; + sph_gost512_context gost; + skein512_4way_context skein; +} phi2_ctx_holder; +/* +phi2_ctx_holder phi2_ctx; + +void init_phi2_ctx() +{ + cubehashInit( &phi2_ctx.cube, 512, 16, 32 ); + sph_jh512_init(&phi2_ctx.jh); + init_echo( &phi2_ctx.echo1, 512 ); + init_echo( &phi2_ctx.echo2, 512 ); + sph_gost512_init(&phi2_ctx.gost); + sph_skein512_init(&phi2_ctx.skein); +}; +*/ +void phi2_hash_4way( void *state, const void *input ) +{ + uint32_t hash[4][16] __attribute__ ((aligned (64))); + uint32_t hashA[4][16] __attribute__ ((aligned (64))); + uint32_t hashB[4][16] __attribute__ ((aligned (64))); + uint32_t vhash[4*16] __attribute__ ((aligned (64))); + +// unsigned char _ALIGN(128) hash[64]; +// unsigned char _ALIGN(128) hashA[64]; +// unsigned char _ALIGN(128) hashB[64]; + + phi2_ctx_holder ctx __attribute__ ((aligned (64))); +// memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) ); + + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hashB[0], (const byte*)input, + phi2_has_roots ? 144 : 80 ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hashB[1], (const byte*)input+144, + phi2_has_roots ? 144 : 80 ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hashB[2], (const byte*)input+288, + phi2_has_roots ? 144 : 80 ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hashB[3], (const byte*)input+432, + phi2_has_roots ? 144 : 80 ); + + LYRA2RE( &hashA[0][0], 32, &hashB[0][0], 32, &hashB[0][0], 32, 1, 8, 8 ); + LYRA2RE( &hashA[0][8], 32, &hashB[0][8], 32, &hashB[0][8], 32, 1, 8, 8 ); + LYRA2RE( &hashA[1][0], 32, &hashB[1][0], 32, &hashB[1][0], 32, 1, 8, 8 ); + LYRA2RE( &hashA[1][8], 32, &hashB[1][8], 32, &hashB[1][8], 32, 1, 8, 8 ); + LYRA2RE( &hashA[2][0], 32, &hashB[2][0], 32, &hashB[2][0], 32, 1, 8, 8 ); + LYRA2RE( &hashA[2][8], 32, &hashB[2][8], 32, &hashB[2][8], 32, 1, 8, 8 ); + LYRA2RE( &hashA[3][0], 32, &hashB[3][0], 32, &hashB[3][0], 32, 1, 8, 8 ); + LYRA2RE( &hashA[3][8], 32, &hashB[3][8], 32, &hashB[3][8], 32, 1, 8, 8 ); + + mm256_intrlv_4x64( vhash, hashA[0], hashA[1], hashA[2], hashA[3], 512 ); + + jh512_4way_init( &ctx.jh ); + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + mm256_dintrlv_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 ); + + if ( hash[0][0] & 1 ) + { + sph_gost512_init( &ctx.gost ); + sph_gost512( &ctx.gost, (const void*)hash[0], 64 ); + sph_gost512_close( &ctx.gost, (void*)hash[0] ); + } + else + { + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash[0], + (const BitSequence *)hash[0], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash[0], + (const BitSequence *)hash[0], 512 ); + } + + if ( hash[1][0] & 1 ) + { + sph_gost512_init( &ctx.gost ); + sph_gost512( &ctx.gost, (const void*)hash[1], 64 ); + sph_gost512_close( &ctx.gost, (void*)hash[1] ); + } + else + { + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash[1], + (const BitSequence *)hash[1], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash[1], + (const BitSequence *)hash[1], 512 ); + } + + if ( hash[2][0] & 1 ) + { + sph_gost512_init( &ctx.gost ); + sph_gost512( &ctx.gost, (const void*)hash[2], 64 ); + sph_gost512_close( &ctx.gost, (void*)hash[2] ); + } + else + { + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence 
*)hash[2], + (const BitSequence *)hash[2], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash[2], + (const BitSequence *)hash[2], 512 ); + } + + if ( hash[3][0] & 1 ) + { + sph_gost512_init( &ctx.gost ); + sph_gost512( &ctx.gost, (const void*)hash[3], 64 ); + sph_gost512_close( &ctx.gost, (void*)hash[3] ); + } + else + { + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash[3], + (const BitSequence *)hash[3], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash[3], + (const BitSequence *)hash[3], 512 ); + } + + mm256_intrlv_4x64( vhash, hash[0], hash[1], hash[2], hash[3], 512 ); + + skein512_4way_init( &ctx.skein ); + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + for (int i=0; i<4; i++) + { + ( (uint64_t*)vhash )[i] ^= ( (uint64_t*)vhash )[i+4]; + ( (uint64_t*)vhash+ 8 )[i] ^= ( (uint64_t*)vhash+ 8 )[i+4]; + ( (uint64_t*)vhash+16 )[i] ^= ( (uint64_t*)vhash+16 )[i+4]; + ( (uint64_t*)vhash+24 )[i] ^= ( (uint64_t*)vhash+24 )[i+4]; + } +// for ( int i = 0; i < 4; i++ ) +// casti_m256i( vhash, i ) = _mm256_xor_si256( casti_m256i( vhash, i ), +// casti_m256i( vhash, i+4 ) ); + + memcpy( state, vhash, 128 ); +} + +int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t _ALIGN(128) hash[8]; + uint32_t _ALIGN(128) edata[36]; + uint32_t vdata[4][36] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[25]); + uint32_t lane_hash[8]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + + if(opt_benchmark){ + ptarget[7] = 0x00ff; + } + +// Data is not interleaved, but hash is. +// any non-zero data at index 20 or above sets roots true. +// Split up the operations, bswap first, then set roots. 
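+// (Editor's note: the extended phi2 header is 36 32-bit words, 144 bytes;
+// the standard 80-byte header ends at word 20, so any non-zero word at
+// index >= 20 means root data is present and the 144-byte length is
+// hashed.)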
+ + phi2_has_roots = false; + for ( int i=0; i < 36; i++ ) + { + be32enc(&edata[i], pdata[i]); + if (i >= 20 && pdata[i]) phi2_has_roots = true; + } +/* + casti_m256i( vdata[0], 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); + casti_m256i( vdata[0], 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); + casti_m256i( vdata[0], 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) ); + casti_m256i( vdata[0], 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) ); + casti_m128i( vdata[0], 8 ) = mm128_bswap_32( casti_m128i( pdata, 8 ) ); + phi2_has_roots = mm128_anybits1( casti_m128i( vdata[0], 5 ) ) || + mm128_anybits1( casti_m128i( vdata[0], 6 ) ) || + mm128_anybits1( casti_m128i( vdata[0], 7 ) ) || + mm128_anybits1( casti_m128i( vdata[0], 8 ) ); +*/ + + memcpy( vdata[0], edata, 144 ); + memcpy( vdata[1], edata, 144 ); + memcpy( vdata[2], edata, 144 ); + memcpy( vdata[3], edata, 144 ); + + do { + be32enc( &vdata[0][19], n ); + be32enc( &vdata[1][19], n+1 ); + be32enc( &vdata[2][19], n+2 ); + be32enc( &vdata[3][19], n+3 ); + + phi2_hash_4way( hash, vdata ); + + for ( int lane = 0; lane < 4; lane++ ) if ( hash7[ lane<<1 ] < Htarg ) + { + mm256_extract_lane_4x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr, lane ); + } + } + n += 4; + } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce + 1; + return 0; +} + +#endif // PHI2_4WAY diff --git a/algo/m7m.c b/algo/m7m.c index 2398fe7..c913cb3 100644 --- a/algo/m7m.c +++ b/algo/m7m.c @@ -144,8 +144,8 @@ void init_m7m_ctx() #define NM7M 5 #define SW_DIVS 5 #define M7_MIDSTATE_LEN 76 -int scanhash_m7m_hash( int thr_id, struct work* work, - uint64_t max_nonce, unsigned long *hashes_done ) +int scanhash_m7m_hash( int thr_id, struct work* work, uint64_t max_nonce, + unsigned long *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -154,6 +154,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work, uint32_t hash[8] __attribute__((aligned(64))); uint8_t bhash[7][64] __attribute__((aligned(64))); uint32_t n = pdata[19] - 1; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated uint32_t usw_, mpzscale; const uint32_t first_nonce = pdata[19]; char data_str[161], hash_str[65], target_str[65]; diff --git a/algo/pluck.c b/algo/pluck.c index 9756df0..5ade09b 100644 --- a/algo/pluck.c +++ b/algo/pluck.c @@ -445,7 +445,7 @@ void pluck_hash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const i } int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -454,6 +454,8 @@ int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; volatile uint8_t *restart = &(work_restart[thr_id].restart); uint32_t n = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0ffff; diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c index 89ad927..d68cd37 100644 --- a/algo/quark/anime-4way.c +++ b/algo/quark/anime-4way.c @@ -48,8 +48,8 @@ void anime_4way_hash( void *state, const void *input ) __m256i* vhA = (__m256i*)vhashA; __m256i* vhB = (__m256i*)vhashB; __m256i vh_mask; + const uint32_t mask = 8; const __m256i bit3_mask = _mm256_set1_epi64x( 8 ); - int i; anime_4way_ctx_holder ctx; 
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) ); @@ -62,27 +62,44 @@ void anime_4way_hash( void *state, const void *input ) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), m256_zero ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (char*)hash0, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (char*)hash1, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (char*)hash2, 512 ); - reinit_groestl( &ctx.groestl ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (char*)hash3, 512 ); - mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhashB ); + if ( hash0[0] & mask ) + { + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (char*)hash0, 512 ); + } + if ( hash1[0] & mask ) + { + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (char*)hash1, 512 ); + } + if ( hash2[0] & mask ) + { + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (char*)hash2, 512 ); + } + if ( hash3[0] & mask ) + { + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (char*)hash3, 512 ); + } - for ( i = 0; i < 8; i++ ) - vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); + mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + + if ( mm256_anybits0( vh_mask ) ) + { + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhashB ); + } + + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); reinit_groestl( &ctx.groestl ); @@ -91,7 +108,8 @@ void anime_4way_hash( void *state, const void *input ) update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); jh512_4way( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); @@ -99,16 +117,20 @@ void anime_4way_hash( void *state, const void *input ) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), m256_zero ); + if ( mm256_anybits1( vh_mask ) ) + { blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); - + } + if ( mm256_anybits0( vh_mask ) ) + { bmw512_4way_init( &ctx.bmw ); bmw512_4way( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); + } - for ( i = 0; i < 8; i++ ) - vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); @@ -120,33 +142,35 @@ void anime_4way_hash( void *state, const void *input ) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), m256_zero ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhashA ); + if ( mm256_anybits1( vh_mask ) ) + { + 
keccak512_4way_init( &ctx.keccak ); + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhashA ); + } + if ( mm256_anybits0( vh_mask ) ) + { + jh512_4way_init( &ctx.jh ); + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhashB ); + } - jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhashB ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - for ( i = 0; i < 8; i++ ) - vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); - - mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 ); + mm256_dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 ); } int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*8] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 73; // 9*8 + 1 + __m256i *noncev = (__m256i*)vdata + 9; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; uint64_t htmax[] = { 0, @@ -165,10 +189,7 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce, 0 }; - swab32_array( endiandata, pdata, 20 ); - - uint64_t *edata = (uint64_t*)endiandata; - mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + mm256_bswap_intrlv80_4x64( vdata, pdata ); for (int m=0; m < 6; m++) if (Htarg <= htmax[m]) @@ -177,30 +198,26 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce, do { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); anime_4way_hash( hash, vdata ); pdata[19] = n; for ( int i = 0; i < 4; i++ ) if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) - && fulltest( hash+(i<<3), ptarget ) ) + && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; - nonces[ num_found++ ] = n+i; - work_set_target_ratio( work, hash+(i<<3) ); + submit_solution( work, hash+(i<<3), mythr, i ); } n += 4; - } while ( ( num_found == 0 ) && ( n < max_nonce ) - && !work_restart[thr_id].restart ); + } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); break; } *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/quark/anime-gate.h b/algo/quark/anime-gate.h index 1e6ac07..5dfbfec 100644 --- a/algo/quark/anime-gate.h +++ b/algo/quark/anime-gate.h @@ -13,19 +13,15 @@ bool register_anime_algo( algo_gate_t* gate ); #if defined(ANIME_4WAY) void anime_4way_hash( void *state, const void *input ); - int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_anime_4way_ctx(); #endif void anime_hash( void *state, const void *input ); - int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_anime_ctx(); #endif diff --git a/algo/quark/anime.c b/algo/quark/anime.c index eebb7c2..3f33485 100644 --- a/algo/quark/anime.c +++ b/algo/quark/anime.c @@ 
-46,20 +46,6 @@ void init_anime_ctx() void anime_hash( void *state, const void *input ) { unsigned char hash[128] __attribute__ ((aligned (32))); -/* - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashA[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*4] __attribute__ ((aligned (64))); - __m256i* vh = (__m256i*)vhash; - __m256i* vhA = (__m256i*)vhashA; - __m256i* vhB = (__m256i*)vhashB; - __m256i vh_mask; - __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 ); -*/ uint32_t mask = 8; anime_ctx_holder ctx; memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) ); @@ -134,7 +120,7 @@ void anime_hash( void *state, const void *input ) } int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done) + uint64_t *hashes_done, struct thr_info *mythr) { uint32_t hash[8] __attribute__ ((aligned (64))); uint32_t endiandata[20] __attribute__((aligned(64))); @@ -142,6 +128,7 @@ int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; uint64_t htmax[] = { 0, diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c new file mode 100644 index 0000000..30263a9 --- /dev/null +++ b/algo/quark/hmq1725-4way.c @@ -0,0 +1,618 @@ +#include "hmq1725-gate.h" + +#if defined(HMQ1725_4WAY) + +#include +#include +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/bmw-hash-4way.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/luffa/luffa_for_sse2.h" +#include "algo/cubehash/cubehash_sse2.h" +#include "algo/simd/nist.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/simd-hash-2way.h" +#include "algo/echo/aes_ni/hash_api.h" +#include "algo/hamsi/hamsi-hash-4way.h" +#include "algo/fugue/sph_fugue.h" +#include "algo/shabal/shabal-hash-4way.h" +#include "algo/whirlpool/sph_whirlpool.h" +#include "algo/haval/haval-hash-4way.h" +#include "algo/sha/sha2-hash-4way.h" + +union _hmq1725_4way_context_overlay +{ + blake512_4way_context blake; + bmw512_4way_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; + hamsi512_4way_context hamsi; + sph_fugue512_context fugue; + shabal512_4way_context shabal; + sph_whirlpool_context whirlpool; + sha512_4way_context sha512; + haval256_5_4way_context haval; +}; +typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay; + +extern void hmq1725_4way_hash(void *state, const void *input) +{ +// why so big? only really need 8, haval thing uses 16. 
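+// (Editor's note: haval256 writes 8 32-bit words per lane while the
+// 512-bit stages consume 16, so the buffers are oversized and the upper
+// half of the haval output is zeroed after each haval call below.)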
+ uint32_t hash0 [32] __attribute__ ((aligned (64))); + uint32_t hash1 [32] __attribute__ ((aligned (64))); + uint32_t hash2 [32] __attribute__ ((aligned (64))); + uint32_t hash3 [32] __attribute__ ((aligned (64))); + uint32_t vhash [32<<2] __attribute__ ((aligned (64))); + uint32_t vhashA[32<<2] __attribute__ ((aligned (64))); + uint32_t vhashB[32<<2] __attribute__ ((aligned (64))); + hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64))); + __m256i vh_mask; + const __m256i vmask = _mm256_set1_epi64x( 24 ); + const uint32_t mask = 24; + __m256i* vh = (__m256i*)vhash; + __m256i* vhA = (__m256i*)vhashA; + __m256i* vhB = (__m256i*)vhashB; + + bmw512_4way_init( &ctx.bmw ); + bmw512_4way( &ctx.bmw, input, 80 ); + bmw512_4way_close( &ctx.bmw, vhash ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + +// first fork, A is groestl serial, B is skein parallel. + + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), + m256_zero ); + +// A + +// if ( hash0[0] & mask ) +// { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (char*)hash0, 512 ); +// } +// if ( hash1[0] & mask ) +// { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (char*)hash1, 512 ); +// } +// if ( hash2[0] & mask ) +// { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (char*)hash2, 512 ); +// } +// if ( hash3[0] & mask ) +// { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (char*)hash3, 512 ); +// } + + mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + +// B + +// if ( mm256_any_clr_256( vh_mask ) ) +// { + skein512_4way_init( &ctx.skein ); + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhashB ); +// } + + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + + jh512_4way_init( &ctx.jh ); + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + keccak512_4way_init( &ctx.keccak ); + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + +// second fork, A = blake parallel, B= bmw parallel. 
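+// (Editor's note: vmask = 24 tests bits 3 and 4 of each lane's first hash
+// word; vh_mask is all-ones in lanes where those bits are zero, and
+// mm256_blend_hash_4x64 then takes the B result for those lanes and the
+// A result for the rest.)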
+ + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), + m256_zero ); + +// if ( mm256_any_set_256( vh_mask ) ) +// { + blake512_4way_init( &ctx.blake ); + blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_close( &ctx.blake, vhashA ); +// } + +// if ( mm256_any_clr_256( vh_mask ) ) +// { + bmw512_4way_init( &ctx.bmw ); + bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_close( &ctx.bmw, vhashB ); +// } + + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, 64 ); + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0, + (const BitSequence *)hash0, 64 ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1, + (const BitSequence *)hash1, 64 ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash2, + (const BitSequence *)hash2, 64 ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3, + (const BitSequence *)hash3, 64 ); + + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + +// A= keccak parallel, B= jh parallel + + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), + m256_zero ); + +// if ( mm256_any_set_256( vh_mask ) ) +// { + keccak512_4way_init( &ctx.keccak ); + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhashA ); +// } + +// if ( mm256_any_clr_256( vh_mask ) ) +// { + jh512_4way_init( &ctx.jh ); + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhashB ); +// } + + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + +// A is whirlpool serial, B is haval parallel. 
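+// (Editor's note: the serial branch works on the deinterleaved
+// hash0..hash3 and must be re-interleaved into vhashA, while the
+// parallel branch consumes the interleaved vhash directly; the two are
+// then blended per lane.)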
+ + + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), + m256_zero ); + // A + +// if ( hash0[0] & mask ) +// { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); +// } +// if ( hash1[0] & mask ) +// { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); +// } +// if ( hash2[0] & mask ) +// { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); +// } +// if ( hash3[0] & mask ) +// { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); +// } + + mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + +// B + +// if ( mm256_any_clr_256( vh_mask ) ) +// { + haval256_5_4way_init( &ctx.haval ); + haval256_5_4way( &ctx.haval, vhash, 64 ); + haval256_5_4way_close( &ctx.haval, vhashB ); + memset( &vhashB[8<<2], 0, 32<<2); +// } + + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + blake512_4way_init( &ctx.blake ); + blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_close( &ctx.blake, vhash ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + +// shavite & luffa, both serial, select individually. 
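+// (Editor's note: with two serial branches there is nothing to blend;
+// each lane simply takes its own if/else below, unlike the vectored
+// forks above where both paths run and the results are blended.)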
+ + if ( hash0[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); // + sph_shavite512_close( &ctx.shavite, hash0 ); //8 + } + else + { + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0, + (const BitSequence *)hash0, 64 ); + } + + if ( hash1[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); // + sph_shavite512_close( &ctx.shavite, hash1 ); //8 + } + else + { + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1, + (const BitSequence *)hash1, 64 ); + } + + if ( hash2[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); // + sph_shavite512_close( &ctx.shavite, hash2 ); //8 + } + else + { + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2, + (const BitSequence *)hash2, 64 ); + } + + if ( hash3[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); // + sph_shavite512_close( &ctx.shavite, hash3 ); //8 + } + else + { + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3, + (const BitSequence *)hash3, 64 ); + } + + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_close( &ctx.hamsi, vhash ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + + +// A echo, B sd both serial + + if ( hash0[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + } + else + { + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + } + + if ( hash1[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + } + else + { + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + } + + if ( hash2[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + } + else + { + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + } + + if ( hash3[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + } + else + { + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + } + + mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + + shabal512_4way_init( &ctx.shabal ); + shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_close( &ctx.shabal, vhash ); + + mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 
64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + +// A = fugue serial, B = sha512 parallel + + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), + m256_zero ); + +// if ( hash0[0] & mask ) +// { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); +// } +// if ( hash1[0] & mask ) +// { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); +// } +// if ( hash2[0] & mask ) +// { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); +// } +// if ( hash3[0] & mask ) +// { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); +// } + + mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + +// if ( mm256_any_clr_256( vh_mask ) ) +// { + sha512_4way_init( &ctx.sha512 ); + sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_close( &ctx.sha512, vhashB ); +// } + + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + sha512_4way_init( &ctx.sha512 ); + sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_close( &ctx.sha512, vhash ); + +// A = haval parallel, B = Whirlpool serial + + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), + m256_zero ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + +// if ( mm256_any_set_256( vh_mask ) ) //4 +// { + haval256_5_4way_init( &ctx.haval ); + haval256_5_4way( &ctx.haval, vhash, 64 ); + haval256_5_4way_close( &ctx.haval, vhashA ); + memset( &vhashA[8<<2], 0, 32<<2 ); +// } + +// if ( !( hash0[0] & mask ) ) +// { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); +// } +// if ( !( hash1[0] & mask ) ) +// { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); +// } +// if ( !( hash2[0] & mask ) ) +// { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); +// } +// if ( !( hash3[0] & mask ) ) +// { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); +// } + + mm256_intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 ); + + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + + bmw512_4way_init(
&ctx.bmw ); + bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_close( &ctx.bmw, vhash ); + + memcpy(state, vhash, 32<<2 ); +} + +int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); +// uint32_t *hash7 = &(hash[7<<2]); +// uint32_t lane_hash[8]; + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + __m256i *noncev = (__m256i*)vdata + 9; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + mm256_bswap_intrlv80_4x64( vdata, pdata ); + for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[ m ]; + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + hmq1725_4way_hash( hash, vdata ); + for ( int i = 0; i < 4; i++ ) + if ( ( (hash+(i<<3))[7] & mask ) == 0 ) + { + if ( fulltest( (hash+(i<<3)), ptarget ) && !opt_benchmark ) + { + pdata[19] = n + i; + submit_solution( work, (hash+(i<<3)), mythr, i ); + } + } + n += 4; + } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart ); + break; + } + *hashes_done = n - first_nonce + 1; + return 0; +} + +#endif // HMQ1725_4WAY diff --git a/algo/quark/hmq1725-gate.c b/algo/quark/hmq1725-gate.c new file mode 100644 index 0000000..7fd327c --- /dev/null +++ b/algo/quark/hmq1725-gate.c @@ -0,0 +1,17 @@ +#include "hmq1725-gate.h" + +bool register_hmq1725_algo( algo_gate_t* gate ) +{ +#if defined(HMQ1725_4WAY) + gate->scanhash = (void*)&scanhash_hmq1725_4way; + gate->hash = (void*)&hmq1725_4way_hash; +#else + init_hmq1725_ctx(); + gate->scanhash = (void*)&scanhash_hmq1725; + gate->hash = (void*)&hmq1725hash; +#endif + gate->set_target = (void*)&scrypt_set_target; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + return true; +}; + diff --git a/algo/quark/hmq1725-gate.h b/algo/quark/hmq1725-gate.h new file mode 100644 index 0000000..9521cd2 --- /dev/null +++ b/algo/quark/hmq1725-gate.h @@ -0,0 +1,28 @@ +#ifndef HMQ1725_GATE_H__ +#define HMQ1725_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(__AVX2__) && defined(__AES__) +// #define HMQ1725_4WAY +#endif + +bool register_hmq1725_algo( algo_gate_t* gate ); + +#if defined(HMQ1725_4WAY) + +void hmq1725_4way_hash( void *state, const void *input ); +int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#else + +void hmq1725hash( void *state, const void *input ); +int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_hmq1725_ctx(); + +#endif + +#endif // HMQ1725_GATE_H__ diff --git a/algo/x17/hmq1725.c b/algo/quark/hmq1725.c similarity index 97% rename from algo/x17/hmq1725.c rename to algo/quark/hmq1725.c index 46ab6c9..66d081b 100644 --- a/algo/x17/hmq1725.c +++ b/algo/quark/hmq1725.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "hmq1725-gate.h" #include #include #include "algo/blake/sph_blake.h" @@ -298,10 +298,11 @@ extern void hmq1725hash(void *state, const void *input) memcpy(state, hashA, 32); } -int scanhash_hmq1725( int thr_id, struct 
work *work, int32_t max_nonce, +int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t endiandata[32] __attribute__((aligned(64))); +// uint32_t endiandata[32] __attribute__((aligned(64))); + uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t hash64[8] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -311,7 +312,8 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce, //const uint32_t Htarg = ptarget[7]; //we need bigendian data... - for (int k = 0; k < 32; k++) +// for (int k = 0; k < 32; k++) + for (int k = 0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); hmq_bmw512_midstate( endiandata ); @@ -407,14 +409,14 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce, pdata[19] = n; return 0; } - +/* bool register_hmq1725_algo( algo_gate_t* gate ) { init_hmq1725_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; gate->set_target = (void*)&scrypt_set_target; gate->scanhash = (void*)&scanhash_hmq1725; gate->hash = (void*)&hmq1725hash; return true; }; - +*/ diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index bd350cf..a2237bf 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -48,9 +48,10 @@ void quark_4way_hash( void *state, const void *input ) __m256i* vhA = (__m256i*)vhashA; __m256i* vhB = (__m256i*)vhashB; __m256i vh_mask; - __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 ); - int i; quark_4way_ctx_holder ctx; + const __m256i bit3_mask = _mm256_set1_epi64x( 8 ); + const uint32_t mask = 8; + memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) ); blake512_4way( &ctx.blake, input, 80 ); @@ -62,27 +63,44 @@ void quark_4way_hash( void *state, const void *input ) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), m256_zero ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + if ( hash0[0] & mask ) + { update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + } + if ( hash1[0] & mask ) + { reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + } + if ( hash2[0] & mask ) + { reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + } + if ( hash3[0] & mask ) + { reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + } + mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + + if ( mm256_anybits0( vh_mask ) ) + { skein512_4way( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhashB ); + } - for ( i = 0; i < 8; i++ ) - vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); + + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); reinit_groestl( &ctx.groestl ); @@ -91,7 +109,8 @@ void quark_4way_hash( void *state, const void *input ) update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - 
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); jh512_4way( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); @@ -99,16 +118,21 @@ void quark_4way_hash( void *state, const void *input ) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), m256_zero ); + if ( mm256_anybits1( vh_mask ) ) + { blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); + } + if ( mm256_anybits0( vh_mask ) ) + { bmw512_4way_init( &ctx.bmw ); bmw512_4way( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); + } - for ( i = 0; i < 8; i++ ) - vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); + mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); @@ -120,63 +144,65 @@ void quark_4way_hash( void *state, const void *input ) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), m256_zero ); + if ( mm256_anybits1( vh_mask ) ) + { keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); + } + if ( mm256_anybits0( vh_mask ) ) + { jh512_4way_init( &ctx.jh ); jh512_4way( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); + } - for ( i = 0; i < 8; i++ ) - vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); - - mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 ); + // Final blend, directly to state, only need 32 bytes. + casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask ); + casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask ); + casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask ); + casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask ); } int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*8] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[25]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 73; // 9*8 + 1 - - swab32_array( endiandata, pdata, 20 ); - - uint64_t *edata = (uint64_t*)endiandata; - mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + __m256i *noncev = (__m256i*)vdata + 9; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + mm256_bswap_intrlv80_4x64( vdata, pdata ); do { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); quark_4way_hash( hash, vdata ); pdata[19] = n; for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 ) - && fulltest( hash+(i<<3), ptarget ) ) + if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 ) { - pdata[19] = n+i; - nonces[ num_found++ ] = n+i; - work_set_target_ratio( work, hash+(i<<3) ); + mm256_extract_lane_4x64( lane_hash, hash, i, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_solution( work, lane_hash, mythr, i ); + } 
} n += 4; - } while ( ( num_found == 0 ) && ( n < max_nonce ) - && !work_restart[thr_id].restart ); + } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/quark/quark-gate.h b/algo/quark/quark-gate.h index 1191d46..20b5750 100644 --- a/algo/quark/quark-gate.h +++ b/algo/quark/quark-gate.h @@ -13,19 +13,15 @@ bool register_quark_algo( algo_gate_t* gate ); #if defined(QUARK_4WAY) void quark_4way_hash( void *state, const void *input ); - int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_quark_4way_ctx(); #endif void quark_hash( void *state, const void *input ); - int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_quark_ctx(); #endif diff --git a/algo/quark/quark.c b/algo/quark/quark.c index debad08..6f3ca6c 100644 --- a/algo/quark/quark.c +++ b/algo/quark/quark.c @@ -173,16 +173,17 @@ void quark_hash(void *state, const void *input) } int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done) + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t hash64[8] __attribute__((aligned(32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated - swab32_array( endiandata, pdata, 20 ); + swab32_array( endiandata, pdata, 20 ); do { pdata[19] = ++n; diff --git a/algo/qubit/deep-2way.c b/algo/qubit/deep-2way.c index a84fad0..ad252d9 100644 --- a/algo/qubit/deep-2way.c +++ b/algo/qubit/deep-2way.c @@ -64,7 +64,7 @@ void deep_2way_hash( void *output, const void *input ) } int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*8] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); @@ -73,17 +73,17 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - uint32_t *nonces = work->nonces; - int num_found = 0; uint32_t *noncep = vdata + 32+3; // 4*8 + 3 + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0 }; - // big endian encode 0..18 uint32_t, 64 bits at a time - swab32_array( endiandata, pdata, 20 ); + casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); + casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); + casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); uint64_t *edata = (uint64_t*)endiandata; mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 ); @@ -102,23 +102,24 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce, deep_2way_hash( hash, vdata ); pdata[19] = n; - if ( !( hash[7] & mask ) && fulltest( hash, ptarget) ) + 
if ( !( hash[7] & mask ) ) + if ( fulltest( hash, ptarget) && !opt_benchmark ) { - nonces[ num_found++ ] = n; - work_set_target_ratio( work, hash ); + pdata[19] = n; + submit_solution( work, hash, mythr, 0 ); } - if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) ) + if ( !( (hash+8)[7] & mask ) ) + if ( fulltest( hash+8, ptarget) && !opt_benchmark ) { - nonces[ num_found++ ] = n+1; - work_set_target_ratio( work, hash+8 ); + pdata[19] = n+1; + submit_solution( work, hash+8, mythr, 1 ); } n += 2; - } while ( ( num_found == 0 ) && ( n < max_nonce ) - && !work_restart[thr_id].restart ); + } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); break; } *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/qubit/deep-gate.h b/algo/qubit/deep-gate.h index b91f968..ded8f28 100644 --- a/algo/qubit/deep-gate.h +++ b/algo/qubit/deep-gate.h @@ -13,19 +13,15 @@ bool register_deep_algo( algo_gate_t* gate ); #if defined(DEEP_2WAY) void deep_2way_hash( void *state, const void *input ); - int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_deep_2way_ctx(); #endif void deep_hash( void *state, const void *input ); - int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_deep_ctx(); #endif diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c index ed49e9c..9dc24a2 100644 --- a/algo/qubit/deep.c +++ b/algo/qubit/deep.c @@ -72,14 +72,15 @@ void deep_hash(void *output, const void *input) } int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done) + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(32))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t hash64[8] __attribute__((aligned(32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c index 7503c0f..dc7d4ad 100644 --- a/algo/qubit/qubit-2way.c +++ b/algo/qubit/qubit-2way.c @@ -70,7 +70,7 @@ void qubit_2way_hash( void *output, const void *input ) } int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*8] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); @@ -79,17 +79,17 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - uint32_t *nonces = work->nonces; - int num_found = 0; uint32_t *noncep = vdata + 32+3; // 4*8 + 3 + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0 }; - // big endian encode 0..18 uint32_t, 64 bits at a time - swab32_array( endiandata, pdata, 20 ); + casti_m256i( endiandata, 0 ) = 
mm256_bswap_32( casti_m256i( pdata, 0 ) ); + casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); + casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); uint64_t *edata = (uint64_t*)endiandata; mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 ); @@ -107,25 +107,24 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce, qubit_2way_hash( hash, vdata ); pdata[19] = n; - - if ( !( hash[7] & mask ) && fulltest( hash, ptarget) ) + if ( !( hash[7] & mask ) ) + if ( fulltest( hash, ptarget) && !opt_benchmark ) { - nonces[ num_found++ ] = n; - work_set_target_ratio( work, hash ); + pdata[19] = n; + submit_solution( work, hash, mythr, 0 ); } - if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) ) + if ( !( (hash+8)[7] & mask ) ) + if ( fulltest( hash+8, ptarget) && !opt_benchmark ) { pdata[19] = n+1; - nonces[ num_found++ ] = n+1; - work_set_target_ratio( work, hash+8 ); + submit_solution( work, hash+8, mythr, 1 ); } n += 2; - } while ( ( num_found == 0 ) && ( n < max_nonce ) - && !work_restart[thr_id].restart ); + } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); break; } *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/qubit/qubit-gate.h b/algo/qubit/qubit-gate.h index 953c1cb..98af09a 100644 --- a/algo/qubit/qubit-gate.h +++ b/algo/qubit/qubit-gate.h @@ -13,19 +13,15 @@ bool register_qubit_algo( algo_gate_t* gate ); #if defined(QUBIT_2WAY) void qubit_2way_hash( void *state, const void *input ); - int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_qubit_2way_ctx(); #endif void qubit_hash( void *state, const void *input ); - int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_qubit_ctx(); #endif diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c index 3a3be9f..d90eeb3 100644 --- a/algo/qubit/qubit.c +++ b/algo/qubit/qubit.c @@ -83,15 +83,16 @@ void qubit_hash(void *output, const void *input) memcpy(output, hash, 32); } -int scanhash_qubit(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t hash64[8] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c index 13aa8e5..383981e 100644 --- a/algo/ripemd/lbry-4way.c +++ b/algo/ripemd/lbry-4way.c @@ -40,9 +40,9 @@ void lbry_8way_hash( void* output, const void* input ) sha256_8way_close( &ctx_sha256, vhashA ); // reinterleave to do sha512 4-way 64 bit twice. 
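[Editor's illustration] The renames running through this patch (mm256_interleave_4x64 to mm256_intrlv_4x64, deinterleave to dintrlv) are mechanical; the data layout is unchanged. Since the comment just above leans on that layout (sha256 is computed 8-way over 32-bit words, sha512 4-way over 64-bit words), here is a scalar reference model of the 4x64 form these kernels move through. The real helpers are vectorized; these loops only pin down what they must produce, with bit_len the per-lane length in bits.

   #include <stdint.h>

   // 4x64 interleaving: 64-bit element e of lane l is stored at
   // v[ e*4 + l ], so each group of four u64 holds the same element
   // for all four lanes.
   static void intrlv_4x64( uint64_t *v, const uint64_t *h0,
                            const uint64_t *h1, const uint64_t *h2,
                            const uint64_t *h3, int bit_len )
   {
      for ( int e = 0; e < bit_len / 64; e++ )
      {
         v[ e*4     ] = h0[e];
         v[ e*4 + 1 ] = h1[e];
         v[ e*4 + 2 ] = h2[e];
         v[ e*4 + 3 ] = h3[e];
      }
   }

   static void dintrlv_4x64( uint64_t *h0, uint64_t *h1, uint64_t *h2,
                             uint64_t *h3, const uint64_t *v,
                             int bit_len )
   {
      for ( int e = 0; e < bit_len / 64; e++ )
      {
         h0[e] = v[ e*4     ];
         h1[e] = v[ e*4 + 1 ];
         h2[e] = v[ e*4 + 2 ];
         h3[e] = v[ e*4 + 3 ];
      }
   }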
- mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 ); - mm256_interleave_4x64( vhashA, h0, h1, h2, h3, 256 ); - mm256_interleave_4x64( vhashB, h4, h5, h6, h7, 256 ); + mm256_dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 ); + mm256_intrlv_4x64( vhashA, h0, h1, h2, h3, 256 ); + mm256_intrlv_4x64( vhashB, h4, h5, h6, h7, 256 ); sha512_4way_init( &ctx_sha512 ); sha512_4way( &ctx_sha512, vhashA, 32 ); @@ -53,9 +53,9 @@ void lbry_8way_hash( void* output, const void* input ) sha512_4way_close( &ctx_sha512, vhashB ); // back to 8-way 32 bit - mm256_deinterleave_4x64( h0, h1, h2, h3, vhashA, 512 ); - mm256_deinterleave_4x64( h4, h5, h6, h7, vhashB, 512 ); - mm256_interleave_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 ); + mm256_dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 ); + mm256_dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 ); + mm256_intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 ); ripemd160_8way_init( &ctx_ripemd ); ripemd160_8way( &ctx_ripemd, vhashA, 32 ); @@ -72,27 +72,24 @@ void lbry_8way_hash( void* output, const void* input ) sha256_8way_init( &ctx_sha256 ); sha256_8way( &ctx_sha256, vhashA, 32 ); - sha256_8way_close( &ctx_sha256, vhashA ); - - mm256_deinterleave_8x32( output, output+ 32, output+ 64, output+ 96, - output+128, output+160, output+192, output+224, - vhashA, 256 ); + sha256_8way_close( &ctx_sha256, output ); } int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*8] __attribute__ ((aligned (64))); uint32_t vdata[32*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[7<<3]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[27]; const uint32_t first_nonce = pdata[27]; const uint32_t Htarg = ptarget[7]; uint32_t edata[32] __attribute__ ((aligned (64))); - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 216; // 27*8 + __m256i *noncev = (__m256i*)vdata + 27; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; @@ -100,9 +97,12 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce, 0xFFFFF000, 0xFFFF0000, 0 }; // we need bigendian data... 
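[Editor's illustration] The scanhash changes in this region replace scalar swab32_array() loops with vector byte swaps through casti_m256i()/mm256_bswap_32(), as in the lbry lines just below (128 bytes, four 256-bit chunks) and the qubit/deep versions above (80 bytes, two 256-bit chunks plus a 128-bit tail). Those helpers are project macros; this is a self-contained sketch of the 80-byte case using only standard AVX2/SSSE3 intrinsics, with bswap80 an illustrative name. _mm256_shuffle_epi8 permutes within each 128-bit lane, so one 16-byte pattern is repeated per lane.

   #include <stdint.h>
   #include <immintrin.h>

   // Byte-swap each 32-bit word of an 80-byte block header.
   static void bswap80( uint32_t *dst, const uint32_t *src )
   {
      const __m256i bswap32 = _mm256_set_epi8(
         12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3,
         12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3 );

      __m256i a = _mm256_loadu_si256( (const __m256i*)src );
      __m256i b = _mm256_loadu_si256( (const __m256i*)(src + 8) );
      __m128i c = _mm_loadu_si128( (const __m128i*)(src + 16) );

      _mm256_storeu_si256( (__m256i*)dst,
                           _mm256_shuffle_epi8( a, bswap32 ) );
      _mm256_storeu_si256( (__m256i*)(dst + 8),
                           _mm256_shuffle_epi8( b, bswap32 ) );
      _mm_storeu_si128( (__m128i*)(dst + 16),
              _mm_shuffle_epi8( c, _mm256_castsi256_si128( bswap32 ) ) );
   }

   // The miner's versions use aligned casti_m256i accesses; unaligned
   // loads are used here only to keep the sketch self-contained.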
- swab32_array( edata, pdata, 32 ); - mm256_interleave_8x32( vdata, edata, edata, edata, edata, - edata, edata, edata, edata, 1024 ); + casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); + casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); + casti_m256i( edata, 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) ); + casti_m256i( edata, 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) ); + mm256_intrlv_8x32( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 1024 ); sha256_8way_init( &sha256_8w_mid ); sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE ); @@ -111,136 +111,26 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce, uint32_t mask = masks[m]; do { - be32enc( noncep, n ); - be32enc( noncep+1, n+1 ); - be32enc( noncep+2, n+2 ); - be32enc( noncep+3, n+3 ); - be32enc( noncep+4, n+4 ); - be32enc( noncep+5, n+5 ); - be32enc( noncep+6, n+6 ); - be32enc( noncep+7, n+7 ); - + *noncev = mm256_bswap_32( _mm256_set_epi32( + n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) ); lbry_8way_hash( hash, vdata ); - for ( int i = 0; i < 8; i++ ) - if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) ) + for ( int i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) ) { - pdata[27] = n+i; - nonces[ num_found++ ] = n+i; - work_set_target_ratio( work, hash+(i<<3) ); + // deinterleave hash for lane + mm256_extract_lane_8x32( lane_hash, hash, i, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[27] = n + i; + submit_solution( work, lane_hash, mythr, i ); + } } - n+=8; - } while ( ( num_found == 0 ) && ( n < max_nonce ) - && !work_restart[thr_id].restart ); + n += 8; + } while ( (n < max_nonce-10) && !work_restart[thr_id].restart ); break; } - - *hashes_done = n - first_nonce; - return num_found; -} - -#elif defined(LBRY_4WAY) - -static __thread sha256_4way_context sha256_mid; - -void lbry_4way_hash( void* output, const void* input ) -{ - sha256_4way_context ctx_sha256 __attribute__ ((aligned (64))); - sha512_4way_context ctx_sha512; - ripemd160_4way_context ctx_ripemd; - uint32_t _ALIGN(64) vhashA[16<<2]; - uint32_t _ALIGN(64) vhashB[16<<2]; - uint32_t _ALIGN(64) vhashC[16<<2]; - - memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) ); - sha256_4way( &ctx_sha256, input + (LBRY_MIDSTATE<<2), LBRY_TAIL ); - sha256_4way_close( &ctx_sha256, vhashA ); - - sha256_4way_init( &ctx_sha256 ); - sha256_4way( &ctx_sha256, vhashA, 32 ); - sha256_4way_close( &ctx_sha256, vhashA ); - - // sha512 64 bit data, 64 byte output - mm256_reinterleave_4x64( vhashB, vhashA, 256 ); - sha512_4way_init( &ctx_sha512 ); - sha512_4way( &ctx_sha512, vhashB, 32 ); - sha512_4way_close( &ctx_sha512, vhashB ); - mm256_reinterleave_4x32( vhashA, vhashB, 512 ); - - ripemd160_4way_init( &ctx_ripemd ); - ripemd160_4way( &ctx_ripemd, vhashA, 32 ); - ripemd160_4way_close( &ctx_ripemd, vhashB ); - - ripemd160_4way_init( &ctx_ripemd ); - ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 ); - ripemd160_4way_close( &ctx_ripemd, vhashC ); - - sha256_4way_init( &ctx_sha256 ); - sha256_4way( &ctx_sha256, vhashB, 20 ); - sha256_4way( &ctx_sha256, vhashC, 20 ); - sha256_4way_close( &ctx_sha256, vhashA ); - - sha256_4way_init( &ctx_sha256 ); - sha256_4way( &ctx_sha256, vhashA, 32 ); - sha256_4way_close( &ctx_sha256, vhashA ); - - mm128_deinterleave_4x32( output, output+32, output+64, output+96, - vhashA, 256 ); -} - -int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done) -{ - uint32_t hash[4*8] __attribute__ ((aligned (64))); - 
uint32_t vdata[32*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[27]; - const uint32_t first_nonce = pdata[27]; - const uint32_t Htarg = ptarget[7]; - uint32_t edata[32] __attribute__ ((aligned (64))); - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 108; // 27*4 - - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - // we need bigendian data... - swab32_array( edata, pdata, 32 ); - mm128_interleave_4x32( vdata, edata, edata, edata, edata, 1024 ); - sha256_4way_init( &sha256_mid ); - sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE ); - - for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - be32enc( noncep, n ); - be32enc( noncep+1, n+1 ); - be32enc( noncep+2, n+2 ); - be32enc( noncep+3, n+3 ); - - lbry_4way_hash( hash, vdata ); - - for ( int i = 0; i < 4; i++ ) - if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) ) - { - pdata[27] = n+i; - nonces[ num_found++ ] = n+i; - work_set_target_ratio( work, hash+(i<<3) ); - } - n+=4; - } while ( ( num_found == 0 ) && ( n < max_nonce ) - && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce; - return num_found; + *hashes_done = n - first_nonce + 1; + return 0; } #endif diff --git a/algo/ripemd/lbry-gate.h b/algo/ripemd/lbry-gate.h index 078ee81..cdf2f19 100644 --- a/algo/ripemd/lbry-gate.h +++ b/algo/ripemd/lbry-gate.h @@ -4,12 +4,10 @@ #include "algo-gate-api.h" #include -// Overide multi way on ryzen, SHA is better. -#if !defined(RYZEN_) -// need sha512 2 way AVX x2 or 1 way scalar x4 to support 4way AVX. 
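[Editor's illustration] The retained 8-way scanhash above keeps lbry's two-stage share test: a one-word mask check against hash7[], then a full lane extraction only for survivors. In the 8x32 layout, 32-bit element e of lane l sits at hash[ e*8 + l ], which is why hash7 = &hash[7<<3] indexes the eight high words directly. A scalar sketch of both pieces under that layout; the function names are illustrative, not the project's implementations.

   #include <stdint.h>

   // Copy one lane's 8-word (256-bit) hash out of the 8x32
   // interleaved buffer.
   static void extract_lane_8x32( uint32_t *lane_hash,
                                  const uint32_t *v, int lane )
   {
      for ( int e = 0; e < 8; e++ )
         lane_hash[e] = v[ (e << 3) + lane ];
   }

   // Coarse pre-test mask: if ptarget[7] <= htmax[m], any valid hash
   // must have every bit of masks[m] clear in its high word, so one
   // AND rejects a lane with no deinterleaving.
   static uint32_t select_mask( uint32_t htarg )
   {
      static const uint64_t htmax[] = { 0, 0xF, 0xFF,
                                        0xFFF, 0xFFFF, 0x10000000 };
      static const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0,
                                        0xFFFFFF00, 0xFFFFF000,
                                        0xFFFF0000, 0 };
      for ( int m = 0; m < 6; m++ )
         if ( htarg <= htmax[m] ) return masks[m];
      return 0;   // very easy target: fall back to no pre-filter
   }

   /* usage pattern mirroring the loop above:
      uint32_t mask = select_mask( ptarget[7] );
      const uint32_t *hash7 = &hash[ 7 << 3 ];
      for ( int i = 0; i < 8; i++ )
         if ( !( hash7[i] & mask ) )             // cheap reject
         {
            extract_lane_8x32( lane_hash, hash, i );
            // fulltest() runs only for the rare candidate
         }                                                        */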
-#if defined(__AVX2__) +#if !defined(__SHA__) + #if defined(__AVX2__) #define LBRY_8WAY -#endif + #endif #endif #define LBRY_NTIME_INDEX 25 @@ -24,17 +22,18 @@ bool register_lbry_algo( algo_gate_t* gate ); void lbry_8way_hash( void *state, const void *input ); int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); +/* #elif defined(LBRY_4WAY) void lbry_4way_hash( void *state, const void *input ); int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done ); +*/ #else void lbry_hash( void *state, const void *input ); int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); #endif #endif diff --git a/algo/ripemd/lbry.c b/algo/ripemd/lbry.c index ef4e242..b453c73 100644 --- a/algo/ripemd/lbry.c +++ b/algo/ripemd/lbry.c @@ -48,13 +48,14 @@ void lbry_hash(void* output, const void* input) } int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done) + uint64_t *hashes_done, struct thr_info *mythr) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[27] - 1; const uint32_t first_nonce = pdata[27]; const uint32_t Htarg = ptarget[7]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated uint32_t hash64[8] __attribute__((aligned(64))); uint32_t endiandata[32] __attribute__ ((aligned (64))); diff --git a/algo/scrypt.c b/algo/scrypt.c index d670382..5d570d8 100644 --- a/algo/scrypt.c +++ b/algo/scrypt.c @@ -696,7 +696,7 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input, #endif /* HAVE_SCRYPT_6WAY */ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -704,6 +704,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce, uint32_t midstate[8]; uint32_t n = pdata[19] - 1; const uint32_t Htarg = ptarget[7]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated int throughput = scrypt_best_throughput(); int i; diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 4d7c8c1..9b85eb1 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -5,13 +5,12 @@ #include "algo-gate-api.h" // Override multi way on ryzen, SHA is better. 
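[Editor's illustration] Here, and in sha256t-gate.h below, the removed RYZEN_ compile flag gives way to the compiler's own __SHA__ feature macro (gcc defines it under -march=znver1 or -msha), so the SHA-over-AVX2 override now happens automatically at build time. The pattern, plus an illustrative reduction of the gate wiring: register_lbry_algo's body is not part of this hunk, so the function below is a sketch assembled from the declarations in the header above.

   // Prefer the CPU's SHA extensions over 8-way AVX2 when both exist.
   #if !defined(__SHA__)
     #if defined(__AVX2__)
       #define LBRY_8WAY
     #endif
   #endif

   /* plausible register-time wiring, reduced to the essentials:
   bool register_lbry_algo( algo_gate_t *gate )
   {
   #if defined(LBRY_8WAY)
      gate->scanhash = (void*)&scanhash_lbry_8way;
      gate->hash     = (void*)&lbry_8way_hash;
   #else
      gate->scanhash = (void*)&scanhash_lbry;  // scalar path; benefits
      gate->hash     = (void*)&lbry_hash;      // from SHA-NI sha256
   #endif
      return true;
   }                                                               */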
-#if !defined(RYZEN_) -#if defined(__SSE2__) - #define SHA256T_4WAY -#endif -#if defined(__AVX2__) +#if !defined(__SHA__) + #if defined(__AVX2__) #define SHA256T_8WAY -#endif +#elif defined(__SSE2__) + #define SHA256T_4WAY + #endif #endif bool register_sha256t_algo( algo_gate_t* gate ); diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index a0ed8d7..eb42f49 100644 --- a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -6,6 +6,12 @@ #if defined (__AVX2__) +union _m256_v16 { + uint16_t u16[16]; + __m256i v256; +}; +typedef union _m256_v16 m256_v16; + // imported from simd_iv.h uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index 85ede1d..e4534f2 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -2,7 +2,11 @@ #include #include #include "skein-hash-4way.h" -#include "algo/sha/sha2-hash-4way.h" +#if defined(__SHA__) + #include +#else + #include "algo/sha/sha2-hash-4way.h" +#endif #if defined (SKEIN_4WAY) @@ -11,53 +15,69 @@ void skeinhash_4way( void *state, const void *input ) uint64_t vhash64[8*4] __attribute__ ((aligned (64))); uint32_t vhash32[16*4] __attribute__ ((aligned (64))); skein512_4way_context ctx_skein; +#if defined(__SHA__) + uint32_t hash0[16] __attribute__ ((aligned (64))); + uint32_t hash1[16] __attribute__ ((aligned (64))); + uint32_t hash2[16] __attribute__ ((aligned (64))); + uint32_t hash3[16] __attribute__ ((aligned (64))); + SHA256_CTX ctx_sha256; +#else sha256_4way_context ctx_sha256; +#endif skein512_4way_init( &ctx_skein ); skein512_4way( &ctx_skein, input, 80 ); skein512_4way_close( &ctx_skein, vhash64 ); +#if defined(__SHA__) + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 ); + SHA256_Final( (unsigned char*)hash0, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 ); + SHA256_Final( (unsigned char*)hash1, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 ); + SHA256_Final( (unsigned char*)hash2, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 ); + SHA256_Final( (unsigned char*)hash3, &ctx_sha256 ); + + mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 ); +#else mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 ); sha256_4way_init( &ctx_sha256 ); sha256_4way( &ctx_sha256, vhash32, 64 ); sha256_4way_close( &ctx_sha256, state ); - - mm128_dintrlv_4x32( state, state+32, state+64, state+96, - vhash32, 256 ); +#endif } int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t hash[8*4] __attribute__ ((aligned (64))); uint32_t lane_hash[8]; uint32_t *hash7 = &(hash[7<<2]); - uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - // hash is returned deinterleaved - uint32_t *nonces = work->nonces; - int num_found = 0; - -// data is 80 bytes, 20 u32 or 4 u64. 
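[Editor's illustration] The scanhash loops below (skein, skein2, and earlier hmq1725, quark, polytimos) refresh nonces with *noncev = mm256_intrlv_blend_32( mm256_bswap_32( _mm256_set_epi32( n+3, 0, ... ) ), *noncev ) instead of four be32enc stores. The helper's definition is not in this hunk; the sketch below is one plausible reading inferred from usage and may not match the actual implementation. Since vdata is 4x64-interleaved, __m256i slot 9 holds { word 18, nonce } for all four lanes, the set_epi32 places the new nonces in the odd 32-bit elements, and the blend preserves word 18 in the even ones.

   #include <stdint.h>
   #include <immintrin.h>

   // Assumed semantics of mm256_intrlv_blend_32( a, b ): odd 32-bit
   // elements from a, even ones from b. With AVX2 this is a single
   // immediate blend; bit i of 0xAA selects element i from the
   // second operand.
   static inline __m256i intrlv_blend_32( __m256i a, __m256i b )
   {
      return _mm256_blend_epi32( b, a, 0xAA );
   }

   /* in the loop: _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 )
      puts the four nonces in elements 1,3,5,7; a per-32-bit byte swap
      makes them big-endian; the blend leaves each lane's header
      word 18 untouched, so only the nonces are rewritten.        */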
- - swab32_array( edata, pdata, 20 ); - - mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); - - uint32_t *noncep = vdata + 73; // 9*8 + 1 + __m256i *noncev = (__m256i*)vdata + 9; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + mm256_bswap_intrlv80_4x64( vdata, pdata ); do { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); skeinhash_4way( hash, vdata ); @@ -68,16 +88,14 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce, if ( fulltest( lane_hash, ptarget ) ) { pdata[19] = n + lane; - nonces[ num_found++ ] = n + lane; - work_set_target_ratio( work, lane_hash ); + submit_solution( work, lane_hash, mythr, lane ); } } n += 4; - } while ( (num_found == 0) && (n < max_nonce) - && !work_restart[thr_id].restart ); + } while ( (n < max_nonce) && !work_restart[thr_id].restart ); *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/skein/skein-gate.h b/algo/skein/skein-gate.h index 3fee094..bfa169d 100644 --- a/algo/skein/skein-gate.h +++ b/algo/skein/skein-gate.h @@ -3,24 +3,21 @@ #include #include "algo-gate-api.h" -// Override multi way on ryzen, SHA is better. -#if !defined(RYZEN_) #if defined(__AVX2__) #define SKEIN_4WAY #endif -#endif #if defined(SKEIN_4WAY) void skeinhash_4way( void *output, const void *input ); int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); #endif void skeinhash( void *output, const void *input ); int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); #endif diff --git a/algo/skein/skein.c b/algo/skein/skein.c index 90d3863..b6b070f 100644 --- a/algo/skein/skein.c +++ b/algo/skein/skein.c @@ -21,8 +21,8 @@ void skeinhash(void *state, const void *input) memcpy(state, hash, 32); } -int scanhash_skein(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -31,7 +31,8 @@ int scanhash_skein(int thr_id, struct work *work, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + swab32_array( endiandata, pdata, 20 ); do { diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c index 3758d13..fef6813 100644 --- a/algo/skein/skein2-4way.c +++ b/algo/skein/skein2-4way.c @@ -20,55 +20,43 @@ void skein2hash_4way( void *output, const void *input ) } int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*4] __attribute__ ((aligned (64))); uint32_t *hash7 = &(hash[25]); uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__ ((aligned (64))); - uint64_t *edata = (uint64_t*)endiandata; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - uint32_t *nonces = work->nonces; - int num_found = 0; - - swab32_array( endiandata, pdata, 20 ); - - 
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 ); - - uint32_t *noncep = vdata + 73; // 9*8 + 1 + __m256i *noncev = (__m256i*)vdata + 9; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + mm256_bswap_intrlv80_4x64( vdata, pdata ); do { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - skein2hash( hash, vdata ); + skein2hash_4way( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane ] <= Htarg ) + if ( hash7[ lane<<1 ] <= Htarg ) { - // deinterleave hash for lane uint32_t lane_hash[8]; mm256_extract_lane_4x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) ) + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) { pdata[19] = n + lane; - nonces[ num_found++ ] = n + lane; - work_set_target_ratio( work, lane_hash ); + submit_solution( work, lane_hash, mythr, lane ); } } n += 4; - } while ( (num_found == 0) && (n < max_nonce) - && !work_restart[thr_id].restart ); + } while ( (n < max_nonce) && !work_restart[thr_id].restart ); *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/skein/skein2-gate.c b/algo/skein/skein2-gate.c index efc9f41..34483b2 100644 --- a/algo/skein/skein2-gate.c +++ b/algo/skein/skein2-gate.c @@ -10,10 +10,9 @@ int64_t skein2_get_max64 () bool register_skein2_algo( algo_gate_t* gate ) { gate->optimizations = AVX2_OPT; -#if defined (FOUR_WAY) && defined (__AVX2__) +#if defined (SKEIN2_4WAY) gate->scanhash = (void*)&scanhash_skein2_4way; gate->hash = (void*)&skein2hash_4way; - four_way_not_tested(); #else gate->scanhash = (void*)&scanhash_skein2; gate->hash = (void*)&skein2hash; diff --git a/algo/skein/skein2-gate.h b/algo/skein/skein2-gate.h index 6dcabe3..3e64936 100644 --- a/algo/skein/skein2-gate.h +++ b/algo/skein/skein2-gate.h @@ -10,11 +10,11 @@ #if defined(SKEIN2_4WAY) void skein2hash_4way( void *output, const void *input ); int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t* hashes_done ); + uint64_t* hashes_done, struct thr_info *mythr ); #endif void skein2hash( void *output, const void *input ); int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); #endif diff --git a/algo/skein/skein2.c b/algo/skein/skein2.c index 0aff9cb..60b32cf 100644 --- a/algo/skein/skein2.c +++ b/algo/skein/skein2.c @@ -34,8 +34,8 @@ void skein2hash(void *output, const void *input) } -int scanhash_skein2(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -44,6 +44,7 @@ int scanhash_skein2(int thr_id, struct work *work, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated swab32_array( endiandata, pdata, 20 ); diff --git a/algo/x14/axiom.c b/algo/x14/axiom.c index f2bcec3..b5d2ee7 100644 --- a/algo/x14/axiom.c +++ b/algo/x14/axiom.c @@ -44,12 +44,13 @@ void axiomhash(void *output, const void *input) } int scanhash_axiom(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) + uint32_t max_nonce, uint64_t *hashes_done, struct 
thr_info *mythr ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t _ALIGN(64) hash64[8]; uint32_t _ALIGN(64) endiandata[20]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; diff --git a/algo/x14/polytimos-4way.c b/algo/x14/polytimos-4way.c index 5262418..afb2972 100644 --- a/algo/x14/polytimos-4way.c +++ b/algo/x14/polytimos-4way.c @@ -13,26 +13,16 @@ #include "algo/gost/sph_gost.h" #include "algo/echo/aes_ni/hash_api.h" -typedef struct { +union _poly_4way_context_overlay +{ skein512_4way_context skein; shabal512_4way_context shabal; hashState_echo echo; luffa_2way_context luffa; sph_fugue512_context fugue; sph_gost512_context gost; -} poly_4way_ctx_holder; - -poly_4way_ctx_holder poly_4way_ctx; - -void init_polytimos_4way_ctx() -{ - skein512_4way_init( &poly_4way_ctx.skein ); - shabal512_4way_init( &poly_4way_ctx.shabal ); - init_echo( &poly_4way_ctx.echo, 512 ); - luffa_2way_init( &poly_4way_ctx.luffa, 512 ); - sph_fugue512_init( &poly_4way_ctx.fugue ); - sph_gost512_init( &poly_4way_ctx.gost ); -} +}; +typedef union _poly_4way_context_overlay poly_4way_context_overlay; void polytimos_4way_hash( void *output, const void *input ) { @@ -41,51 +31,57 @@ void polytimos_4way_hash( void *output, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); - poly_4way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &poly_4way_ctx, sizeof(poly_4way_ctx) ); + poly_4way_context_overlay ctx; + skein512_4way_init( &ctx.skein ); skein512_4way( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); // Need to convert from 64 bit interleaved to 32 bit interleaved. 
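[Editor's illustration] The comment just above flags the layout hop continued below: Skein runs 4-way over 64-bit elements while Shabal runs 4-way over 32-bit elements, and mm256_rintrlv_4x64_4x32 re-interleaves directly rather than bouncing through per-lane buffers. A scalar reference of what the conversion must produce, assuming little-endian packing of consecutive u32 pairs inside each 64-bit element of the 4x64 form.

   #include <stdint.h>

   // Convert a 4-lane buffer from 64-bit to 32-bit interleaving.
   // 4x64 form: u32 pair (2e, 2e+1) of lane l packed at u64 slot
   // e*4 + l. 4x32 form: u32 word w of lane l at slot w*4 + l.
   // bit_len is the per-lane length in bits.
   static void rintrlv_4x64_4x32( uint32_t *dst, const uint64_t *src,
                                  int bit_len )
   {
      for ( int w = 0; w < bit_len / 32; w++ )
         for ( int l = 0; l < 4; l++ )
         {
            uint64_t v = src[ (w >> 1) * 4 + l ];
            dst[ w * 4 + l ] = ( w & 1 ) ? (uint32_t)( v >> 32 )
                                         : (uint32_t)v;
         }
   }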
uint32_t vhash32[16*4]; mm256_rintrlv_4x64_4x32( vhash32, vhash, 512 ); + shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhash32, 64 ); shabal512_4way_close( &ctx.shabal, vhash32 ); mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 ); + init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence *)hash0, (const BitSequence *)hash0, 512 ); - memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash2, (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); mm256_intrlv_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); mm256_dintrlv_2x128( hash0, hash1, vhash, 512 ); mm256_intrlv_2x128( vhash, hash2, hash3, 512 ); luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); mm256_dintrlv_2x128( hash2, hash3, vhash, 512 ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash1, 64 ); sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash2, 64 ); sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash3, 64 ); sph_fugue512_close( &ctx.fugue, hash3 ); + sph_gost512_init( &ctx.gost ); sph_gost512( &ctx.gost, hash0, 64 ); sph_gost512_close( &ctx.gost, hash0 ); sph_gost512_init( &ctx.gost ); @@ -104,51 +100,43 @@ void polytimos_4way_hash( void *output, const void *input ) memcpy( output+96, hash3, 32 ); } -int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done ) +int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*8] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 73; // 9*8 + 1 + __m256i *noncev = (__m256i*)vdata + 9; // aligned const uint32_t Htarg = ptarget[7]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated volatile uint8_t *restart = &(work_restart[thr_id].restart); if ( opt_benchmark ) ptarget[7] = 0x0cff; - for ( int i=0; i < 19; i++ ) - be32enc( &endiandata[i], pdata[i] ); - - uint64_t *edata = (uint64_t*)endiandata; - mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + mm256_bswap_intrlv80_4x64( vdata, pdata ); do { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); + *noncev = 
mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); polytimos_4way_hash(hash, vdata); pdata[19] = n; - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) ) + for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) + if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; - nonces[ num_found++ ] = n+i; - work_set_target_ratio( work, hash+(i<<3) ); + submit_solution( work, hash+(i<<3), mythr, i ); } n += 4; - } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart)); + } while ( ( n < max_nonce-4 ) && !(*restart)); *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/x14/polytimos-gate.c b/algo/x14/polytimos-gate.c index 7e14e6f..aa54060 100644 --- a/algo/x14/polytimos-gate.c +++ b/algo/x14/polytimos-gate.c @@ -4,7 +4,6 @@ bool register_polytimos_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; #ifdef POLYTIMOS_4WAY - init_polytimos_4way_ctx(); gate->scanhash = (void*)&scanhash_polytimos_4way; gate->hash = (void*)&polytimos_4way_hash; #else diff --git a/algo/x14/polytimos-gate.h b/algo/x14/polytimos-gate.h index 11b4297..85fbc64 100644 --- a/algo/x14/polytimos-gate.h +++ b/algo/x14/polytimos-gate.h @@ -13,19 +13,14 @@ bool register_polytimos_algo( algo_gate_t* gate ); #if defined(POLYTIMOS_4WAY) void polytimos_4way_hash( void *state, const void *input ); - int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - -void init_polytimos_4way_ctx(); + uint64_t *hashes_done, struct thr_info *mythr ); #endif void polytimos_hash( void *state, const void *input ); - int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_polytimos_ctx(); #endif diff --git a/algo/x14/polytimos.c b/algo/x14/polytimos.c index 8bd4f9e..ab83499 100644 --- a/algo/x14/polytimos.c +++ b/algo/x14/polytimos.c @@ -76,13 +76,14 @@ void polytimos_hash(void *output, const void *input) memcpy(output, hashA, 32); } -int scanhash_polytimos(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(128) hash[8]; uint32_t _ALIGN(128) endiandata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; diff --git a/algo/x14/veltor-4way.c b/algo/x14/veltor-4way.c index 519ede3..a44c2d3 100644 --- a/algo/x14/veltor-4way.c +++ b/algo/x14/veltor-4way.c @@ -40,7 +40,7 @@ void veltor_4way_hash( void *output, const void *input ) skein512_4way( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input ) sph_shavite512( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); shabal512_4way( &ctx.shabal, vhash, 
64 ); shabal512_4way_close( &ctx.shabal, vhash ); - mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); sph_gost512( &ctx.gost, hash0, 64 ); sph_gost512_close( &ctx.gost, hash0 ); @@ -78,7 +78,7 @@ void veltor_4way_hash( void *output, const void *input ) } int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*8] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); @@ -91,6 +91,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce, uint32_t *nonces = work->nonces; int num_found = 0; uint32_t *noncep = vdata + 73; // 9*8 + 1 + /* int */ thr_id = mythr->id; // thr_id arg is deprecated volatile uint8_t *restart = &(work_restart[thr_id].restart); if ( opt_benchmark ) @@ -101,7 +102,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce, } uint64_t *edata = (uint64_t*)endiandata; - mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); do { be32enc( noncep, n ); diff --git a/algo/x14/veltor-gate.h b/algo/x14/veltor-gate.h index 3f56d3e..d10781f 100644 --- a/algo/x14/veltor-gate.h +++ b/algo/x14/veltor-gate.h @@ -15,7 +15,7 @@ bool register_veltor_algo( algo_gate_t* gate ); void veltor_4way_hash( void *state, const void *input ); int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); void init_veltor_4way_ctx(); @@ -24,7 +24,7 @@ void init_veltor_4way_ctx(); void veltor_hash( void *state, const void *input ); int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); void init_veltor_ctx(); diff --git a/algo/x14/veltor.c b/algo/x14/veltor.c index fb529c3..c5fa3d6 100644 --- a/algo/x14/veltor.c +++ b/algo/x14/veltor.c @@ -61,12 +61,14 @@ void veltor_hash(void *output, const void *input) memcpy(output, hashB, 32); } -int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(128) hash[8]; uint32_t _ALIGN(128) endiandata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c index 064c426..461ddd0 100644 --- a/algo/x14/x14-4way.c +++ b/algo/x14/x14-4way.c @@ -78,7 +78,7 @@ void x14_4way_hash( void *state, const void *input ) bmw512_4way_close( &ctx.bmw, vhash ); // Serial - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 3 Groestl update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -90,7 +90,7 @@ void x14_4way_hash( void *state, const void *input ) update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); // Parallel 4way - mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein skein512_4way( &ctx.skein, vhash, 64 ); @@ -105,16 +105,16 @@ void x14_4way_hash( 
void *state, const void *input ) keccak512_4way_close( &ctx.keccak, vhash ); // Serial - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_intrlv_2x128( vhash, hash0, hash1, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); - mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + mm256_dintrlv_2x128( hash0, hash1, vhash, 512 ); + mm256_intrlv_2x128( vhash, hash2, hash3, 512 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); + mm256_dintrlv_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -142,13 +142,13 @@ void x14_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_intrlv_2x128( vhash, hash0, hash1, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); - mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + mm256_dintrlv_2x128( hash0, hash1, vhash, 512 ); + mm256_intrlv_2x128( vhash, hash2, hash3, 512 ); simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); + mm256_dintrlv_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -164,10 +164,10 @@ void x14_4way_hash( void *state, const void *input ) (const BitSequence *) hash3, 512 ); // 12 Hamsi parallel 4way 32 bit - mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 13 Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); @@ -183,14 +183,14 @@ void x14_4way_hash( void *state, const void *input ) sph_fugue512_close( &ctx.fugue, hash3 ); // 14 Shabal, parallel 32 bit - mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); shabal512_4way( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, state ); } int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); @@ -203,6 +203,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce, int num_found = 0; uint32_t *noncep = vdata + 73; // 9*8 + 1 const uint32_t Htarg = ptarget[7]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, @@ -212,7 +213,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce, swab32_array( endiandata, pdata, 20 ); uint64_t *edata = (uint64_t*)endiandata; - mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); for ( int m=0; m < 6; m++ ) if ( 
Htarg <= htmax[m] ) diff --git a/algo/x14/x14-gate.h b/algo/x14/x14-gate.h index 1b3770e..18b0016 100644 --- a/algo/x14/x14-gate.h +++ b/algo/x14/x14-gate.h @@ -13,19 +13,15 @@ bool register_x14_algo( algo_gate_t* gate ); #if defined(X14_4WAY) void x14_4way_hash( void *state, const void *input ); - int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_x14_4way_ctx(); #endif void x14hash( void *state, const void *input ); - int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_x14_ctx(); #endif diff --git a/algo/x14/x14.c b/algo/x14/x14.c index 8e6c70d..effd8fd 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -180,8 +180,8 @@ void x14hash(void *output, const void *input) memcpy(output, hash, 32); } -int scanhash_x14(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t hash64[8] __attribute__((aligned(64))); @@ -190,6 +190,7 @@ int scanhash_x14(int thr_id, struct work *work, uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated uint64_t htmax[] = { 0, diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index 5579f18..5635acf 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -81,7 +81,7 @@ void x15_4way_hash( void *state, const void *input ) bmw512_4way_close( &ctx.bmw, vhash ); // Serial - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 3 Groestl update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -93,7 +93,7 @@ void x15_4way_hash( void *state, const void *input ) update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); // Parallel 4way - mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein skein512_4way( &ctx.skein, vhash, 64 ); @@ -108,16 +108,16 @@ void x15_4way_hash( void *state, const void *input ) keccak512_4way_close( &ctx.keccak, vhash ); // Serial to the end - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_intrlv_2x128( vhash, hash0, hash1, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); - mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + mm256_dintrlv_2x128( hash0, hash1, vhash, 512 ); + mm256_intrlv_2x128( vhash, hash2, hash3, 512 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); + mm256_dintrlv_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -145,13 +145,13 @@ void x15_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_intrlv_2x128( vhash, hash0, hash1, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); 
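[Editor's illustration] Luffa and Simd in these x14/x15 kernels are 2-way over 128-bit elements, so the four lanes are processed as two pairs through mm256_intrlv_2x128 / mm256_dintrlv_2x128, as in the Simd step above and below. A scalar model of that layout; the names mirror the project's helpers, which are vectorized.

   #include <stdint.h>

   // 2x128 interleaving: 128-bit chunk c of lane l ( l = 0 or 1 ) is
   // stored at chunk index c*2 + l; shown here on u64 pairs.
   static void intrlv_2x128( uint64_t *dst, const uint64_t *s0,
                             const uint64_t *s1, int bit_len )
   {
      for ( int c = 0; c < bit_len / 128; c++ )
      {
         dst[ c*4     ] = s0[ c*2     ];   // lane 0, low u64
         dst[ c*4 + 1 ] = s0[ c*2 + 1 ];   // lane 0, high u64
         dst[ c*4 + 2 ] = s1[ c*2     ];   // lane 1, low u64
         dst[ c*4 + 3 ] = s1[ c*2 + 1 ];   // lane 1, high u64
      }
   }

   /* usage mirroring x15_4way_hash: interleave lanes 0/1, run the
      2-way kernel, deinterleave, then repeat with lanes 2/3.      */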
- mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); - mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + mm256_dintrlv_2x128( hash0, hash1, vhash, 512 ); + mm256_intrlv_2x128( vhash, hash2, hash3, 512 ); simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); - mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); + mm256_dintrlv_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -167,10 +167,10 @@ void x15_4way_hash( void *state, const void *input ) (const BitSequence *) hash3, 512 ); // 12 Hamsi parallel 4way 32 bit - mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 13 Fugue sph_fugue512( &ctx.fugue, hash0, 64 ); @@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input ) sph_fugue512_close( &ctx.fugue, hash3 ); // 14 Shabal, parallel 32 bit - mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); shabal512_4way( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); - mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); // 15 Whirlpool sph_whirlpool( &ctx.whirlpool, hash0, 64 ); @@ -214,7 +214,7 @@ void x15_4way_hash( void *state, const void *input ) } int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*8] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); @@ -227,6 +227,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce, int num_found = 0; uint32_t *noncep = vdata + 73; // 9*8 + 1 const uint32_t Htarg = ptarget[7]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, @@ -236,7 +237,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce, swab32_array( endiandata, pdata, 20 ); uint64_t *edata = (uint64_t*)endiandata; - mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) diff --git a/algo/x15/x15-gate.h b/algo/x15/x15-gate.h index fc54df0..eefccf6 100644 --- a/algo/x15/x15-gate.h +++ b/algo/x15/x15-gate.h @@ -13,19 +13,15 @@ bool register_x15_algo( algo_gate_t* gate ); #if defined(X15_4WAY) void x15_4way_hash( void *state, const void *input ); - int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_x15_4way_ctx(); #endif void x15hash( void *state, const void *input ); - int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); - + uint64_t *hashes_done, struct thr_info *mythr ); void init_x15_ctx(); #endif diff --git a/algo/x15/x15.c b/algo/x15/x15.c index c66e135..b3a7180 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -186,8 +186,8 @@ void x15hash(void *output, const void *input) memcpy(output, hashB, 32); } 
-int scanhash_x15(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t hash64[8] __attribute__((aligned(64))); @@ -196,6 +196,7 @@ int scanhash_x15(int thr_id, struct work *work, uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated uint64_t htmax[] = { 0, diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 01306a3..b254f52 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -32,8 +32,8 @@ static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; - -typedef struct { +union _x16r_4way_context_overlay +{ blake512_4way_context blake; bmw512_4way_context bmw; hashState_echo echo; @@ -50,16 +50,8 @@ typedef struct { shabal512_4way_context shabal; sph_whirlpool_context whirlpool; sha512_4way_context sha512; -} x16r_4way_ctx_holder; - -x16r_4way_ctx_holder x16r_4way_ctx __attribute__ ((aligned (64))); - -// Cube needs one full init so fast reinits can be done in the hash loop. -void init_x16r_4way_ctx() -{ - cubehashInit( &x16r_4way_ctx.cube, 512, 16, 32 ); }; - +typedef union _x16r_4way_context_overlay x16r_4way_context_overlay; void x16r_4way_hash( void* output, const void* input ) { @@ -68,14 +60,14 @@ void x16r_4way_hash( void* output, const void* input ) uint32_t hash2[24] __attribute__ ((aligned (64))); uint32_t hash3[24] __attribute__ ((aligned (64))); uint32_t vhash[24*4] __attribute__ ((aligned (64))); - x16r_4way_ctx_holder ctx; + x16r_4way_context_overlay ctx; void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; void *in3 = (void*) hash3; int size = 80; - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 ); if ( s_ntime == UINT32_MAX ) { @@ -104,11 +96,11 @@ void x16r_4way_hash( void* output, const void* input ) blake512_4way( &ctx.blake, input, size ); else { - mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); + mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); blake512_4way( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case BMW: bmw512_4way_init( &ctx.bmw ); @@ -116,11 +108,11 @@ void x16r_4way_hash( void* output, const void* input ) bmw512_4way( &ctx.bmw, input, size ); else { - mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); + mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); bmw512_4way( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case GROESTL: init_groestl( &ctx.groestl, 64 ); @@ -142,11 +134,11 @@ void x16r_4way_hash( void* output, const void* input ) skein512_4way( &ctx.skein, input, size ); else { - mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); + mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); skein512_4way( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 
512 ); break; case JH: jh512_4way_init( &ctx.jh ); @@ -154,11 +146,11 @@ void x16r_4way_hash( void* output, const void* input ) jh512_4way( &ctx.jh, input, size ); else { - mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); + mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); jh512_4way( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case KECCAK: keccak512_4way_init( &ctx.keccak ); @@ -166,21 +158,21 @@ void x16r_4way_hash( void* output, const void* input ) keccak512_4way( &ctx.keccak, input, size ); else { - mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); + mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); keccak512_4way( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case LUFFA: - mm256_interleave_2x128( vhash, in0, in1, size<<3 ); + mm256_intrlv_2x128( vhash, in0, in1, size<<3 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); - mm256_interleave_2x128( vhash, in2, in3, size<<3 ); + mm256_dintrlv_2x128( hash0, hash1, vhash, 512 ); + mm256_intrlv_2x128( vhash, in2, in3, size<<3 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); + mm256_dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case CUBEHASH: cubehashInit( &ctx.cube, 512, 16, 32 ); @@ -194,7 +186,7 @@ void x16r_4way_hash( void* output, const void* input ) (const byte*)in2, size ); cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + (const byte*)in3, size ); break; case SHAVITE: sph_shavite512_init( &ctx.shavite ); @@ -211,14 +203,14 @@ void x16r_4way_hash( void* output, const void* input ) sph_shavite512_close( &ctx.shavite, hash3 ); break; case SIMD: - mm256_interleave_2x128( vhash, in0, in1, size<<3 ); + mm256_intrlv_2x128( vhash, in0, in1, size<<3 ); simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); - mm256_interleave_2x128( vhash, in2, in3, size<<3 ); + mm256_dintrlv_2x128( hash0, hash1, vhash, 512 ); + mm256_intrlv_2x128( vhash, in2, in3, size<<3 ); simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); + mm256_dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case ECHO: init_echo( &ctx.echo, 512 ); @@ -235,11 +227,11 @@ void x16r_4way_hash( void* output, const void* input ) (const BitSequence*)in3, size<<3 ); break; case HAMSI: - mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); + mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -256,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input ) sph_fugue512_close( &ctx.fugue, hash3 ); break; case SHABAL: - mm128_interleave_4x32( vhash, in0, 
in1, in2, in3, size<<3 ); + mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); - mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; case WHIRLPOOL: sph_whirlpool_init( &ctx.whirlpool ); @@ -277,11 +269,11 @@ void x16r_4way_hash( void* output, const void* input ) sph_whirlpool_close( &ctx.whirlpool, hash3 ); break; case SHA_512: - mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); + mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; } size = 64; @@ -304,15 +296,13 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; /* int */ thr_id = mythr->id; // thr_id arg is deprecated - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 73; // 9*8 + 1 + __m256i *noncev = (__m256i*)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); - for ( int k=0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); + casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); + casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); + casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); -// if ( s_ntime != pdata[17] ) if ( s_ntime != endiandata[17] ) { uint32_t ntime = swab32(pdata[17]); @@ -326,30 +316,27 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce, ptarget[7] = 0x0cff; uint64_t *edata = (uint64_t*)endiandata; - mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); do { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); x16r_4way_hash( hash, vdata ); pdata[19] = n; - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) ) + for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) + if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; - nonces[ num_found++ ] = n+i; - work_set_target_ratio( work, hash+(i<<3) ); + submit_solution( work, hash+(i<<3), mythr, i ); } n += 4; - } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) ); + } while ( ( n < max_nonce ) && !(*restart) ); *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 67aaf93..d1cebd2 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -35,11 +35,9 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output ) bool register_x16r_algo( algo_gate_t* gate ) { #if defined (X16R_4WAY) -// init_x16r_4way_ctx(); gate->scanhash = (void*)&scanhash_x16r_4way; gate->hash = (void*)&x16r_4way_hash; #else - init_x16r_ctx(); gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif @@ -52,11 +50,9 @@ bool register_x16r_algo( algo_gate_t* gate ) bool register_x16s_algo( algo_gate_t* gate ) 
{ #if defined (X16R_4WAY) -// init_x16r_4way_ctx(); gate->scanhash = (void*)&scanhash_x16r_4way; gate->hash = (void*)&x16r_4way_hash; #else - init_x16r_ctx(); gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 5a75c54..88ff56a 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -39,20 +39,14 @@ bool register_x16s_algo( algo_gate_t* gate ); #if defined(X16R_4WAY) void x16r_4way_hash( void *state, const void *input ); - int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void init_x16r_4way_ctx(); - #endif void x16r_hash( void *state, const void *input ); - int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void init_x16r_ctx(); - #endif diff --git a/algo/x16/x16r.c b/algo/x16/x16r.c index 493a322..4f8c077 100644 --- a/algo/x16/x16r.c +++ b/algo/x16/x16r.c @@ -33,7 +33,8 @@ static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; -typedef struct { +union _x16r_context_overlay +{ #if defined(__AES__) hashState_echo echo; hashState_groestl groestl; @@ -55,19 +56,13 @@ typedef struct { sph_shabal512_context shabal; sph_whirlpool_context whirlpool; SHA512_CTX sha512; -} x16r_ctx_holder; - -x16r_ctx_holder x16r_ctx __attribute__ ((aligned (64))); - -void init_x16r_ctx() -{ - cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); }; +typedef union _x16r_context_overlay x16r_context_overlay; void x16r_hash( void* output, const void* input ) { uint32_t _ALIGN(128) hash[16]; - x16r_ctx_holder ctx; + x16r_context_overlay ctx; void *in = (void*) input; int size = 80; @@ -126,7 +121,7 @@ void x16r_hash( void* output, const void* input ) (const BitSequence*)in, size ); break; case CUBEHASH: - memcpy( &ctx.cube, &x16r_ctx.cube, sizeof(cubehashParam) ); + cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)in, size ); break; @@ -196,13 +191,12 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce, uint32_t nonce = first_nonce; volatile uint8_t *restart = &(work_restart[thr_id].restart); - for ( int k=0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); + casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); + casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); + casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); + casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); + casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); -// This code is suspicious. s_ntime is saved after byteswapping pdata[17] -// but is tested vs unswapped pdata[17]. This should result in calling -// getAlgoString every pass, but that doesn't seem to be the case. -// It appears to be working correctly as is. 
if ( s_ntime != pdata[17] ) { uint32_t ntime = swab32(pdata[17]); diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index c1cf1b0..69d28f6 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -3,13 +3,9 @@ bool register_x17_algo( algo_gate_t* gate ) { #if defined (X17_4WAY) -printf("register x17 4way\n"); -// init_x17_4way_ctx(); gate->scanhash = (void*)&scanhash_x17_4way; gate->hash = (void*)&x17_4way_hash; #else -printf("register x17 no 4way\n"); - init_x17_ctx(); gate->scanhash = (void*)&scanhash_x17; gate->hash = (void*)&x17_hash; #endif diff --git a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h index 45faec4..a0b9b81 100644 --- a/algo/x17/x17-gate.h +++ b/algo/x17/x17-gate.h @@ -13,20 +13,14 @@ bool register_x17_algo( algo_gate_t* gate ); #if defined(X17_4WAY) void x17_4way_hash( void *state, const void *input ); - int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -//void init_x17_4way_ctx(); - #endif void x17_hash( void *state, const void *input ); - int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void init_x17_ctx(); - #endif diff --git a/algo/x17/x17.c b/algo/x17/x17.c index 591161e..228cb69 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -34,7 +34,8 @@ #include "algo/echo/sph_echo.h" #endif -typedef struct { +union _x17_context_overlay +{ #if defined(__AES__) hashState_groestl groestl; hashState_echo echo; @@ -43,7 +44,7 @@ typedef struct { sph_echo512_context echo; #endif hashState_luffa luffa; - cubehashParam cubehash; + cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; sph_hamsi512_context hamsi; @@ -52,38 +53,14 @@ typedef struct { sph_whirlpool_context whirlpool; SHA512_CTX sha512; sph_haval256_5_context haval; -} x17_ctx_holder; - -x17_ctx_holder x17_ctx __attribute__ ((aligned (64))); - -void init_x17_ctx() -{ -#if defined(__AES__) - init_groestl( &x17_ctx.groestl, 64 ); - init_echo( &x17_ctx.echo, 512 ); -#else - sph_groestl512_init(&x17_ctx.groestl ); - sph_echo512_init(&x17_ctx.echo); -#endif - init_luffa( &x17_ctx.luffa, 512 ); - cubehashInit( &x17_ctx.cubehash, 512, 16, 32 ); - sph_shavite512_init( &x17_ctx.shavite ); - init_sd( &x17_ctx.simd, 512 ); - sph_hamsi512_init( &x17_ctx.hamsi ); - sph_fugue512_init( &x17_ctx.fugue ); - sph_shabal512_init( &x17_ctx.shabal ); - sph_whirlpool_init( &x17_ctx.whirlpool ); - SHA512_Init( &x17_ctx.sha512 ); - sph_haval256_5_init(&x17_ctx.haval); }; +typedef union _x17_context_overlay x17_context_overlay; void x17_hash(void *output, const void *input) { unsigned char hash[128] __attribute__ ((aligned (64))); #define hashB hash+64 - - x17_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &x17_ctx, sizeof(x17_ctx) ); + x17_context_overlay ctx; unsigned char hashbuf[128]; size_t hashptr; @@ -115,9 +92,11 @@ void x17_hash(void *output, const void *input) //---groestl---- #if defined(__AES__) + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash, (const char*)hash, 512 ); #else + sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, hash, 64 ); sph_groestl512_close( &ctx.groestl, hash ); #endif @@ -142,50 +121,62 @@ void x17_hash(void *output, const void *input) KEC_C; //--- luffa7 + init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); // 8 Cube - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, + cubehashInit( &ctx.cube, 512, 16, 32 ); 
+ cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); // 9 Shavite + sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, hash, 64); sph_shavite512_close( &ctx.shavite, hash); // 10 Simd + init_sd( &ctx.simd, 512 ); update_final_sd( &ctx.simd, (BitSequence*)hash, (const BitSequence*)hash, 512 ); //11---echo--- #if defined(__AES__) + init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence*)hash, (const BitSequence*)hash, 512 ); #else - sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_init( &ctx.echo ); + sph_echo512( &ctx.echo, hash, 64 ); sph_echo512_close( &ctx.echo, hash ); #endif // X13 algos // 12 Hamsi + sph_hamsi512_init( &ctx.hamsi ); sph_hamsi512( &ctx.hamsi, hash, 64 ); sph_hamsi512_close( &ctx.hamsi, hash ); // 13 Fugue + sph_fugue512_init( &ctx.fugue ); sph_fugue512(&ctx.fugue, hash, 64 ); sph_fugue512_close(&ctx.fugue, hash ); // X14 Shabal + sph_shabal512_init( &ctx.shabal ); sph_shabal512(&ctx.shabal, hash, 64); sph_shabal512_close( &ctx.shabal, hash ); // X15 Whirlpool - sph_whirlpool( &ctx.whirlpool, hash, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash ); - SHA512_Update( &ctx.sha512, hash, 64 ); + SHA512_Init( &ctx.sha512 ); + SHA512_Update( &ctx.sha512, hash, 64 ); SHA512_Final( (unsigned char*)hash, &ctx.sha512 ); + sph_haval256_5_init(&ctx.haval); sph_haval256_5( &ctx.haval, (const void*)hash, 64 ); sph_haval256_5_close( &ctx.haval, output ); } @@ -234,42 +225,42 @@ int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce, #endif for ( int m = 0; m < 6; m++ ) { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do + if ( Htarg <= htmax[m] ) { - pdata[19] = ++n; - be32enc( &endiandata[19], n ); - x17_hash( hash64, endiandata ); + uint32_t mask = masks[m]; + do + { + pdata[19] = ++n; + be32enc( &endiandata[19], n ); + x17_hash( hash64, endiandata ); #ifndef DEBUG_ALGO - if ( !( hash64[7] & mask ) ) - { - if ( fulltest( hash64, ptarget ) ) - { - *hashes_done = n - first_nonce + 1; - return true; - } -// else -// applog(LOG_INFO, "Result does not validate on CPU!"); - } + if ( !( hash64[7] & mask ) ) + { + if ( fulltest( hash64, ptarget ) ) + { + *hashes_done = n - first_nonce + 1; + return true; + } +// else +// applog(LOG_INFO, "Result does not validate on CPU!"); + } #else - if ( !( n % 0x1000 ) && !thr_id ) printf("."); - if ( !( hash64[7] & mask ) ) + if ( !( n % 0x1000 ) && !thr_id ) printf("."); + if ( !( hash64[7] & mask ) ) { - printf("[%d]",thr_id); - if ( fulltest( hash64, ptarget ) ) - { - work_set_target_ratio( work, hash64 ); - *hashes_done = n - first_nonce + 1; - return true; - } - } + printf("[%d]",thr_id); + if ( fulltest( hash64, ptarget ) ) + { + work_set_target_ratio( work, hash64 ); + *hashes_done = n - first_nonce + 1; + return true; + } + } #endif - } while (n < max_nonce && !work_restart[thr_id].restart); + } while (n < max_nonce && !work_restart[thr_id].restart); // see blake.c if else to understand the loop on htmax => mask - break; - } + break; + } } *hashes_done = n - first_nonce + 1; pdata[19] = n; diff --git a/algo/yespower/yespower-platform.c b/algo/yespower/yespower-platform.c index 2b1a03f..5985791 100644 --- a/algo/yespower/yespower-platform.c +++ b/algo/yespower/yespower-platform.c @@ -41,7 +41,7 @@ static void *alloc_region(yespower_region_t *region, size_t size) #endif MAP_ANON | MAP_PRIVATE; #if defined(MAP_HUGETLB) && 
defined(HUGEPAGE_SIZE) - size_t new_size = size; + size_t new_size = size; const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { flags |= MAP_HUGETLB; @@ -55,6 +55,7 @@ static void *alloc_region(yespower_region_t *region, size_t size) base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); if (base != MAP_FAILED) { base_size = new_size; + } else if (flags & MAP_HUGETLB) { flags &= ~MAP_HUGETLB; base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); diff --git a/build-allarch.sh b/build-allarch.sh index f13966f..63e0e95 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -7,16 +7,6 @@ make distclean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=core-avx2 -msha -Wall" ./configure --with-curl -make -j 16 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-avx2-sha.exe -strip -s cpuminer -mv cpuminer cpuminer-avx2-sha - -make clean || echo clean -rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe @@ -26,7 +16,6 @@ mv cpuminer cpuminer-avx512 make clean || echo clean rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe @@ -36,7 +25,6 @@ mv cpuminer cpuminer-avx2 make clean || echo clean rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe @@ -46,7 +34,6 @@ mv cpuminer cpuminer-aes-avx make clean || echo clean rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe @@ -56,7 +43,6 @@ mv cpuminer cpuminer-aes-sse42 make clean || echo clean rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe @@ -66,7 +52,6 @@ mv cpuminer cpuminer-sse42 make clean || echo clean rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe @@ -76,7 +61,6 @@ mv cpuminer cpuminer-ssse3 make clean || echo clean rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe @@ -86,8 +70,7 @@ mv cpuminer cpuminer-sse2 make clean || echo done rm -f config.status -./autogen.sh || echo done -CFLAGS="-O3 -march=znver1 -DRYZEN_ -Wall" ./configure --with-curl +CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe mv cpuminer.exe cpuminer-zen.exe @@ -96,13 +79,8 @@ mv cpuminer cpuminer-zen make clean || echo done rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -march=native -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe -#mv cpuminer.exe cpuminer-native.exe strip -s cpuminer -#mv cpuminer cpuminer-native - -#make clean || echo done diff --git a/configure b/configure index 71842f6..6430e3b 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.3.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.4. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.9.3.1' -PACKAGE_STRING='cpuminer-opt 3.9.3.1' +PACKAGE_VERSION='3.9.4' +PACKAGE_STRING='cpuminer-opt 3.9.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.9.3.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.9.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.9.3.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.9.4:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.9.3.1 +cpuminer-opt configure 3.9.4 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.9.3.1, which was +It was created by cpuminer-opt $as_me 3.9.4, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.9.3.1' + VERSION='3.9.4' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.9.3.1, which was +This file was extended by cpuminer-opt $as_me 3.9.4, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.9.3.1 +cpuminer-opt config.status 3.9.4 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index e1e37d5..9707a8d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.9.3.1]) +AC_INIT([cpuminer-opt], [3.9.4]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 7a45973..1fe5c6c 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1851,14 +1851,14 @@ static void *miner_thread( void *userdata ) { #if AFFINITY_USES_UINT128 // Default affinity - if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 ) + if ( (opt_affinity == (uint128_t)(-1) ) && opt_n_threads > 1 ) { if ( opt_debug ) applog( LOG_DEBUG, "Binding thread %d to cpu %d.", thr_id, thr_id % num_cpus, - u128_hi64( (uint128_t)1ULL << (thr_id % num_cpus) ), - u128_lo64( (uint128_t)1ULL << (thr_id % num_cpus) ) ); - affine_to_cpu_mask( thr_id, (uint128_t)1ULL << (thr_id % num_cpus) ); + u128_hi64( (uint128_t)1 << (thr_id % num_cpus) ), + u128_lo64( (uint128_t)1 << (thr_id % num_cpus) ) ); + affine_to_cpu_mask( thr_id, (uint128_t)1 << (thr_id % num_cpus) ); } #else if ( (opt_affinity == -1LL) && opt_n_threads > 1 ) @@ -2322,7 +2322,7 @@ bool jr2_stratum_handle_response( json_t *val ) static bool stratum_handle_response( char *buf ) { - json_t *val, *id_val; + json_t *val, *id_val, *res_val; json_error_t err; bool ret = false; @@ -2332,8 +2332,10 @@ static bool stratum_handle_response( char *buf ) applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); goto out; } - json_object_get( val, "result" ); - id_val = json_object_get( val, "id" ); + res_val = json_object_get( val, "result" ); + if ( !res_val ) { /* now what? */ } + + id_val = json_object_get( val, "id" ); if ( !id_val || json_is_null(id_val) ) goto out; if ( !algo_gate.stratum_handle_response( val ) ) diff --git a/simd-utils.h b/simd-utils.h index 469a24d..5a668b2 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -80,7 +80,8 @@ // the element size. // - there is a subset of some functions for scalar data. They may have // no prefix nor vec-size, just one size, the size of the data. -// +// - Some integer functions are also defined which use a similar notation. +// // Function names follow this pattern: // // prefix_op[esize]_[vsize] // // are some examples: // // u64: unsigned 64 bit integer function -// i128: signed 128 bit integer function +// i128: signed 128 bit integer function (rarely used) // m128: 128 bit vector identifier // mm128: 128 bit vector function // @@ -137,14 +138,32 @@ // improve high level code readability without the penalty of function // overhead. // +// A major restructuring is taking place, shifting the focus from pointers +// to registers. Previously pointer casting used memory to provide transparency, +// leaving it up to the compiler to manage everything, and it does a very good +// job. The focus has shifted to register arguments for more control +// over the actual instructions, assuming the data is in a register and +// the compiler just needs to manage the registers. +// +// Rather than use pointers to provide type transparency, +// specific instructions are used to access specific data as specific types.
+// Previously pointers were cast and the compiler was left to find a way +// to get the data from wherever it happened to be to the correct registers. +// +// The utilities defined here make use of features like register aliasing +// to optimize operations. Many operations have specialized versions as +// well as more generic versions. It is preferable to use a specialized +// version whenever possible as they can take advantage of certain +// optimizations not available to the generic version. Specifically, the generic +// version usually has a second argument used in some extra calculations. +// /////////////////////////////////////////////////////// #include #include #include +#include #include -// byteswap.h doesn't exist on Windows, find alternative -//#include // Various types and overlays #include "simd-utils/simd-types.h" @@ -157,6 +176,7 @@ // 64 bit vectors #include "simd-utils/simd-mmx.h" #include "simd-utils/intrlv-mmx.h" + #if defined(__SSE2__) // 128 bit vectors diff --git a/simd-utils/intrlv-avx2.h b/simd-utils/intrlv-avx2.h index bfc384c..00c95b5 100644 --- a/simd-utils/intrlv-avx2.h +++ b/simd-utils/intrlv-avx2.h @@ -384,6 +384,7 @@ static inline void mm256_intrlv_8x32( void *d, const void *s0, // bit_len == 1024 } +// A couple of mining specific functions. // Interleave 80 bytes of 32 bit data for 8 lanes. static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s ) @@ -469,6 +470,20 @@ static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s ) mm256_bswap_intrlv_4x64_128( d+256, casti_m128i( s, 4 ) ); } +// Blend 32 byte lanes of hash from 2 sources according to control mask. +// Macro due to 256 bit value arg. +#define mm256_blend_hash_4x64( dst, a, b, mask ) \ +do { \ + dst[0] = _mm256_blendv_epi8( a[0], b[0], mask ); \ + dst[1] = _mm256_blendv_epi8( a[1], b[1], mask ); \ + dst[2] = _mm256_blendv_epi8( a[2], b[2], mask ); \ + dst[3] = _mm256_blendv_epi8( a[3], b[3], mask ); \ + dst[4] = _mm256_blendv_epi8( a[4], b[4], mask ); \ + dst[5] = _mm256_blendv_epi8( a[5], b[5], mask ); \ + dst[6] = _mm256_blendv_epi8( a[6], b[6], mask ); \ + dst[7] = _mm256_blendv_epi8( a[7], b[7], mask ); \ +} while(0) + // Deinterleave 4 buffers of 64 bit data from the source buffer. // bit_len must be 256, 512, 640 or 1024 bits. // Requires overrun padding for 640 bit len. diff --git a/simd-utils/simd-avx2.h b/simd-utils/simd-avx2.h index 81083a4..8359440 100644 --- a/simd-utils/simd-avx2.h +++ b/simd-utils/simd-avx2.h @@ -103,6 +103,29 @@ #define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a ) #define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 ) +// Extract 4 u64 from 256 bit vector.
+#define mm256_extr_4x64( a0, a1, a2, a3, src ) \ +do { \ + __m128i hi = _mm256_extracti128_si256( src, 1 ); \ + a0 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 0 ); \ + a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \ + a2 = _mm_extract_epi64( hi, 0 ); \ + a3 = _mm_extract_epi64( hi, 1 ); \ +} while(0) + +#define mm256_extr_8x32( a0, a1, a2, a3, a4, a5, a6, a7, src ) \ +do { \ + __m128i hi = _mm256_extracti128_si256( src, 1 ); \ + a0 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 0 ); \ + a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \ + a2 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 2 ); \ + a3 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 3 ); \ + a4 = _mm_extract_epi32( hi, 0 ); \ + a5 = _mm_extract_epi32( hi, 1 ); \ + a6 = _mm_extract_epi32( hi, 2 ); \ + a7 = _mm_extract_epi32( hi, 3 ); \ +} while(0) + // input __m128i, returns __m256i // To build a 256 bit vector from 2 128 bit vectors lo must be done first. // lo alone leaves hi undefined, hi alone leaves lo unchanged. @@ -111,10 +134,24 @@ #define mm256_ins_lo128_256( a, b ) _mm256_inserti128_si256( a, b, 0 ) #define mm256_ins_hi128_256( a, b ) _mm256_inserti128_si256( a, b, 1 ) -// concatenate two 128 bit vectors into one 256 bit vector +// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo } #define mm256_concat_128( hi, lo ) \ mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi ) +// Horizontal vector testing + +// Bit-wise test of entire vector, useful to test results of cmp. +// anybits0 is non-zero if any bit is 0, anybits1 if any bit is 1. +#define mm256_anybits0( a ) \ + ( ( (uint128_t)mm128_extr_hi128_256( a ) + 1 ) \ + | ( (uint128_t)mm128_extr_lo128_256( a ) + 1 ) ) + +#define mm256_anybits1( a ) \ + ( (uint128_t)mm128_extr_hi128_256( a ) \ + | (uint128_t)mm128_extr_lo128_256( a ) ) + +#define mm256_allbits0_256( a ) ( !mm256_anybits1(a) ) +#define mm256_allbits1_256( a ) ( !mm256_anybits0(a) ) + // Parallel AES, for when x is expected to be in a 256 bit register. #define mm256_aesenc_2x128( x ) \ mm256_concat_128( \ diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index d7ed432..7ef7833 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -3,9 +3,15 @@ /////////////////////////////////// // -// Integers up to 64 bits. +// Integers up to 128 bits. // - +// These utilities enhance support for integers up to 128 bits. +// All standard operations are supported on 128 bit integers except +// numeric constant representation and IO. 128 bit integers must be built +// and displayed as 2 64 bit halves, just like the old times. +// +// Some utilities are also provided for smaller integers, most notably +// bit rotation. // MMX has no extract instruction for 32 bit elements so this: // Lo is trivial, high is a simple shift. @@ -17,7 +23,6 @@ #define u64_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 4-(n)) <<4 ) ) ) #define u64_extr_8( a, n ) ( (uint8_t) ( (a) >> ( ( 8-(n)) <<3 ) ) ) - // Rotate bits in various sized integers. #define u64_ror_64( x, c ) \ (uint64_t)( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << (64-(c)) ) ) @@ -36,6 +41,9 @@ #define u8_rol_8( x, c ) \ (uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) ) +// Endian byte swap +#define bswap_64( a ) __builtin_bswap64( a ) +#define bswap_32( a ) __builtin_bswap32( a ) // 64 bit mem functions use integral sizes instead of bytes, data must // be aligned to 64 bits. Mostly for scaled indexing convenience. @@ -56,21 +64,20 @@ static inline void memset_64( uint64_t *dst, const uint64_t a, int n ) // // No real need or use.
-#define i128_neg1 ((uint128_t)(-1LL)) +//#define u128_neg1 ((uint128_t)(-1)) -// Extract specified 64 bit half of 128 bit integer. -// typecast should work for lo: (uint64_t)(x), test it! +// Extracting the low bits is a trivial cast. +// These specialized functions are optimized while providing a +// consistent interface. #define u128_hi64( x ) ( (uint64_t)( (uint128_t)(x) >> 64 ) ) -#define u128_lo64( x ) ( (uint64_t)( (uint128_t)(x) << 64 >> 64 ) ) -// #define i128_lo64( x ) ((uint64_t)(x)) +#define u128_lo64( x ) ( (uint64_t)(x) ) -// Generic extract, +// Generic extract, don't use for extracting low bits, cast instead. #define u128_extr_64( a, n ) ( (uint64_t)( (a) >> ( ( 2-(n)) <<6 ) ) ) #define u128_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 4-(n)) <<5 ) ) ) #define u128_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 8-(n)) <<4 ) ) ) #define u128_extr_8( a, n ) ( (uint8_t) ( (a) >> ( (16-(n)) <<3 ) ) ) - // Not much need for this but it fills a gap. #define u128_ror_128( x, c ) \ ( ( (uint128_t)(x) >> (c) ) | ( (uint128_t)(x) << (128-(c)) ) ) diff --git a/simd-utils/simd-mmx.h b/simd-utils/simd-mmx.h index 5f45d76..ca08039 100644 --- a/simd-utils/simd-mmx.h +++ b/simd-utils/simd-mmx.h @@ -111,8 +111,8 @@ #if defined(__SSSE3__) -// An SSE2 versin of this would be monstrous, shifting, masking and oring -// each byte individually. +// An SSE2 or MMX version of this would be monstrous, shifting, masking and +// oring each byte individually. #define mm64_invert_8( v ) \ _mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 0,1,2,3,4,5,6,7 ) ); diff --git a/simd-utils/simd-sse2.h b/simd-utils/simd-sse2.h index 435be0a..78594d7 100644 --- a/simd-utils/simd-sse2.h +++ b/simd-utils/simd-sse2.h @@ -10,11 +10,21 @@ // SSE2 is generally required for full 128 bit support. Some functions // are also optimized with SSSE3 or SSE4.1. // +// Do not call _mm_extract directly, it isn't supported in SSE2. +// Use mm128_extr instead, it will select the appropriate implementation. +// +// 128 bit operations are enhanced with uint128 which adds 128 bit integer +// support for arithmetic and other operations. Casting to uint128_t is not +// free, it requires a move from xmm to gpr but is often the only way or +// the more efficient way for certain operations. // Compile time constant initializers are type agnostic and can have // a pointer handle of almost any type. All arguments must be scalar constants. // up to 64 bits. These iniitializers should only be used at compile time // to initialize vector arrays. All data reside in memory. +// +// These are of limited use; it is often simpler to use uint64_t arrays +// and cast as required. #define mm128_const_64( x1, x0 ) {{ x1, x0 }} #define mm128_const1_64( x ) {{ x, x }} @@ -80,6 +90,28 @@ #define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v ) #define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v ) +// Use uint128_t for most arithmetic, bit shift, comparison operations +// spanning all 128 bits. Some extractions are also more efficient +// casting __m128i as uint128_t and using standard operators. + +// This isn't cheap, not suitable for bulk usage. +#define mm128_extr_4x32( a0, a1, a2, a3, src ) \ +do { \ + a0 = _mm_extract_epi32( src, 0 ); \ + a1 = _mm_extract_epi32( src, 1 ); \ + a2 = _mm_extract_epi32( src, 2 ); \ + a3 = _mm_extract_epi32( src, 3 ); \ +} while(0) + +// Horizontal vector testing + +// Bit-wise test of entire vector, useful to test results of cmp.
+#define mm128_anybits0( a ) (((uint128_t)(a))+1) +#define mm128_anybits1( a ) (uint128_t)(a) + +#define mm128_allbits0( a ) ( !mm128_anybits1(a) ) +#define mm128_allbits1( a ) ( !mm128_anybits0(a) ) + // // Vector pointer cast diff --git a/simd-utils/simd-types.h b/simd-utils/simd-types.h index d272f67..6dbbb30 100644 --- a/simd-utils/simd-types.h +++ b/simd-utils/simd-types.h @@ -93,6 +93,7 @@ // my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 ); // Compiler check for __int128 support +// Configure also has a test for int128. #if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) ) #define GCC_INT128 1 #endif @@ -386,13 +387,3 @@ typedef union _regarray_v256 regarray_v256; #define u8_1e u8_._1e #define u8_1f u8_._1f - -// This is in use by, coincidentally, simd hash. -union _m256_v16 { - uint16_t u16[16]; - __m256i v256; -}; -typedef union _m256_v16 m256_v16; - - - diff --git a/winbuild-cross.sh b/winbuild-cross.sh index 9db3c5e..42f1549 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -34,7 +34,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ make distclean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=znver1 -DRYZEN_ -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS make -j 16 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-zen.exe
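
Note on the context overlay pattern used throughout this patch: x16r, x17 and friends drop their static ctx-holder structs and the one-time init_*_ctx() setup in favour of a stack-allocated union whose members share storage, with each stage re-initializing its context immediately before use. A minimal compilable sketch of the idea follows; the algo_a/algo_b names and shapes are hypothetical stand-ins, not the project's real contexts.

#include <stdint.h>
#include <string.h>

// Hypothetical stand-ins for two chained hash states; the real unions
// overlay sph_*_context, hashState_groestl, cubehashParam, etc.
typedef struct { uint64_t s[8];  } algo_a_ctx;
typedef struct { uint32_t s[16]; } algo_b_ctx;

static void algo_a_init( algo_a_ctx *c ) { memset( c, 0, sizeof *c ); }
static void algo_b_init( algo_b_ctx *c ) { memset( c, 0, sizeof *c ); }

// Members share storage: the hash chain uses each context strictly in
// turn, so none needs to survive past its own stage.
union _example_ctx_overlay
{
   algo_a_ctx a;
   algo_b_ctx b;
};
typedef union _example_ctx_overlay example_ctx_overlay;

void example_hash( void )
{
   // A stack local replaces the old global ctx holder plus the
   // memcpy-from-template; each stage re-inits right before use.
   example_ctx_overlay ctx;
   algo_a_init( &ctx.a );   // stage 1 owns the bytes...
   /* stage 1 update/close would run here */
   algo_b_init( &ctx.b );   // ...stage 2 reuses the same bytes.
   /* stage 2 update/close would run here */
}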
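
The new 4way scanhash loops write all four lanes' nonces straight into the interleaved vdata with a byteswap plus blend instead of four be32enc() calls. The sketch below illustrates the technique only: the helper names are mine, and the reduction of mm256_intrlv_blend_32 to _mm256_blend_epi32 with an odd-lane mask is an assumption based on the 4x64 layout, where the nonce (word 19) lands in the odd 32 bit elements; it is not the project's verbatim definition.

#include <immintrin.h>   // AVX2; build with -mavx2
#include <stdint.h>
#include <stdio.h>

// Per-lane 32 bit byteswap, the job mm256_bswap_32 is assumed to do.
static inline __m256i bswap32x8( __m256i v )
{
   const __m256i ctl = _mm256_set_epi8(
        12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3,
        12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3 );
   return _mm256_shuffle_epi8( v, ctl );
}

// In 4x64 interleaved block data the nonce is the high half of each
// lane's 64 bit chunk, i.e. the odd 32 bit elements of the row at
// vdata + 9. Blend freshly byteswapped nonces into those slots only,
// leaving word 18 (the even elements) untouched.
static inline void set_nonces( __m256i *noncev, uint32_t n )
{
   __m256i nn = bswap32x8(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) );
   *noncev = _mm256_blend_epi32( *noncev, nn, 0xaa );  // 0xaa = odd lanes
}

int main()
{
   __m256i row = _mm256_setzero_si256();
   set_nonces( &row, 0x01020304 );
   uint32_t w[8];
   _mm256_storeu_si256( (__m256i*)w, row );
   for ( int i = 0; i < 8; i++ )
      printf( "element %d: %08x\n", i, w[i] );  // odd elements hold nonces
   return 0;
}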
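
The uint128_t-based horizontal tests above rest on two identities: a vector is all ones exactly when its 128 bit value plus 1 wraps to 0, and it has any bit set exactly when the value is non-zero. A standalone sketch, assuming GCC/Clang __int128 support and using a union in place of the direct vector-to-integer cast:

#include <emmintrin.h>   // SSE2
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 uint128_t;

// Reinterpret a __m128i as a 128 bit integer. A union sidesteps the
// direct cast, which not all compilers accept on vector types.
static inline uint128_t m128_to_u128( __m128i v )
{
   union { __m128i v; uint128_t u; } x;
   x.v = v;
   return x.u;
}

int main()
{
   __m128i a = _mm_set_epi32( 1, 2, 3, 4 );
   __m128i b = _mm_set_epi32( 1, 2, 3, 4 );
   __m128i cmp = _mm_cmpeq_epi32( a, b );  // per-element all-0s / all-1s

   uint128_t t = m128_to_u128( cmp );
   int allbits1 = ( t + 1 ) == 0;   // all ones <=> +1 wraps to zero
   int anybits1 = t != 0;           // any bit set <=> value non-zero

   printf( "allbits1=%d anybits1=%d\n", allbits1, anybits1 );
   return 0;
}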