From 8f94d029e33e8e1ca28ac78f0a6583941aaabc24 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Fri, 17 Nov 2023 14:39:26 -0500 Subject: [PATCH] v23.11 --- RELEASE_NOTES | 28 ++- algo-gate-api.h | 2 +- algo/bmw/bmw512-4way.c | 72 +++++- algo/bmw/bmw512-gate.c | 5 +- algo/bmw/bmw512-gate.h | 12 +- algo/echo/aes_ni/hash.c | 24 +- algo/echo/aes_ni/hash_api.h | 14 +- algo/echo/sph_echo.c | 2 - algo/echo/sph_echo.h | 3 - algo/groestl/sph_groestl.c | 3 - algo/groestl/sph_groestl.h | 2 - algo/keccak/keccak-4way.c | 51 +++- algo/keccak/keccak-gate.c | 16 +- algo/keccak/keccak-gate.h | 50 +++- algo/keccak/sha3d-4way.c | 60 ++++- algo/quark/hmq1725-gate.c | 4 +- algo/quark/hmq1725-gate.h | 1 - algo/quark/hmq1725.c | 369 ++++++++++++----------------- algo/x16/hex.c | 25 +- algo/x16/minotaur.c | 16 +- algo/x16/x16r-4way.c | 399 ++++++++++++++++++++++++++++++++ algo/x16/x16r-gate.c | 59 +++-- algo/x16/x16r-gate.h | 158 ++++++++++--- algo/x16/x16r.c | 73 +++--- algo/x16/x16rt-4way.c | 55 ++++- algo/x16/x16rt.c | 2 +- algo/x16/x16rv2-4way.c | 450 ++++++++++++++++++++++++++++++++++++ algo/x16/x16rv2.c | 35 +-- algo/x16/x21s-4way.c | 116 ++++++++++ algo/x16/x21s.c | 2 +- algo/x22/x22i.c | 54 ++--- algo/x22/x25x.c | 56 ++--- configure | 20 +- configure.ac | 2 +- configure~ | 20 +- 35 files changed, 1721 insertions(+), 539 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 06cb800..fce7e29 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions Requirements ------------ -Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not -supported. +- A x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents. +- Arm CPU supporting AArch64 and NEON. -64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi -are not supported. FreeBSD YMMV. +32 bit CPUs are not supported. 
-ARM requirements (Beta): +Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance. -CPU: Armv8 and NEON, SHA2 & AES are optional -OS: Linux distribution built for AArch64. -Packages: source code only. +Mining on mobile devices that meet the requirements is not recommended due to the risk of +overheating and damaging the battery. Mining has unlimited demand, it will push any device +to or beyond its limits. There is also a fire risk with overheated lithium batteries. + +Beware of apps claiming "mobile only mining". There is no such thing, they aren't miners. +If a mobile CPU can mine it any CPU can. See wiki for details. @@ -73,12 +75,18 @@ If not what makes it happen or not happen? Change Log ---------- +v23.11 + +This is a release candidate for full AArch64 support, marking the end of the Beta phase. +Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4. +Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON. + v23.10 x86_64: Fixed scrypt, scryptn2 algos SSE2. -Fixed sha512d256d algo AVX2, SSE2, NEON. +Fixed sha512256d algo AVX2, SSE2, NEON. Fixed a bug in Skein N-way that reduced performance. -ARM: Skein algo optimized for NEON & SHA2. +ARM: Skein optimized for NEON, SHA2 & SSE2. Skein2 algo 2-way optimized for NEON & SSE2. 
v23.9 diff --git a/algo-gate-api.h b/algo-gate-api.h index 74d12f0..045b306 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -99,7 +99,7 @@ typedef uint32_t set_t; #define AES_OPT 1 << 7 // Intel Westmere, AArch64 #define VAES_OPT 1 << 8 // Icelake, Zen3 #define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64 -#define SHA512_OPT 1 << 10 // AArch64 +#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64 #define NEON_OPT 1 << 11 // AArch64 // AVX10 does not have explicit algo features: diff --git a/algo/bmw/bmw512-4way.c b/algo/bmw/bmw512-4way.c index 392e5f5..7352fed 100644 --- a/algo/bmw/bmw512-4way.c +++ b/algo/bmw/bmw512-4way.c @@ -2,12 +2,11 @@ #include #include #include -//#include "sph_keccak.h" #include "bmw-hash-4way.h" #if defined(BMW512_8WAY) -void bmw512hash_8way(void *state, const void *input) +void bmw512hash_8way( void *state, const void *input ) { bmw512_8way_context ctx; bmw512_8way_init( &ctx ); @@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce, uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; - __m512i *noncev = (__m512i*)vdata + 9; // aligned + __m512i *noncev = (__m512i*)vdata + 9; const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; + const int thr_id = mythr->id; mm512_bswap32_intrlv80_8x64( vdata, pdata ); do { @@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce, if ( unlikely( hash7[ lane<<1 ] <= Htarg ) ) { extr_lane_8x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) ) + if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark )) { pdata[19] = n + lane; submit_solution( work, lane_hash, mythr ); @@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce, #elif defined(BMW512_4WAY) -//#ifdef BMW512_4WAY - -void bmw512hash_4way(void *state, const void *input) +void bmw512hash_4way( void *state, const void *input ) { bmw512_4way_context ctx; bmw512_4way_init( &ctx ); 
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 4; - __m256i *noncev = (__m256i*)vdata + 9; // aligned + const uint32_t last_nonce = max_nonce - 4; + __m256i *noncev = (__m256i*)vdata + 9; const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated + const int thr_id = mythr->id; mm256_bswap32_intrlv80_4x64( vdata, pdata ); do { @@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce, if ( unlikely( hash7[ lane<<1 ] <= Htarg ) ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) ) + if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark )) { pdata[19] = n + lane; submit_solution( work, lane_hash, mythr ); @@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce, return 0; } +#elif defined(BMW512_2WAY) + +void bmw512hash_2x64( void *state, const void *input ) +{ + bmw512_2x64_context ctx; + bmw512_2x64_init( &ctx ); + bmw512_2x64_update( &ctx, input, 80 ); + bmw512_2x64_close( &ctx, state ); +} + +int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[24*2] __attribute__ ((aligned (64))); + uint32_t hash[16*2] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[13]); // 3*4+1 + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + v128_t *noncev = (v128_t*)vdata + 9; + const uint32_t Htarg = ptarget[7]; + const int thr_id = mythr->id; + + v128_bswap32_intrlv80_2x64( vdata, pdata ); + do { + *noncev = v128_intrlv_blend_32( v128_bswap32( + v128_set32( n+1, 0, n, 0 ) ), *noncev ); + + bmw512hash_2x64( 
hash, vdata ); + + for ( int lane = 0; lane < 2; lane++ ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg ) ) + { + extr_lane_2x64( lane_hash, hash, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark )) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + n += 2; + + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + + *hashes_done = n - first_nonce; + return 0; +} + #endif diff --git a/algo/bmw/bmw512-gate.c b/algo/bmw/bmw512-gate.c index fb7d0d3..be0bdc5 100644 --- a/algo/bmw/bmw512-gate.c +++ b/algo/bmw/bmw512-gate.c @@ -2,7 +2,7 @@ bool register_bmw512_algo( algo_gate_t* gate ) { - gate->optimizations = AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; opt_target_factor = 256.0; #if defined (BMW512_8WAY) gate->scanhash = (void*)&scanhash_bmw512_8way; @@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate ) #elif defined (BMW512_4WAY) gate->scanhash = (void*)&scanhash_bmw512_4way; gate->hash = (void*)&bmw512hash_4way; +#elif defined (BMW512_2WAY) + gate->scanhash = (void*)&scanhash_bmw512_2x64; + gate->hash = (void*)&bmw512hash_2x64; #else gate->scanhash = (void*)&scanhash_bmw512; gate->hash = (void*)&bmw512hash; diff --git a/algo/bmw/bmw512-gate.h b/algo/bmw/bmw512-gate.h index 4c7fb41..e7542ca 100644 --- a/algo/bmw/bmw512-gate.h +++ b/algo/bmw/bmw512-gate.h @@ -8,19 +8,27 @@ #define BMW512_8WAY 1 #elif defined(__AVX2__) #define BMW512_4WAY 1 +#elif defined(__SSE2__) || defined(__ARM_NEON) + #define BMW512_2WAY 1 #endif #if defined(BMW512_8WAY) void bmw512hash_8way( void *state, const void *input ); int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); + uint64_t *hashes_done, struct thr_info *mythr ); #elif defined(BMW512_4WAY) void bmw512hash_4way( void *state, const void *input ); int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, 
struct thr_info *mythr ); + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(BMW512_2WAY) + +void bmw512hash_2x64( void *state, const void *input ); +int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); #else diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index 057dedf..382ecf7 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc } - - -HashReturn init_echo(hashState_echo *ctx, int nHashSize) +HashReturn init_echo( hashState_echo *ctx, int nHashSize ) { int i, j; @@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize) return SUCCESS; } -HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen) +HashReturn update_echo( hashState_echo *state, const void *data, + uint32_t databitlen ) { unsigned int uByteLength, uBlockCount, uRemainingBytes; @@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt return SUCCESS; } -HashReturn final_echo(hashState_echo *state, BitSequence *hashval) +HashReturn final_echo( hashState_echo *state, void *hashval) { v128_t remainingbits; @@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval) return SUCCESS; } -HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, - const BitSequence *data, DataLength databitlen ) +HashReturn update_final_echo( hashState_echo *state, void *hashval, + const void *data, uint32_t databitlen ) { unsigned int uByteLength, uBlockCount, uRemainingBytes; @@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, return SUCCESS; } -HashReturn echo_full( hashState_echo *state, BitSequence *hashval, - int nHashSize, const BitSequence *data, DataLength datalen ) +HashReturn echo_full( hashState_echo *state, void *hashval, + 
int nHashSize, const void *data, uint32_t datalen ) { int i, j; @@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval, { // Fill the buffer memcpy( state->buffer + state->uBufferBytes, - (void*)data, state->uBlockLength - state->uBufferBytes ); + data, state->uBlockLength - state->uBufferBytes ); // Process buffer Compress( state, state->buffer, 1 ); @@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval, } if( uRemainingBytes > 0 ) - memcpy(state->buffer, (void*)data, uRemainingBytes); + memcpy(state->buffer, data, uRemainingBytes); state->uBufferBytes = uRemainingBytes; } @@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval, } - +#if 0 HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval) { HashReturn hRet; @@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit return SUCCESS; } +#endif #endif diff --git a/algo/echo/aes_ni/hash_api.h b/algo/echo/aes_ni/hash_api.h index b961fe6..268cb6b 100644 --- a/algo/echo/aes_ni/hash_api.h +++ b/algo/echo/aes_ni/hash_api.h @@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen); HashReturn reinit_echo(hashState_echo *state); -HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen); +HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen); -HashReturn final_echo(hashState_echo *state, BitSequence *hashval); +HashReturn final_echo(hashState_echo *state, void *hashval); -HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval); +HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval); -HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, - const BitSequence *data, DataLength databitlen ); -HashReturn echo_full( hashState_echo *state, 
BitSequence *hashval, - int nHashSize, const BitSequence *data, DataLength databitlen ); +HashReturn update_final_echo( hashState_echo *state, void *hashval, + const void *data, uint32_t databitlen ); +HashReturn echo_full( hashState_echo *state, void *hashval, + int nHashSize, const void *data, uint32_t databitlen ); #endif // HASH_API_H diff --git a/algo/echo/sph_echo.c b/algo/echo/sph_echo.c index b7b3c06..16b6ee3 100644 --- a/algo/echo/sph_echo.c +++ b/algo/echo/sph_echo.c @@ -36,7 +36,6 @@ #include "sph_echo.h" -#if !defined(__AES__) #ifdef __cplusplus extern "C"{ @@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #ifdef __cplusplus } #endif -#endif // !AES diff --git a/algo/echo/sph_echo.h b/algo/echo/sph_echo.h index 8165f7b..b67e258 100644 --- a/algo/echo/sph_echo.h +++ b/algo/echo/sph_echo.h @@ -36,8 +36,6 @@ #ifndef SPH_ECHO_H__ #define SPH_ECHO_H__ -#if !defined(__AES__) - #ifdef __cplusplus extern "C"{ #endif @@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close( #ifdef __cplusplus } #endif -#endif // !AES #endif diff --git a/algo/groestl/sph_groestl.c b/algo/groestl/sph_groestl.c index 8f12430..11d48a6 100644 --- a/algo/groestl/sph_groestl.c +++ b/algo/groestl/sph_groestl.c @@ -35,8 +35,6 @@ #include "sph_groestl.h" -#if !defined(__AES__) - #ifdef __cplusplus extern "C"{ #endif @@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #ifdef __cplusplus } -#endif // !AES #endif diff --git a/algo/groestl/sph_groestl.h b/algo/groestl/sph_groestl.h index 899d716..27c7db0 100644 --- a/algo/groestl/sph_groestl.h +++ b/algo/groestl/sph_groestl.h @@ -42,7 +42,6 @@ extern "C"{ #include #include "compat/sph_types.h" -#if !defined(__AES__) /** * Output size (in bits) for Groestl-224. 
*/ @@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close( } #endif -#endif // !AES #endif diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index 6bc1b2c..a9c9cad 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned + __m256i *noncev = (__m256i*)vdata + 9; // aligned const uint32_t Htarg = ptarget[7]; const int thr_id = mythr->id; const bool bench = opt_benchmark; @@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, return 0; } +#elif defined(KECCAK_2WAY) + +void keccakhash_2x64(void *state, const void *input) +{ + keccak256_2x64_context ctx; + keccak256_2x64_init( &ctx ); + keccak256_2x64_update( &ctx, input, 80 ); + keccak256_2x64_close( &ctx, state ); +} + +int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[24*2] __attribute__ ((aligned (64))); + uint32_t hash[16*2] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[13]); // 3*4+1 + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + v128_t *noncev = (v128_t*)vdata + 9; + const uint32_t Htarg = ptarget[7]; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + v128_bswap32_intrlv80_2x64( vdata, pdata ); + *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); + do { + keccakhash_2x64( hash, vdata ); + + for ( int lane = 0; lane < 2; lane++ ) + if unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) + { + extr_lane_2x64( lane_hash, hash, lane, 256 ); + if ( valid_hash( lane_hash, ptarget )) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, 
lane_hash, mythr ); + } + } + *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) ); + n += 2; + } while ( (n < max_nonce-2) && !work_restart[thr_id].restart); + pdata[19] = n; + *hashes_done = n - first_nonce + 1; + return 0; +} + #endif diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index b2f0a21..b0021eb 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate ) #elif defined (KECCAK_4WAY) gate->scanhash = (void*)&scanhash_keccak_4way; gate->hash = (void*)&keccakhash_4way; +#elif defined (KECCAK_2WAY) + gate->scanhash = (void*)&scanhash_keccak_2x64; + gate->hash = (void*)&keccakhash_2x64; #else gate->scanhash = (void*)&scanhash_keccak; gate->hash = (void*)&keccakhash; @@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate ) #elif defined (KECCAK_4WAY) gate->scanhash = (void*)&scanhash_keccak_4way; gate->hash = (void*)&keccakhash_4way; +#elif defined (KECCAK_2WAY) + gate->scanhash = (void*)&scanhash_keccak_2x64; + gate->hash = (void*)&keccakhash_2x64; #else gate->scanhash = (void*)&scanhash_keccak; gate->hash = (void*)&keccakhash; @@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) bool register_sha3d_algo( algo_gate_t* gate ) { hard_coded_eb = 6; -// opt_extranonce = false; - gate->optimizations = AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root; -#if defined (KECCAK_8WAY) +#if defined (SHA3D_8WAY) gate->scanhash = (void*)&scanhash_sha3d_8way; gate->hash = (void*)&sha3d_hash_8way; -#elif defined (KECCAK_4WAY) +#elif defined (SHA3D_4WAY) gate->scanhash = (void*)&scanhash_sha3d_4way; gate->hash = (void*)&sha3d_hash_4way; +#elif defined (SHA3D_2WAY) + gate->scanhash = (void*)&scanhash_sha3d_2x64; + gate->hash = (void*)&sha3d_hash_2x64; #else gate->scanhash = (void*)&scanhash_sha3d; gate->hash = (void*)&sha3d_hash; 
diff --git a/algo/keccak/keccak-gate.h b/algo/keccak/keccak-gate.h index cee3d00..bd2b6a3 100644 --- a/algo/keccak/keccak-gate.h +++ b/algo/keccak/keccak-gate.h @@ -8,6 +8,16 @@ #define KECCAK_8WAY 1 #elif defined(__AVX2__) #define KECCAK_4WAY 1 +#elif defined(__SSE2__) || defined(__ARM_NEON) + #define KECCAK_2WAY 1 +#endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define SHA3D_8WAY 1 +#elif defined(__AVX2__) + #define SHA3D_4WAY 1 +#elif defined(__SSE2__) || defined(__ARM_NEON) + #define SHA3D_2WAY 1 #endif extern int hard_coded_eb; @@ -16,27 +26,47 @@ extern int hard_coded_eb; void keccakhash_8way( void *state, const void *input ); int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - -void sha3d_hash_8way( void *state, const void *input ); -int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); + uint64_t *hashes_done, struct thr_info *mythr ); #elif defined(KECCAK_4WAY) void keccakhash_4way( void *state, const void *input ); int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); + uint64_t *hashes_done, struct thr_info *mythr ); -void sha3d_hash_4way( void *state, const void *input ); -int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); +#elif defined(KECCAK_2WAY) + +void keccakhash_2x64( void *state, const void *input ); +int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); #else void keccakhash( void *state, const void *input ); int scanhash_keccak( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + +#if defined(SHA3D_8WAY) + +void sha3d_hash_8way( void *state, const void *input ); +int 
scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(SHA3D_4WAY) + +void sha3d_hash_4way( void *state, const void *input ); +int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(SHA3D_2WAY) + +void sha3d_hash_2x64( void *state, const void *input ); +int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#else void sha3d_hash( void *state, const void *input ); int scanhash_sha3d( struct work *work, uint32_t max_nonce, diff --git a/algo/keccak/sha3d-4way.c b/algo/keccak/sha3d-4way.c index d11df12..ba885b3 100644 --- a/algo/keccak/sha3d-4way.c +++ b/algo/keccak/sha3d-4way.c @@ -4,7 +4,7 @@ #include #include "keccak-hash-4way.h" -#if defined(KECCAK_8WAY) +#if defined(SHA3D_8WAY) void sha3d_hash_8way(void *state, const void *input) { @@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, return 0; } -#elif defined(KECCAK_4WAY) +#elif defined(SHA3D_4WAY) void sha3d_hash_4way(void *state, const void *input) { @@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, return 0; } +#elif defined(SHA3D_2WAY) + +void sha3d_hash_2x64(void *state, const void *input) +{ + uint32_t buffer[16*4] __attribute__ ((aligned (64))); + keccak256_2x64_context ctx; + + keccak256_2x64_init( &ctx ); + keccak256_2x64_update( &ctx, input, 80 ); + keccak256_2x64_close( &ctx, buffer ); + + keccak256_2x64_init( &ctx ); + keccak256_2x64_update( &ctx, buffer, 32 ); + keccak256_2x64_close( &ctx, state ); +} + +int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[24*2] __attribute__ ((aligned (64))); + uint32_t hash[16*2] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[13]); // 3*4+1 + 
uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + v128_t *noncev = (v128_t*)vdata + 9; + const uint32_t Htarg = ptarget[7]; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + v128_bswap32_intrlv80_2x64( vdata, pdata ); + *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); + do { + sha3d_hash_2x64( hash, vdata ); + + for ( int lane = 0; lane < 2; lane++ ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) ) + { + extr_lane_2x64( lane_hash, hash, lane, 256 ); + if ( valid_hash( lane_hash, ptarget ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) ); + n += 2; + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + #endif diff --git a/algo/quark/hmq1725-gate.c b/algo/quark/hmq1725-gate.c index 4c64bff..e5977db 100644 --- a/algo/quark/hmq1725-gate.c +++ b/algo/quark/hmq1725-gate.c @@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_hmq1725_4way; gate->hash = (void*)&hmq1725_4way_hash; #else - init_hmq1725_ctx(); gate->scanhash = (void*)&scanhash_hmq1725; gate->hash = (void*)&hmq1725hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT + | NEON_OPT; opt_target_factor = 65536.0; return true; }; diff --git a/algo/quark/hmq1725-gate.h b/algo/quark/hmq1725-gate.h index faef6fc..bc0ff99 100644 --- a/algo/quark/hmq1725-gate.h +++ b/algo/quark/hmq1725-gate.h @@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, void hmq1725hash( void *state, const void *input ); int scanhash_hmq1725( struct work *work, uint32_t max_nonce, 
uint64_t *hashes_done, struct thr_info *mythr ); -void init_hmq1725_ctx(); #endif diff --git a/algo/quark/hmq1725.c b/algo/quark/hmq1725.c index ea0119a..67f055e 100644 --- a/algo/quark/hmq1725.c +++ b/algo/quark/hmq1725.c @@ -4,346 +4,267 @@ #include #include -#include "algo/blake/sph_blake.h" +#include "algo/blake/blake512-hash.h" #include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" +#if defined(__AES__) + #include "algo/groestl/aes_ni/hash-groestl.h" + #include "algo/fugue/fugue-aesni.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/fugue/sph_fugue.h" +#endif +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + #include "algo/echo/aes_ni/hash_api.h" +#else + #include "algo/echo/sph_echo.h" +#endif #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" -#include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/sph-haval.h" #include "algo/sha/sph_sha2.h" -#if defined(__AES__) - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/fugue/fugue-aesni.h" -#else - #include "algo/groestl/sph_groestl.h" - #include "algo/echo/sph_echo.h" - #include "algo/fugue/sph_fugue.h" -#endif #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" -typedef struct { - sph_blake512_context blake1, blake2; - sph_bmw512_context bmw1, bmw2, bmw3; - sph_skein512_context skein1, skein2; - sph_jh512_context jh1, jh2; - sph_keccak512_context keccak1, keccak2; - hashState_luffa luffa1, luffa2; - cubehashParam cube; - sph_shavite512_context shavite1, shavite2; -#if defined(__aarch64__) - sph_simd512_context simd1, simd2; 
-#else - hashState_sd simd1, simd2; -#endif - sph_hamsi512_context hamsi1; - sph_shabal512_context shabal1; - sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4; - sph_sha512_context sha1, sha2; - sph_haval256_5_context haval1, haval2; -#if defined(__AES__) - hashState_echo echo1, echo2; - hashState_groestl groestl1, groestl2; - hashState_fugue fugue1, fugue2; -#else - sph_groestl512_context groestl1, groestl2; - sph_echo512_context echo1, echo2; - sph_fugue512_context fugue1, fugue2; -#endif -} hmq1725_ctx_holder; - -static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64))); -static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64))); - -void init_hmq1725_ctx() +union _hmq1725_ctx_holder { - sph_blake512_init(&hmq1725_ctx.blake1); - sph_blake512_init(&hmq1725_ctx.blake2); - - sph_bmw512_init(&hmq1725_ctx.bmw1); - sph_bmw512_init(&hmq1725_ctx.bmw2); - sph_bmw512_init(&hmq1725_ctx.bmw3); - - sph_skein512_init(&hmq1725_ctx.skein1); - sph_skein512_init(&hmq1725_ctx.skein2); - - sph_jh512_init(&hmq1725_ctx.jh1); - sph_jh512_init(&hmq1725_ctx.jh2); - - sph_keccak512_init(&hmq1725_ctx.keccak1); - sph_keccak512_init(&hmq1725_ctx.keccak2); - - init_luffa( &hmq1725_ctx.luffa1, 512 ); - init_luffa( &hmq1725_ctx.luffa2, 512 ); - - cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 ); - - sph_shavite512_init(&hmq1725_ctx.shavite1); - sph_shavite512_init(&hmq1725_ctx.shavite2); - -#if defined(__aarch64__) - sph_simd512_init(&hmq1725_ctx.simd1); - sph_simd512_init(&hmq1725_ctx.simd2); -#else - init_sd( &hmq1725_ctx.simd1, 512 ); - init_sd( &hmq1725_ctx.simd2, 512 ); -#endif - - sph_hamsi512_init(&hmq1725_ctx.hamsi1); - + blake512_context blake; + sph_bmw512_context bmw; #if defined(__AES__) - fugue512_Init( &hmq1725_ctx.fugue1, 512 ); - fugue512_Init( &hmq1725_ctx.fugue2, 512 ); + hashState_groestl groestl; + hashState_fugue fugue; #else - sph_fugue512_init(&hmq1725_ctx.fugue1); - sph_fugue512_init(&hmq1725_ctx.fugue2); + 
sph_groestl512_context groestl; + sph_fugue512_context fugue; #endif - - sph_shabal512_init(&hmq1725_ctx.shabal1); - - sph_whirlpool_init(&hmq1725_ctx.whirlpool1); - sph_whirlpool_init(&hmq1725_ctx.whirlpool2); - sph_whirlpool_init(&hmq1725_ctx.whirlpool3); - sph_whirlpool_init(&hmq1725_ctx.whirlpool4); - - sph_sha512_init( &hmq1725_ctx.sha1 ); - sph_sha512_init( &hmq1725_ctx.sha2 ); - - sph_haval256_5_init(&hmq1725_ctx.haval1); - sph_haval256_5_init(&hmq1725_ctx.haval2); - -#if defined(__AES__) - init_echo( &hmq1725_ctx.echo1, 512 ); - init_echo( &hmq1725_ctx.echo2, 512 ); - init_groestl( &hmq1725_ctx.groestl1, 64 ); - init_groestl( &hmq1725_ctx.groestl2, 64 ); +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + hashState_echo echo; #else - sph_groestl512_init( &hmq1725_ctx.groestl1 ); - sph_groestl512_init( &hmq1725_ctx.groestl2 ); - sph_echo512_init( &hmq1725_ctx.echo1 ); - sph_echo512_init( &hmq1725_ctx.echo2 ); + sph_echo512_context echo; #endif -} + sph_skein512_context skein; + sph_jh512_context jh; + sph_keccak512_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + simd512_context simd; + sph_hamsi512_context hamsi; + sph_shabal512_context shabal; + sph_whirlpool_context whirlpool; + sph_sha512_context sha; + sph_haval256_5_context haval; +}; +typedef union _hmq1725_ctx_holder hmq1725_ctx_holder; -void hmq_bmw512_midstate( const void* input ) -{ - memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid ); - sph_bmw512( &hmq_bmw_mid, input, 64 ); -} - -__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64))); +//static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64))); +//static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64))); extern void hmq1725hash(void *state, const void *input) { const uint32_t mask = 24; - uint32_t hashA[32] __attribute__((aligned(64))); - uint32_t hashB[32] __attribute__((aligned(64))); - const int midlen = 64; // bytes - const int tail = 80 
- midlen; // 16 + uint32_t hashA[32] __attribute__((aligned(32))); + uint32_t hashB[32] __attribute__((aligned(32))); + hmq1725_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx)); + sph_bmw512_init( &ctx.bmw ); + sph_bmw512( &ctx.bmw, input, 80 ); + sph_bmw512_close( &ctx.bmw, hashA ); //1 - memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid ); - sph_bmw512( &h_ctx.bmw1, input + midlen, tail ); - sph_bmw512_close(&h_ctx.bmw1, hashA); //1 - - sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0 - sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1 + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //0 + sph_whirlpool_close( &ctx.whirlpool, hashB ); //1 if ( hashB[0] & mask ) //1 { #if defined(__AES__) - update_and_final_groestl( &h_ctx.groestl1, (char*)hashA, - (const char*)hashB, 512 ); + groestl512_full( &ctx.groestl, hashA, hashB, 512 ); #else - sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1 - sph_groestl512_close(&h_ctx.groestl1, hashA); //2 + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hashB, 64 ); //1 + sph_groestl512_close( &ctx.groestl, hashA ); //2 #endif } else { - sph_skein512 (&h_ctx.skein1, hashB, 64); //1 - sph_skein512_close(&h_ctx.skein1, hashA); //2 + sph_skein512_init( &ctx.skein ); + sph_skein512( &ctx.skein, hashB, 64 ); //1 + sph_skein512_close( &ctx.skein, hashA ); //2 } - sph_jh512 (&h_ctx.jh1, hashA, 64); //3 - sph_jh512_close(&h_ctx.jh1, hashB); //4 + sph_jh512_init( &ctx.jh ); + sph_jh512( &ctx.jh, hashA, 64 ); //3 + sph_jh512_close( &ctx.jh, hashB ); //4 - sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2 - sph_keccak512_close(&h_ctx.keccak1, hashA); //3 + sph_keccak512_init( &ctx.keccak ); + sph_keccak512( &ctx.keccak, hashB, 64 ); //2 + sph_keccak512_close( &ctx.keccak, hashA ); //3 if ( hashA[0] & mask ) //4 { - sph_blake512 (&h_ctx.blake1, hashA, 64); // - sph_blake512_close(&h_ctx.blake1, hashB); //5 + blake512_init( &ctx.blake ); 
+ blake512_update( &ctx.blake, hashA, 64 ); + blake512_close( &ctx.blake, hashB ); } else { - sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4 - sph_bmw512_close(&h_ctx.bmw2, hashB); //5 + sph_bmw512_init( &ctx.bmw ); + sph_bmw512( &ctx.bmw, hashA, 64 ); //4 + sph_bmw512_close( &ctx.bmw, hashB ); //5 } - update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 ); + luffa_full( &ctx.luffa, hashA, 512, hashB, 64 ); - cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 ); + cubehash_full( &ctx.cube, hashB, 512, hashA, 64 ); if ( hashB[0] & mask ) //7 { - sph_keccak512 (&h_ctx.keccak2, hashB, 64); // - sph_keccak512_close(&h_ctx.keccak2, hashA); //8 + sph_keccak512_init( &ctx.keccak ); + sph_keccak512( &ctx.keccak, hashB, 64 ); // + sph_keccak512_close( &ctx.keccak, hashA ); //8 } else { - sph_jh512 (&h_ctx.jh2, hashB, 64); //7 - sph_jh512_close(&h_ctx.jh2, hashA); //8 + sph_jh512_init( &ctx.jh ); + sph_jh512( &ctx.jh, hashB, 64 ); //7 + sph_jh512_close( &ctx.jh, hashA ); //8 } - sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3 - sph_shavite512_close(&h_ctx.shavite1, hashB); //4 + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hashA, 64 ); //3 + sph_shavite512_close( &ctx.shavite, hashB ); //4 -#if defined(__aarch64__) - sph_simd512 (&h_ctx.simd1, hashB, 64); //3 - sph_simd512_close(&h_ctx.simd1, hashA); //4 -#else - update_final_sd( &h_ctx.simd1, (BitSequence *)hashA, - (const BitSequence *)hashB, 512 ); -#endif + simd512_ctx( &ctx.simd, hashA, hashB, 64 ); if ( hashA[0] & mask ) //4 { - sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); // - sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5 + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hashA, 64 ); // + sph_whirlpool_close( &ctx.whirlpool, hashB ); //5 } else { - sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4 - sph_haval256_5_close(&h_ctx.haval1, hashB); //5 + sph_haval256_5_init( &ctx.haval ); + sph_haval256_5( &ctx.haval, hashA, 64 ); //4 + sph_haval256_5_close( &ctx.haval, hashB 
); //5 memset(&hashB[8], 0, 32); } -#if defined(__AES__) - update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA, - (const BitSequence *)hashB, 512 ); +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + echo_full( &ctx.echo, hashA, 512, hashB, 64 ); #else - sph_echo512 (&h_ctx.echo1, hashB, 64); //5 - sph_echo512_close(&h_ctx.echo1, hashA); //6 + sph_echo512_init( &ctx.echo ); + sph_echo512( &ctx.echo, hashB, 64 ); //5 + sph_echo512_close( &ctx.echo, hashA ); //6 #endif - sph_blake512 (&h_ctx.blake2, hashA, 64); //6 - sph_blake512_close(&h_ctx.blake2, hashB); //7 + blake512_init( &ctx.blake ); + blake512_update( &ctx.blake, hashA, 64 ); + blake512_close( &ctx.blake, hashB ); if ( hashB[0] & mask ) //7 { - sph_shavite512 (&h_ctx.shavite2, hashB, 64); // - sph_shavite512_close(&h_ctx.shavite2, hashA); //8 + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hashB, 64 ); // + sph_shavite512_close( &ctx.shavite, hashA ); //8 } else - { - update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 ); - } + luffa_full( &ctx.luffa, hashA, 512, hashB, 64 ); - sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3 - sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4 + sph_hamsi512_init( &ctx.hamsi ); + sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3 + sph_hamsi512_close( &ctx.hamsi, hashB ); //4 #if defined(__AES__) - fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2 //// - fugue512_Final( &h_ctx.fugue1, hashA ); //3 + fugue512_full( &ctx.fugue, hashA, hashB, 64 ); #else - sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 //// - sph_fugue512_close(&h_ctx.fugue1, hashA); //3 + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hashB, 64 ); //2 //// + sph_fugue512_close( &ctx.fugue, hashA ); //3 #endif if ( hashA[0] & mask ) //4 { -#if defined(__AES__) - update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB, - (const BitSequence *)hashA, 512 ); +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + echo_full( &ctx.echo, hashB, 512, hashA, 64 ); #else - sph_echo512 
(&h_ctx.echo2, hashA, 64); // - sph_echo512_close(&h_ctx.echo2, hashB); //5 + sph_echo512_init( &ctx.echo ); + sph_echo512( &ctx.echo, hashA, 64 ); // + sph_echo512_close( &ctx.echo, hashB ); //5 #endif } else - { -#if defined(__aarch64__) - sph_simd512(&h_ctx.simd2, hashA, 64); //6 - sph_simd512_close(&h_ctx.simd2, hashB); //7 -#else - update_final_sd( &h_ctx.simd2, (BitSequence *)hashB, - (const BitSequence *)hashA, 512 ); -#endif - } + simd512_ctx( &ctx.simd, hashB, hashA, 64 ); - sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5 - sph_shabal512_close(&h_ctx.shabal1, hashA); //6 + sph_shabal512_init( &ctx.shabal ); + sph_shabal512( &ctx.shabal, hashB, 64 ); //5 + sph_shabal512_close( &ctx.shabal, hashA ); //6 - sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6 - sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7 + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6 + sph_whirlpool_close( &ctx.whirlpool, hashB ); //7 if ( hashB[0] & mask ) //7 { #if defined(__AES__) - fugue512_Update( &h_ctx.fugue2, hashB, 512 ); // - fugue512_Final( &h_ctx.fugue2, hashA ); //8 + fugue512_full( &ctx.fugue, hashA, hashB, 64 ); #else - sph_fugue512 (&h_ctx.fugue2, hashB, 64); // - sph_fugue512_close(&h_ctx.fugue2, hashA); //8 + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hashB, 64 ); // + sph_fugue512_close( &ctx.fugue, hashA ); //8 #endif } else { - sph_sha512( &h_ctx.sha1, hashB, 64 ); - sph_sha512_close( &h_ctx.sha1, hashA ); + sph_sha512_init( &ctx.sha ); + sph_sha512( &ctx.sha, hashB, 64 ); + sph_sha512_close( &ctx.sha, hashA ); } #if defined(__AES__) - update_and_final_groestl( &h_ctx.groestl2, (char*)hashB, - (const char*)hashA, 512 ); + groestl512_full( &ctx.groestl, hashB, hashA, 512 ); #else - sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3 - sph_groestl512_close(&h_ctx.groestl2, hashB); //4 + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hashA, 64 ); //3 + sph_groestl512_close( &ctx.groestl, hashB 
); //4 #endif - sph_sha512( &h_ctx.sha2, hashB, 64 ); - sph_sha512_close( &h_ctx.sha2, hashA ); + sph_sha512_init( &ctx.sha ); + sph_sha512( &ctx.sha, hashB, 64 ); + sph_sha512_close( &ctx.sha, hashA ); if ( hashA[0] & mask ) //4 { - sph_haval256_5 (&h_ctx.haval2, hashA, 64); // - sph_haval256_5_close(&h_ctx.haval2, hashB); //5 - memset(&hashB[8], 0, 32); + sph_haval256_5_init( &ctx.haval ); + sph_haval256_5( &ctx.haval, hashA, 64 ); // + sph_haval256_5_close( &ctx.haval, hashB ); //5 + memset( &hashB[8], 0, 32 ); } else { - sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4 - sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5 + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4 + sph_whirlpool_close( &ctx.whirlpool, hashB ); //5 } - sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5 - sph_bmw512_close(&h_ctx.bmw3, hashA); //6 + sph_bmw512_init( &ctx.bmw ); + sph_bmw512( &ctx.bmw, hashB, 64 ); //5 + sph_bmw512_close( &ctx.bmw, hashA ); //6 - memcpy(state, hashA, 32); + memcpy( state, hashA, 32 ); } int scanhash_hmq1725( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { // uint32_t endiandata[32] __attribute__((aligned(64))); - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); + uint32_t endiandata[20] __attribute__((aligned(32))); + uint32_t hash64[8] __attribute__((aligned(32))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[19] - 1; @@ -356,7 +277,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, for (int k = 0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); - hmq_bmw512_midstate( endiandata ); +// hmq_bmw512_midstate( endiandata ); // if (opt_debug) // { diff --git a/algo/x16/hex.c b/algo/x16/hex.c index b87e8ef..d3b213a 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -52,7 +52,7 @@ int hex_hash( void* output, const void* input, int thrid ) break; case GROESTL: #if defined(__AES__) - 
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 ); + groestl512_full( &ctx.groestl, hash, in, size<<3 ); #else sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, in, size ); @@ -108,26 +108,15 @@ int hex_hash( void* output, const void* input, int thrid ) shavite512_full( &ctx.shavite, hash, in, size ); break; case SIMD: -#if defined(__aarch64__) - sph_simd512_init( &ctx.simd ); - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - simd_full( &ctx.simd, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); -#endif + simd512_ctx( &ctx.simd, hash, in, size<<3 ); break; case ECHO: -#if defined(__AES__) - echo_full( &ctx.echo, (BitSequence *)hash, 512, - (const BitSequence *)in, size ); +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + echo_full( &ctx.echo, hash, 512, in, size ); #else - sph_echo512_init( &ctx.echo ); - sph_echo512( &ctx.echo, in, size ); - sph_echo512_close( &ctx.echo, hash ); + sph_echo512_init( &ctx.echo ); + sph_echo512( &ctx.echo, in, size ); + sph_echo512_close( &ctx.echo, hash ); #endif break; case HAMSI: diff --git a/algo/x16/minotaur.c b/algo/x16/minotaur.c index d1e3e77..1ffb7b0 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -14,9 +14,9 @@ #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/simd-hash-2way.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#endif +//#if defined(__aarch64__) +// #include "algo/simd/sph_simd.h" +//#endif #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" @@ -24,10 +24,14 @@ #include "algo/yespower/yespower.h" #if defined(__AES__) || defined(__ARM_FEATURE_AES) #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include 
"algo/echo/sph_echo.h" +#endif +#if defined(__AES__) // || defined(__ARM_FEATURE_AES) + #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" #endif -#include "algo/echo/sph_echo.h" -#include "algo/groestl/sph_groestl.h" #if defined(__AES__) #include "algo/fugue/fugue-aesni.h" #else diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index b16970b..88c25f7 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -971,4 +971,403 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, return 0; } +#elif defined (X16R_2WAY) + +void x16r_2x64_prehash( void *vdata, void *pdata ) +{ + uint32_t edata[20] __attribute__ ((aligned (64))); + const char elem = x16r_hash_order[0]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case JH: + v128_bswap32_intrlv80_2x64( vdata, pdata ); + jh512_2x64_init( &x16r_ctx.jh ); + jh512_2x64_update( &x16r_ctx.jh, vdata, 64 ); + break; + case KECCAK: + v128_bswap32_intrlv80_2x64( vdata, pdata ); + keccak512_2x64_init( &x16r_ctx.keccak ); + keccak512_2x64_update( &x16r_ctx.keccak, vdata, 72 ); + break; + case SKEIN: + v128_bswap32_intrlv80_2x64( vdata, pdata ); + skein512_2x64_prehash64( &x16r_ctx.skein, vdata ); + break; + case LUFFA: + { + v128_bswap32_80( edata, pdata ); + init_luffa( &x16r_ctx.luffa, 512 ); + update_luffa( &x16r_ctx.luffa, edata, 64 ); + intrlv_2x64( vdata, edata, edata, 640 ); + } + break; + case CUBEHASH: + { + v128_bswap32_80( edata, pdata ); + cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16r_ctx.cube, edata, 64 ); + intrlv_2x64( vdata, edata, edata, 640 ); + } + break; + case HAMSI: +#if defined(__SSE4_2__) + v128_bswap32_intrlv80_2x64( vdata, pdata ); + hamsi512_2x64_init( &x16r_ctx.hamsi ); + hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 ); +#else + v128_bswap32_80( edata, pdata ); + sph_hamsi512_init( &x16r_ctx.hamsi ); + sph_hamsi512( &x16r_ctx.hamsi, edata, 72 ); + intrlv_2x64( vdata, 
edata, edata, 640 ); +#endif + break; + case FUGUE: + v128_bswap32_80( edata, pdata ); +#if defined(__AES__) + fugue512_init( &x16r_ctx.fugue ); + fugue512_update( &x16r_ctx.fugue, edata, 76 ); +#else + sph_fugue512_init( &x16r_ctx.fugue ); + sph_fugue512( &x16r_ctx.fugue, edata, 76 ); +#endif + intrlv_2x64( vdata, edata, edata, 640 ); + break; + case SHABAL: + v128_bswap32_80( edata, pdata ); + sph_shabal512_init( &x16r_ctx.shabal ); + sph_shabal512( &x16r_ctx.shabal, edata, 64); + intrlv_2x64( vdata, edata, edata, 640 ); + break; + case WHIRLPOOL: + v128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16r_ctx.whirlpool ); + sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); + intrlv_2x64( vdata, edata, edata, 640 ); + break; + default: + v128_bswap32_intrlv80_2x64( vdata, pdata ); + } +} + +int x16r_2x64_hash_generic( void* output, const void* input, int thrid ) +{ + uint32_t vhash[20*2] __attribute__ ((aligned (64))); + uint32_t hash0[20] __attribute__ ((aligned (32))); + uint32_t hash1[20] __attribute__ ((aligned (32))); + x16r_2x64_context_overlay ctx; + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + void *in0 = (void*) hash0; + void *in1 = (void*) hash1; + int size = 80; + + dintrlv_2x64( hash0, hash1, input, 640 ); + + for ( int i = 0; i < 16; i++ ) + { + const char elem = x16r_hash_order[i]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case BLAKE: + if ( i == 0 ) + blake512_2x64_full( &ctx.blake, vhash, input, size ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + blake512_2x64_full( &ctx.blake, vhash, vhash, size ); + } + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case BMW: + bmw512_2x64_init( &ctx.bmw ); + if ( i == 0 ) + bmw512_2x64_update( &ctx.bmw, input, size ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + bmw512_2x64_update( &ctx.bmw, vhash, size ); + } + bmw512_2x64_close( &ctx.bmw, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case GROESTL: +#if defined(__AES__) // || defined(__ARM_FEATURE_AES) + groestl512_full( &ctx.groestl, hash0, in0, size<<3 ); + groestl512_full( &ctx.groestl, hash1, in1, size<<3 ); +#else + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, in0, size ); + sph_groestl512_close( &ctx.groestl, hash0 ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, in1, size ); + sph_groestl512_close( &ctx.groestl, hash1 ); +#endif + break; + case JH: + if ( i == 0 ) + jh512_2x64_update( &ctx.jh, input + (64*2), 16 ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + jh512_2x64_init( &ctx.jh ); + jh512_2x64_update( &ctx.jh, vhash, size ); + } + jh512_2x64_close( &ctx.jh, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case KECCAK: + if ( i == 0 ) + keccak512_2x64_update( &ctx.keccak, input + (72*2), 8 ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + keccak512_2x64_init( &ctx.keccak ); + keccak512_2x64_update( &ctx.keccak, vhash, size ); + } + keccak512_2x64_close( &ctx.keccak, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case SKEIN: + if ( i == 0 ) + skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + skein512_2x64_full( &ctx.skein, vhash, vhash, size ); + } + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case LUFFA: + if 
( i == 0 ) + { + update_and_final_luffa( &ctx.luffa, hash0, in0 + 64, 16 ); + update_and_final_luffa( &ctx.luffa, hash1, in1 + 64, 16 ); + } + else + { + luffa_full( &ctx.luffa, hash0, 512, hash0, size ); + luffa_full( &ctx.luffa, hash1, 512, hash1, size ); + } + break; + case CUBEHASH: + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 ); + cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 ); + } + else + { + cubehash_full( &ctx.cube, hash0, 512, hash0, size ); + cubehash_full( &ctx.cube, hash1, 512, hash1, size ); + } + break; + case SHAVITE: + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + break; + case SIMD: + simd512_ctx( &ctx.simd, hash0, in0, size ); + simd512_ctx( &ctx.simd, hash1, in1, size ); + break; + case ECHO: +#if defined(__AES__) + echo_full( &ctx.echo, hash0, 512, in0, size ); + echo_full( &ctx.echo, hash1, 512, in1, size ); +#else + sph_echo512_init( &ctx.echo ); + sph_echo512( &ctx.echo, in0, size ); + sph_echo512_close( &ctx.echo, hash0 ); + sph_echo512_init( &ctx.echo ); + sph_echo512( &ctx.echo, in1, size ); + sph_echo512_close( &ctx.echo, hash1 ); +#endif + break; + case HAMSI: +#if defined(__SSE4_2__) || defined(__ARM_NEON) + if ( i == 0 ) + hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 ); + else + { + intrlv_2x64( vhash, hash0, hash1, size<<3 ); + hamsi512_2x64_init( &ctx.hamsi ); + hamsi512_2x64_update( &ctx.hamsi, vhash, size ); + } + hamsi512_2x64_close( &ctx.hamsi, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); +#else + if ( i == 0 ) + { + sph_hamsi512( &ctx.hamsi, in0 + 72, 8 ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_hamsi512( &ctx.hamsi, in1 + 72, 8 ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + } + else + { + sph_hamsi512_init( &ctx.hamsi ); + sph_hamsi512( &ctx.hamsi, hash0, size ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + sph_hamsi512_init( &ctx.hamsi ); + sph_hamsi512( 
&ctx.hamsi, hash1, size ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + } +#endif + break; + case FUGUE: +#if defined(__AES__) + if ( i == 0 ) + { + fugue512_update( &ctx.fugue, in0 + 76, 4 ); + fugue512_final( &ctx.fugue, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in1 + 76, 4 ); + fugue512_final( &ctx.fugue, hash1 ); + } + else + { + fugue512_full( &ctx.fugue, hash0, hash0, size ); + fugue512_full( &ctx.fugue, hash1, hash1, size ); + } +#else + if ( i == 0 ) + { + sph_fugue512( &ctx.fugue, in0 + 76, 4 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, in1 + 76, 4 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + } + else + { + sph_fugue512_full( &ctx.fugue, hash0, hash0, size ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, size ); + } +#endif + break; + case SHABAL: + if ( i == 0 ) + { + sph_shabal512( &ctx.shabal, in0 + 64, 16 ); + sph_shabal512_close( &ctx.shabal, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_shabal512( &ctx.shabal, in1 + 64, 16 ); + sph_shabal512_close( &ctx.shabal, hash1 ); + } + else + { + sph_shabal512_init( &ctx.shabal ); + sph_shabal512( &ctx.shabal, hash0, size ); + sph_shabal512_close( &ctx.shabal, hash0 ); + sph_shabal512_init( &ctx.shabal ); + sph_shabal512( &ctx.shabal, hash1, size ); + sph_shabal512_close( &ctx.shabal, hash1 ); + } + break; + case WHIRLPOOL: + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + } + else + { + sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, size ); + } + break; + case SHA_512: + sha512_2x64_init( &ctx.sha512 ); + if ( i == 0 ) + sha512_2x64_update( 
&ctx.sha512, input, size ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + sha512_2x64_init( &ctx.sha512 ); + sha512_2x64_update( &ctx.sha512, vhash, size ); + } + sha512_2x64_close( &ctx.sha512, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + } + + if ( work_restart[thrid].restart ) return 0; + + size = 64; + } + memcpy( output, hash0, 64 ); + memcpy( output+64, hash1, 64 ); + + return 1; +} + +int x16r_2x64_hash( void* output, const void* input, int thrid ) +{ + uint8_t hash[64*2] __attribute__ ((aligned (64))); + if ( !x16r_2x64_hash_generic( hash, input, thrid ) ) + return 0; + + memcpy( output, hash, 32 ); + memcpy( output+32, hash+64, 32 ); + + return 1; +} + +int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[16*2] __attribute__ ((aligned (64))); + uint32_t vdata[20*2] __attribute__ ((aligned (64))); + uint32_t bedata1[2]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + v128_t *noncev = (v128_t*)vdata + 9; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if ( bench ) ptarget[7] = 0x0cff; + + bedata1[0] = bswap_32( pdata[1] ); + bedata1[1] = bswap_32( pdata[2] ); + + static __thread uint32_t s_ntime = UINT32_MAX; + const uint32_t ntime = bswap_32( pdata[17] ); + if ( s_ntime != ntime ) + { + x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); + s_ntime = ntime; + if ( opt_debug && !thr_id ) + applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime ); + } + + x16r_2x64_prehash( vdata, pdata ); + *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); + do + { + if ( x16r_2x64_hash( hash, vdata, thr_id ) ); + for ( int i = 0; i < 2; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && 
!bench ) ) + { + pdata[19] = bswap_32( n+i ); + submit_solution( work, hash+(i<<3), mythr ); + } + *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) ); + n += 2; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + #endif diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 19ba317..25dbec1 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -13,10 +13,13 @@ __thread x16r_8way_context_overlay x16r_ctx; __thread x16r_4way_context_overlay x16r_ctx; +#elif defined (X16R_2WAY) + +__thread x16r_2x64_context_overlay x16r_ctx; + #endif -__thread x16r_context_overlay x16_ctx; - +__thread x16r_context_overlay x16r_ref_ctx; void x16r_getAlgoString( const uint8_t* prevblock, char *output ) { @@ -58,11 +61,15 @@ bool register_x16r_algo( algo_gate_t* gate ) #elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16r_4way; gate->hash = (void*)&x16r_4way_hash; +#elif defined (X16R_2WAY) + gate->scanhash = (void*)&scanhash_x16r_2x64; + gate->hash = (void*)&x16r_2x64_hash; #else gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT + | NEON_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -76,11 +83,15 @@ bool register_x16rv2_algo( algo_gate_t* gate ) #elif defined (X16RV2_4WAY) gate->scanhash = (void*)&scanhash_x16rv2_4way; gate->hash = (void*)&x16rv2_4way_hash; +#elif defined (X16RV2_2WAY) + gate->scanhash = (void*)&scanhash_x16rv2_2x64; + gate->hash = (void*)&x16rv2_2x64_hash; #else gate->scanhash = (void*)&scanhash_x16rv2; gate->hash = (void*)&x16rv2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT + | NEON_OPT; 
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -94,11 +105,15 @@ bool register_x16s_algo( algo_gate_t* gate ) #elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16r_4way; gate->hash = (void*)&x16r_4way_hash; +#elif defined (X16R_2WAY) + gate->scanhash = (void*)&scanhash_x16r_2x64; + gate->hash = (void*)&x16r_2x64_hash; #else gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT + | NEON_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; @@ -108,7 +123,6 @@ bool register_x16s_algo( algo_gate_t* gate ) // // X16RT - void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash ) { int32_t maskedTime = timeStamp & 0xffffff80; @@ -221,34 +235,42 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) bool register_x16rt_algo( algo_gate_t* gate ) { -#if defined (X16R_8WAY) +#if defined (X16RT_8WAY) gate->scanhash = (void*)&scanhash_x16rt_8way; gate->hash = (void*)&x16r_8way_hash; -#elif defined (X16R_4WAY) +#elif defined (X16RT_4WAY) gate->scanhash = (void*)&scanhash_x16rt_4way; gate->hash = (void*)&x16r_4way_hash; +#elif defined (X16RT_2WAY) + gate->scanhash = (void*)&scanhash_x16rt_2x64; + gate->hash = (void*)&x16r_2x64_hash; #else gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT + | NEON_OPT; opt_target_factor = 256.0; return true; }; bool register_x16rt_veil_algo( algo_gate_t* gate ) { -#if defined (X16R_8WAY) +#if defined (X16RT_8WAY) gate->scanhash = (void*)&scanhash_x16rt_8way; gate->hash = (void*)&x16r_8way_hash; -#elif defined (X16R_4WAY) +#elif defined (X16RT_4WAY) 
gate->scanhash = (void*)&scanhash_x16rt_4way; gate->hash = (void*)&x16r_4way_hash; +#elif defined (X16RT_2WAY) + gate->scanhash = (void*)&scanhash_x16rt_2x64; + gate->hash = (void*)&x16r_2x64_hash; #else gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT + | NEON_OPT; gate->build_extraheader = (void*)&veil_build_extraheader; opt_target_factor = 256.0; return true; @@ -262,7 +284,7 @@ bool register_hex_algo( algo_gate_t* gate ) { gate->scanhash = (void*)&scanhash_hex; gate->hash = (void*)&x16r_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; gate->gen_merkle_root = (void*)&sha256_gen_merkle_root; opt_target_factor = 128.0; return true; @@ -274,20 +296,25 @@ bool register_hex_algo( algo_gate_t* gate ) bool register_x21s_algo( algo_gate_t* gate ) { -#if defined (X16R_8WAY) +#if defined (X21S_8WAY) gate->scanhash = (void*)&scanhash_x21s_8way; gate->hash = (void*)&x21s_8way_hash; gate->miner_thread_init = (void*)&x21s_8way_thread_init; -#elif defined (X16R_4WAY) +#elif defined (X21S_4WAY) gate->scanhash = (void*)&scanhash_x21s_4way; gate->hash = (void*)&x21s_4way_hash; gate->miner_thread_init = (void*)&x21s_4way_thread_init; +#elif defined (X21S_2WAY) + gate->scanhash = (void*)&scanhash_x21s_2x64; + gate->hash = (void*)&x21s_2x64_hash; + gate->miner_thread_init = (void*)&x21s_2x64_thread_init; #else gate->scanhash = (void*)&scanhash_x21s; gate->hash = (void*)&x21s_hash; gate->miner_thread_init = (void*)&x21s_thread_init; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT + | NEON_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; 
diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 4bdfd65..c06ce75 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -7,13 +7,15 @@ #include #include "algo/blake/blake512-hash.h" #include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" #include "algo/jh/sph_jh.h" +#include "algo/groestl/sph_groestl.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/simd/sph_simd.h" +#include "algo/simd/nist.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" @@ -21,13 +23,13 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha512-hash.h" -#if defined(__AES__) +#if defined(__AES__) || defined(__ARM_FEATURE_AES) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/fugue/fugue-aesni.h" #endif -#if defined (__AVX2__) +//#if defined (__AVX2__) #include "algo/bmw/bmw-hash-4way.h" #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/skein/skein-hash-4way.h" @@ -39,7 +41,7 @@ #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h" -#endif +//#endif #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" @@ -48,28 +50,41 @@ #include "algo/echo/echo-hash-4way.h" #endif -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" +// X16R, X16S +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X16R_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X16R_4WAY 1 +#elif defined(__SSE2__) || defined(__ARM_NEON__) + #define X16R_2WAY 1 #endif #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - - #define X16R_8WAY 1 #define X16RV2_8WAY 1 - #define 
X16RT_8WAY 1 - #define X21S_8WAY 1 - #elif defined(__AVX2__) && defined(__AES__) - #define X16RV2_4WAY 1 - #define X16RT_4WAY 1 - #define X21S_4WAY 1 - #define X16R_4WAY 1 - +#elif defined(__SSE2__) || defined(__ARM_NEON__) + #define X16RV2_2WAY 1 #endif +// X16RT, VEIL +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X16RT_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X16RT_4WAY 1 +#elif defined(__SSE2__) || defined(__ARM_NEON__) + #define X16RT_2WAY 1 +#endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X21S_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X21S_4WAY 1 +#elif defined(__SSE2__) || defined(__ARM_NEON__) + #define X21S_2WAY 1 +#endif + + enum x16r_Algo { BLAKE = 0, BMW, @@ -167,7 +182,6 @@ union _x16r_4way_context_overlay keccak512_4way_context keccak; luffa_2way_context luffa; cube_2way_context cube; - hashState_luffa luffa1; simd_2way_context simd; hamsi512_4way_context hamsi; hashState_fugue fugue; @@ -187,34 +201,87 @@ int scanhash_x16r_4way( struct work *, uint32_t, uint64_t *, struct thr_info * ); extern __thread x16r_4way_context_overlay x16r_ctx; +#elif defined(X16R_2WAY) + +union _x16r_2x64_context_overlay +{ + blake512_2x64_context blake; + bmw512_2x64_context bmw; +#if defined(__AES__) // || defined(__ARM_FEATURE_AES) + hashState_groestl groestl; +#else + sph_groestl512_context groestl; +#endif + skein512_2x64_context skein; + jh512_2x64_context jh; + keccak512_2x64_context keccak; + hashState_luffa luffa; + cubehashParam cube; + shavite512_context shavite; + simd512_context simd; +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + hashState_echo echo; +#else + sph_echo512_context echo; +#endif +#if defined(__SSE4_2__) || defined(__ARM_NEON) + hamsi_2x64_context hamsi; +#else + sph_hamsi512_context hamsi; +#endif +#if defined(__AES__) + hashState_fugue fugue; +#else + 
sph_fugue512_context fugue; +#endif + sph_shabal512_context shabal; + sph_whirlpool_context whirlpool; + sha512_2x64_context sha512; +} __attribute__ ((aligned (64))); + +typedef union _x16r_2x64_context_overlay x16r_2x64_context_overlay; + +void x16r_2x64_prehash( void *, void * ); +int x16r_2x64_hash_generic( void *, const void *, int ); +int x16r_2x64_hash( void *, const void *, int ); +int scanhash_x16r_2x64( struct work *, uint32_t, + uint64_t *, struct thr_info * ); +extern __thread x16r_2x64_context_overlay x16r_ctx; + #endif // needed for hex union _x16r_context_overlay { -#if defined(__AES__) - hashState_echo echo; - hashState_groestl groestl; - hashState_fugue fugue; -#else - sph_groestl512_context groestl; - sph_echo512_context echo; - sph_fugue512_context fugue; -#endif blake512_context blake; sph_bmw512_context bmw; +#if defined(__AES__) // || defined(__ARM_FEATURE_AES) + hashState_groestl groestl; +#else + sph_groestl512_context groestl; +#endif sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; hashState_luffa luffa; cubehashParam cube; shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; + simd512_context simd; +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + hashState_echo echo; #else - hashState_sd simd; + sph_echo512_context echo; #endif +#if defined(__SSE4_2__) || defined(__ARM_NEON) + hamsi_2x64_context hamsi; +#else sph_hamsi512_context hamsi; +#endif +#if defined(__AES__) + hashState_fugue fugue; +#else + sph_fugue512_context fugue; +#endif sph_shabal512_context shabal; sph_whirlpool_context whirlpool; sph_sha512_context sha512; @@ -222,7 +289,7 @@ union _x16r_context_overlay typedef union _x16r_context_overlay x16r_context_overlay; -extern __thread x16r_context_overlay x16_ctx; +extern __thread x16r_context_overlay x16r_ref_ctx; void x16r_prehash( void *, void * ); int x16r_hash_generic( void *, const void *, int ); @@ -242,6 +309,12 @@ int x16rv2_4way_hash( void *state, const 
void *input, int thrid ); int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +#elif defined(X16RV2_2WAY) + +int x16rv2_2x64_hash( void *state, const void *input, int thrid ); +int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + #else int x16rv2_hash( void *state, const void *input, int thr_id ); @@ -251,18 +324,24 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce, #endif // x16rt, veil -#if defined(X16R_8WAY) +#if defined(X16RT_8WAY) //void x16rt_8way_hash( void *state, const void *input ); int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -#elif defined(X16R_4WAY) +#elif defined(X16RT_4WAY) //void x16rt_4way_hash( void *state, const void *input ); int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +#elif defined(X16RT_2WAY) + +//void x16rt_4way_hash( void *state, const void *input ); +int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + #else //void x16rt_hash( void *state, const void *input ); @@ -272,20 +351,27 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce, #endif // x21s -#if defined(X16R_8WAY) +#if defined(X21S_8WAY) int x21s_8way_hash( void *state, const void *input, int thrid ); int scanhash_x21s_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); bool x21s_8way_thread_init(); -#elif defined(X16R_4WAY) +#elif defined(X21S_4WAY) int x21s_4way_hash( void *state, const void *input, int thrid ); int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); bool x21s_4way_thread_init(); +#elif defined(X21S_2WAY) + +int x21s_2x64_hash( void *state, const void *input, int thrid ); +int scanhash_x21s_2x64( struct work *work, uint32_t 
max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +bool x21s_2x64_thread_init(); + #else int x21s_hash( void *state, const void *input, int thr_id ); diff --git a/algo/x16/x16r.c b/algo/x16/x16r.c index d18b930..2776d81 100644 --- a/algo/x16/x16r.c +++ b/algo/x16/x16r.c @@ -18,32 +18,36 @@ void x16r_prehash( void *edata, void *pdata ) switch ( algo ) { case JH: - sph_jh512_init( &x16_ctx.jh ); - sph_jh512( &x16_ctx.jh, edata, 64 ); + sph_jh512_init( &x16r_ref_ctx.jh ); + sph_jh512( &x16r_ref_ctx.jh, edata, 64 ); break; case SKEIN: - sph_skein512_init( &x16_ctx.skein ); - sph_skein512( &x16_ctx.skein, edata, 64 ); + sph_skein512_init( &x16r_ref_ctx.skein ); + sph_skein512( &x16r_ref_ctx.skein, edata, 64 ); + break; + case KECCAK: + sph_keccak512_init( &x16r_ref_ctx.keccak ); + sph_keccak512( &x16r_ref_ctx.keccak, edata, 72 ); break; case LUFFA: - init_luffa( &x16_ctx.luffa, 512 ); - update_luffa( &x16_ctx.luffa, edata, 64 ); + init_luffa( &x16r_ref_ctx.luffa, 512 ); + update_luffa( &x16r_ref_ctx.luffa, edata, 64 ); break; case CUBEHASH: - cubehashInit( &x16_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16_ctx.cube, edata, 64 ); + cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 ); break; case HAMSI: - sph_hamsi512_init( &x16_ctx.hamsi ); - sph_hamsi512( &x16_ctx.hamsi, edata, 64 ); - break; + sph_hamsi512_init( &x16r_ref_ctx.hamsi ); + sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 72 ); + break; case SHABAL: - sph_shabal512_init( &x16_ctx.shabal ); - sph_shabal512( &x16_ctx.shabal, edata, 64 ); + sph_shabal512_init( &x16r_ref_ctx.shabal ); + sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 ); break; case WHIRLPOOL: - sph_whirlpool_init( &x16_ctx.whirlpool ); - sph_whirlpool( &x16_ctx.whirlpool, edata, 64 ); + sph_whirlpool_init( &x16r_ref_ctx.whirlpool ); + sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 ); break; } } @@ -52,7 +56,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) { 
uint32_t _ALIGN(128) hash[16]; x16r_context_overlay ctx; - memcpy( &ctx, &x16_ctx, sizeof(ctx) ); + memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) ); void *in = (void*) input; int size = 80; @@ -74,12 +78,12 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) sph_bmw512_close(&ctx.bmw, hash); break; case GROESTL: -#if defined(__AES__) - groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 ); +#if defined(__AES__) // || defined(__ARM_FEATURE_AES) + groestl512_full( &ctx.groestl, hash, in, size<<3 ); #else sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, in, size ); - sph_groestl512_close(&ctx.groestl, hash); + sph_groestl512_close( &ctx.groestl, hash ); #endif break; case JH: @@ -93,8 +97,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) sph_jh512_close(&ctx.jh, hash ); break; case KECCAK: - sph_keccak512_init( &ctx.keccak ); - sph_keccak512( &ctx.keccak, in, size ); + if ( i == 0 ) + sph_keccak512( &ctx.keccak, in+72, 8 ); + else + { + sph_keccak512_init( &ctx.keccak ); + sph_keccak512( &ctx.keccak, in, size ); + } sph_keccak512_close( &ctx.keccak, hash ); break; case SKEIN: @@ -109,13 +118,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) break; case LUFFA: if ( i == 0 ) - update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 ); + update_and_final_luffa( &ctx.luffa, hash, in+64, 16 ); else luffa_full( &ctx.luffa, hash, 512, in, size ); break; case CUBEHASH: if ( i == 0 ) - cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 ); + cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 ); else cubehash_full( &ctx.cube, hash, 512, in, size ); break; @@ -123,19 +132,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) shavite512_full( &ctx.shavite, hash, in, size ); break; case SIMD: -#if defined(__aarch64__) sph_simd512_init( &ctx.simd ); - sph_simd512(&ctx.simd, (const void*) hash, 64); + sph_simd512(&ctx.simd, hash, 64); 
sph_simd512_close(&ctx.simd, hash); -#else - simd_full( &ctx.simd, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); -#endif break; case ECHO: #if defined(__AES__) - echo_full( &ctx.echo, (BitSequence*)hash, 512, - (const BitSequence*)in, size ); + echo_full( &ctx.echo, hash, 512, in, size ); #else sph_echo512_init( &ctx.echo ); sph_echo512( &ctx.echo, in, size ); @@ -153,12 +156,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: -#if defined(__AES__) - fugue512_full( &ctx.fugue, hash, in, size ); -#else - sph_fugue512_full( &ctx.fugue, hash, in, size ); -#endif - break; + sph_fugue512_full( &ctx.fugue, hash, in, size ); + break; case SHABAL: if ( i == 0 ) sph_shabal512( &ctx.shabal, in+64, 16 ); diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index fa34987..445bba3 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -3,7 +3,7 @@ #include #include -#if defined (X16R_8WAY) +#if defined (X16RT_8WAY) int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) @@ -57,7 +57,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, return 0; } -#elif defined (X16R_4WAY) +#elif defined (X16RT_4WAY) int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) @@ -110,4 +110,55 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, return 0; } +#elif defined (X16RT_2WAY) + +int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[2*16] __attribute__ ((aligned (64))); + uint32_t vdata[24*2] __attribute__ ((aligned (64))); + uint32_t _ALIGN(64) timeHash[4*8]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = 
mythr->id; + v128_t *noncev = (v128_t*)vdata + 9; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; + + if ( bench ) ptarget[7] = 0x0cff; + + static __thread uint32_t s_ntime = UINT32_MAX; + uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80; + if ( s_ntime != masked_ntime ) + { + x16rt_getTimeHash( masked_ntime, &timeHash ); + x16rt_getAlgoString( &timeHash[0], x16r_hash_order ); + s_ntime = masked_ntime; + if ( !thr_id ) + applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x", + x16r_hash_order, bswap_32( pdata[17] ), timeHash ); + } + + x16r_2x64_prehash( vdata, pdata ); + *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); + do + { + if ( x16r_2x64_hash( hash, vdata, thr_id ) ) + for ( int i = 0; i < 2; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( n+i ); + submit_solution( work, hash+(i<<3), mythr ); + } + *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) ); + n += 2; + } while ( ( n < last_nonce ) && !(*restart) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + #endif diff --git a/algo/x16/x16rt.c b/algo/x16/x16rt.c index 954aca3..3b447c5 100644 --- a/algo/x16/x16rt.c +++ b/algo/x16/x16rt.c @@ -1,6 +1,6 @@ #include "x16r-gate.h" -#if !defined(X16R_8WAY) && !defined(X16R_4WAY) +#if !defined(X16RT_8WAY) && !defined(X16RT_4WAY) int scanhash_x16rt( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index 2f9c112..607c185 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -1151,4 +1151,454 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, return 0; } +#elif defined (X16RV2_2WAY) + +union _x16rv2_2x64_context_overlay +{ + blake512_2x64_context blake; + bmw512_2x64_context bmw; +#if defined(__AES__) // || defined(__ARM_FEATURE_AES) + hashState_groestl groestl; +#else 
+ sph_groestl512_context groestl; +#endif + skein512_2x64_context skein; + jh512_2x64_context jh; + keccak512_2x64_context keccak; + hashState_luffa luffa; + cubehashParam cube; + shavite512_context shavite; + simd512_context simd; +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + hashState_echo echo; +#else + sph_echo512_context echo; +#endif +#if defined(__SSE4_2__) || defined(__ARM_NEON) + hamsi_2x64_context hamsi; +#else + sph_hamsi512_context hamsi; +#endif +#if defined(__AES__) + hashState_fugue fugue; +#else + sph_fugue512_context fugue; +#endif + sph_shabal512_context shabal; + sph_whirlpool_context whirlpool; + sha512_2x64_context sha512; + sph_tiger_context tiger; +} __attribute__ ((aligned (64))); + +typedef union _x16rv2_2x64_context_overlay x16rv2_2x64_context_overlay; + +static __thread x16rv2_2x64_context_overlay x16rv2_ctx; + +// Pad the 24 bytes tiger hash to 64 bytes +static inline void padtiger512( uint32_t* hash ) +{ + for ( int i = 6; i < 16; i++ ) hash[i] = 0; +} + +int x16rv2_2x64_hash( void* output, const void* input, int thrid ) +{ + uint32_t vhash[20*2] __attribute__ ((aligned (64))); + uint32_t hash0[20] __attribute__ ((aligned (32))); + uint32_t hash1[20] __attribute__ ((aligned (32))); + x16rv2_2x64_context_overlay ctx; + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + void *in0 = (void*) hash0; + void *in1 = (void*) hash1; + int size = 80; + + dintrlv_2x64( hash0, hash1, input, 640 ); + + for ( int i = 0; i < 16; i++ ) + { + const char elem = x16r_hash_order[i]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case BLAKE: + if ( i == 0 ) + blake512_2x64_full( &ctx.blake, vhash, input, size ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + blake512_2x64_full( &ctx.blake, vhash, vhash, size ); + } + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case BMW: + bmw512_2x64_init( &ctx.bmw ); + if ( i == 0 ) + bmw512_2x64_update( &ctx.bmw, input, size ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + bmw512_2x64_update( &ctx.bmw, vhash, size ); + } + bmw512_2x64_close( &ctx.bmw, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case GROESTL: +#if defined(__AES__) + groestl512_full( &ctx.groestl, hash0, in0, size<<3 ); + groestl512_full( &ctx.groestl, hash1, in1, size<<3 ); +#else + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, in0, size ); + sph_groestl512_close( &ctx.groestl, hash0 ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, in1, size ); + sph_groestl512_close( &ctx.groestl, hash1 ); +#endif + break; + case JH: + if ( i == 0 ) + jh512_2x64_update( &ctx.jh, input + (64<<2), 16 ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + jh512_2x64_init( &ctx.jh ); + jh512_2x64_update( &ctx.jh, vhash, size ); + } + jh512_2x64_close( &ctx.jh, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case KECCAK: + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + } + for ( int i = (24/4); i < (64/4); i++ ) + hash0[i] = hash1[i] = 0; + + intrlv_2x64( vhash, hash0, hash1, 512 ); + keccak512_2x64_init( &ctx.keccak ); + 
keccak512_2x64_update( &ctx.keccak, vhash, 64 ); + keccak512_2x64_close( &ctx.keccak, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case SKEIN: + if ( i == 0 ) + skein512_2x64_final16( &ctx.skein, vhash, input + (64*4) ); + else + { + intrlv_2x64( vhash, in0, in1, size<<3 ); + skein512_2x64_init( &ctx.skein ); + skein512_2x64_update( &ctx.skein, vhash, size ); + } + skein512_2x64_close( &ctx.skein, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + case LUFFA: + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + } + for ( int i = (24/4); i < (64/4); i++ ) + hash0[i] = hash1[i] = 0; + luffa_full( &ctx.luffa, hash0, 512, hash0, size ); + luffa_full( &ctx.luffa, hash1, 512, hash1, size ); + break; + case CUBEHASH: + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 ); + cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 ); + } + else + { + cubehash_full( &ctx.cube, hash0, 512, hash0, size ); + cubehash_full( &ctx.cube, hash1, 512, hash1, size ); + } + break; + case SHAVITE: + shavite512_full( &ctx.shavite, hash0, in0, size ); + shavite512_full( &ctx.shavite, hash1, in1, size ); + break; + case SIMD: + simd512_ctx( &ctx.simd, hash0, in0, size ); + simd512_ctx( &ctx.simd, hash1, in1, size ); + break; + case ECHO: +#if defined(__AES__) + echo_full( &ctx.echo, hash0, 512, in0, size ); + echo_full( &ctx.echo, hash1, 512, in1, size ); +#else + sph_echo512_init( &ctx.echo ); + sph_echo512( &ctx.echo, in0, size ); + sph_echo512_close( &ctx.echo, hash0 ); + sph_echo512_init( &ctx.echo ); + sph_echo512( &ctx.echo, in1, 
size ); + sph_echo512_close( &ctx.echo, hash1 ); +#endif + break; + case HAMSI: +#if defined(__SSE4_2__) || defined(__ARM_NEON) + if ( i == 0 ) + hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 ); + else + { + intrlv_2x64( vhash, hash0, hash1, size<<3 ); + hamsi512_2x64_init( &ctx.hamsi ); + hamsi512_2x64_update( &ctx.hamsi, vhash, size ); + } + hamsi512_2x64_close( &ctx.hamsi, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); +#else + if ( i == 0 ) + { + sph_hamsi512( &ctx.hamsi, in0 + 72, 8 ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_hamsi512( &ctx.hamsi, in1 + 72, 8 ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + } + else + { + sph_hamsi512_init( &ctx.hamsi ); + sph_hamsi512( &ctx.hamsi, hash0, size ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + sph_hamsi512_init( &ctx.hamsi ); + sph_hamsi512( &ctx.hamsi, hash1, size ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + } +#endif + break; + case FUGUE: +#if defined(__AES__) + if ( i == 0 ) + { + fugue512_update( &ctx.fugue, in0 + 76, 4 ); + fugue512_final( &ctx.fugue, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) ); + fugue512_update( &ctx.fugue, in1 + 76, 4 ); + fugue512_final( &ctx.fugue, hash1 ); + } + else + { + fugue512_full( &ctx.fugue, hash0, hash0, size ); + fugue512_full( &ctx.fugue, hash1, hash1, size ); + } +#else + if ( i == 0 ) + { + sph_fugue512( &ctx.fugue, in0 + 76, 4 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, in1 + 76, 4 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + } + else + { + sph_fugue512_full( &ctx.fugue, hash0, hash0, size ); + sph_fugue512_full( &ctx.fugue, hash1, hash1, size ); + } +#endif + break; + case SHABAL: + if ( i == 0 ) + { + sph_shabal512( &ctx.shabal, in0 + 64, 16 ); + sph_shabal512_close( &ctx.shabal, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_shabal512( &ctx.shabal, in1 + 64, 16 ); + sph_shabal512_close( 
&ctx.shabal, hash1 ); + } + else + { + sph_shabal512_init( &ctx.shabal ); + sph_shabal512( &ctx.shabal, hash0, size ); + sph_shabal512_close( &ctx.shabal, hash0 ); + sph_shabal512_init( &ctx.shabal ); + sph_shabal512( &ctx.shabal, hash1, size ); + sph_shabal512_close( &ctx.shabal, hash1 ); + } + break; + case WHIRLPOOL: + sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size ); + sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size ); + break; + case SHA_512: + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + } + for ( int i = (24/4); i < (64/4); i++ ) + hash0[i] = hash1[i] = 0; + + intrlv_2x64( vhash, hash0, hash1, 512 ); + sha512_2x64_init( &ctx.sha512 ); + sha512_2x64_update( &ctx.sha512, vhash, 64 ); + sha512_2x64_close( &ctx.sha512, vhash ); + dintrlv_2x64( hash0, hash1, vhash, 512 ); + break; + } + + if ( work_restart[thrid].restart ) return 0; + + size = 64; + } + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + return 1; +} + +int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[2*16] __attribute__ ((aligned (64))); + uint32_t vdata[24*2] __attribute__ ((aligned (64))); + uint32_t edata[20]; + uint32_t bedata1[2]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + v128_t *noncev = (v128_t*)vdata + 9; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = 
opt_benchmark; + + if ( bench ) ptarget[7] = 0x0fff; + + bedata1[0] = bswap_32( pdata[1] ); + bedata1[1] = bswap_32( pdata[2] ); + + static __thread uint32_t s_ntime = UINT32_MAX; + const uint32_t ntime = bswap_32(pdata[17]); + if ( s_ntime != ntime ) + { + x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); + s_ntime = ntime; + if ( opt_debug && !thr_id ) + applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime ); + } + + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = x16r_hash_order[0]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + v128_bswap32_intrlv80_2x64( vdata, pdata ); + jh512_2x64_init( &x16rv2_ctx.jh ); + jh512_2x64_update( &x16rv2_ctx.jh, vdata, 64 ); + break; + case KECCAK: + case LUFFA: + case SHA_512: + v128_bswap32_80( edata, pdata ); + sph_tiger_init( &x16rv2_ctx.tiger ); + sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); + intrlv_2x64( vdata, edata, edata, 640 ); + break; + case SKEIN: + v128_bswap32_intrlv80_2x64( vdata, pdata ); + skein512_2x64_prehash64( &x16r_ctx.skein, vdata ); + break; + case CUBEHASH: + v128_bswap32_80( edata, pdata ); + cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rv2_ctx.cube, edata, 64 ); + intrlv_2x64( vdata, edata, edata, 640 ); + break; + case HAMSI: +#if defined(__SSE4_2__) + v128_bswap32_intrlv80_2x64( vdata, pdata ); + hamsi512_2x64_init( &x16r_ctx.hamsi ); + hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 ); +#else + v128_bswap32_80( edata, pdata ); + sph_hamsi512_init( &x16r_ctx.hamsi ); + sph_hamsi512( &x16r_ctx.hamsi, edata, 72 ); + intrlv_2x64( vdata, edata, edata, 640 ); +#endif + break; + case FUGUE: + v128_bswap32_80( edata, pdata ); +#if defined(__AES__) + fugue512_init( &x16r_ctx.fugue ); + fugue512_update( &x16r_ctx.fugue, edata, 76 ); +#else + sph_fugue512_init( &x16r_ctx.fugue ); + sph_fugue512( &x16r_ctx.fugue, edata, 76 ); +#endif + intrlv_2x64( vdata, edata, 
edata, 640 ); + break; + case SHABAL: + v128_bswap32_80( edata, pdata ); + sph_shabal512_init( &x16r_ctx.shabal ); + sph_shabal512( &x16r_ctx.shabal, edata, 64); + intrlv_2x64( vdata, edata, edata, 640 ); + break; + default: + v128_bswap32_intrlv80_2x64( vdata, pdata ); + } + + *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); + + do + { + if ( x16rv2_2x64_hash( hash, vdata, thr_id ) ) + for ( int i = 0; i < 2; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( n+i ); + submit_solution( work, hash+(i<<3), mythr ); + } + *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) ); + n += 2; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + + + + #endif diff --git a/algo/x16/x16rv2.c b/algo/x16/x16rv2.c index 8cf026b..b7e51b1 100644 --- a/algo/x16/x16rv2.c +++ b/algo/x16/x16rv2.c @@ -6,21 +6,15 @@ */ #include "x16r-gate.h" -#if !defined(X16R_8WAY) && !defined(X16R_4WAY) +#if !defined(X16RV2_8WAY) && !defined(X16RV2_4WAY) && !defined(X16RV2_2WAY) #include "algo/tiger/sph_tiger.h" union _x16rv2_context_overlay { -#if defined(__AES__) - hashState_echo echo; - hashState_groestl groestl; - hashState_fugue fugue; -#else sph_groestl512_context groestl; sph_echo512_context echo; sph_fugue512_context fugue; -#endif blake512_context blake; sph_bmw512_context bmw; sph_skein512_context skein; @@ -29,11 +23,7 @@ union _x16rv2_context_overlay hashState_luffa luffa; cubehashParam cube; shavite512_context shavite; -#if defined(__aarch64__) sph_simd512_context simd; -#else - hashState_sd simd; -#endif sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; @@ -72,15 +62,9 @@ int x16rv2_hash( void* output, const void* input, int thrid ) sph_bmw512_close(&ctx.bmw, hash); break; case GROESTL: -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, 
- (const char*)in, size<<3 ); -#else sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, in, size ); sph_groestl512_close(&ctx.groestl, hash); -#endif break; case SKEIN: sph_skein512_init( &ctx.skein ); @@ -117,25 +101,14 @@ int x16rv2_hash( void* output, const void* input, int thrid ) shavite512_full( &ctx.shavite, hash, in, size ); break; case SIMD: -#if defined(__aarch64__) sph_simd512_init( &ctx.simd ); - sph_simd512(&ctx.simd, (const void*) hash, 64); + sph_simd512(&ctx.simd, hash, 64); sph_simd512_close(&ctx.simd, hash); -#else - simd_full( &ctx.simd, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); -#endif break; case ECHO: -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); -#else sph_echo512_init( &ctx.echo ); sph_echo512( &ctx.echo, in, size ); sph_echo512_close( &ctx.echo, hash ); -#endif break; case HAMSI: sph_hamsi512_init( &ctx.hamsi ); @@ -143,11 +116,7 @@ int x16rv2_hash( void* output, const void* input, int thrid ) sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: -#if defined(__AES__) - fugue512_full( &ctx.fugue, hash, in, size ); -#else sph_fugue512_full( &ctx.fugue, hash, in, size ); -#endif break; case SHABAL: sph_shabal512_init( &ctx.shabal ); diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index c858a70..45b6acc 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -9,6 +9,7 @@ #include #include #include "algo/haval/haval-hash-4way.h" +#include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" #include "algo/lyra2/lyra2.h" @@ -351,4 +352,119 @@ bool x21s_4way_thread_init() return x21s_4way_matrix; } +#elif defined (X21S_2WAY) + +static __thread uint64_t* x21s_2x64_matrix; + +union _x21s_2x64_context_overlay +{ + sph_haval256_5_context haval; + sph_tiger_context tiger; + sph_gost512_context gost; +} __attribute__ ((aligned (64))); + +typedef union 
_x21s_2x64_context_overlay x21s_2x64_context_overlay; + +int x21s_2x64_hash( void* output, const void* input, int thrid ) +{ + uint8_t shash[64*2] __attribute__ ((aligned (64))); + x21s_2x64_context_overlay ctx; + uint32_t *hash0 = (uint32_t*) shash; + uint32_t *hash1 = (uint32_t*)( shash+64 ); + + if ( !x16r_2x64_hash_generic( shash, input, thrid ) ) + return 0; + + sph_haval256_5_init( &ctx.haval ); + sph_haval256_5( &ctx.haval, hash0, 64 ); + sph_haval256_5_close( &ctx.haval, hash0 ); + sph_haval256_5_init( &ctx.haval ); + sph_haval256_5( &ctx.haval, hash1, 64 ); + sph_haval256_5_close( &ctx.haval, hash1 ); + + sph_tiger_init( &ctx.tiger ); + sph_tiger ( &ctx.tiger, (const void*) hash0, 64 ); + sph_tiger_close( &ctx.tiger, (void*) hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger ( &ctx.tiger, (const void*) hash1, 64 ); + sph_tiger_close( &ctx.tiger, (void*) hash1 ); + + LYRA2REV2( x21s_2x64_matrix, (void*) hash0, 32, (const void*) hash0, 32, + (const void*) hash0, 32, 1, 4, 4 ); + LYRA2REV2( x21s_2x64_matrix, (void*) hash1, 32, (const void*) hash1, 32, + (const void*) hash1, 32, 1, 4, 4 ); + + sph_gost512_init( &ctx.gost ); + sph_gost512 ( &ctx.gost, (const void*) hash0, 64 ); + sph_gost512_close( &ctx.gost, (void*) hash0 ); + sph_gost512_init( &ctx.gost ); + sph_gost512 ( &ctx.gost, (const void*) hash1, 64 ); + sph_gost512_close( &ctx.gost, (void*) hash1 ); + + sha256_full( output, hash0, 64 ); + sha256_full( output+32, hash1, 64 ); + + return 1; +} + +int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[16*2] __attribute__ ((aligned (64))); + uint32_t vdata[20*2] __attribute__ ((aligned (64))); + uint32_t bedata1[2] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = 
opt_benchmark; + v128_t *noncev = (v128_t*)vdata + 9; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if ( bench ) ptarget[7] = 0x0cff; + + bedata1[0] = bswap_32( pdata[1] ); + bedata1[1] = bswap_32( pdata[2] ); + + static __thread uint32_t s_ntime = UINT32_MAX; + uint32_t ntime = bswap_32( pdata[17] ); + if ( s_ntime != ntime ) + { + x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order ); + s_ntime = ntime; + if ( opt_debug && !thr_id ) + applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime ); + } + + x16r_2x64_prehash( vdata, pdata ); + *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); + do + { + if ( x21s_2x64_hash( hash, vdata, thr_id ) ) + for ( int i = 0; i < 2; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( n+i ); + submit_solution( work, hash+(i<<3), mythr ); + } + *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) ); + n += 2; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +bool x21s_2x64_thread_init() +{ + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + + const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; + x21s_2x64_matrix = mm_malloc( size, 64 ); + return x21s_2x64_matrix; +} + #endif diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index 2f7fc42..526c3d4 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -15,7 +15,7 @@ #include "algo/gost/sph_gost.h" #include "algo/lyra2/lyra2.h" -#if !defined(X16R_8WAY) && !defined(X16R_4WAY) +#if !defined(X21S_8WAY) && !defined(X21S_4WAY) static __thread uint64_t* x21s_matrix; diff --git a/algo/x22/x22i.c b/algo/x22/x22i.c index 4140e31..d804ef2 100644 --- a/algo/x22/x22i.c +++ b/algo/x22/x22i.c @@ -5,24 +5,23 @@ #include "algo/blake/blake512-hash.h" #include "algo/bmw/sph_bmw.h" #if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" 
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/fugue/fugue-aesni.h" #else #include "algo/groestl/sph_groestl.h" - #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + #include "algo/echo/aes_ni/hash_api.h" +#else + #include "algo/echo/sph_echo.h" +#endif #include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" @@ -41,12 +40,15 @@ union _x22i_context_overlay sph_bmw512_context bmw; #if defined(__AES__) hashState_groestl groestl; - hashState_echo echo; hashState_fugue fugue; #else sph_groestl512_context groestl; - sph_echo512_context echo; sph_fugue512_context fugue; +#endif +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + hashState_echo echo; +#else + sph_echo512_context echo; #endif sph_jh512_context jh; sph_keccak512_context keccak; @@ -54,11 +56,7 @@ union _x22i_context_overlay hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; @@ -84,9 +82,7 @@ int x22i_hash( void *output, const void *input, int thrid ) sph_bmw512_close(&ctx.bmw, hash); #if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + groestl512_full( &ctx.groestl, hash, hash, 512 ); #else sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, hash, 64 ); @@ -109,26 +105,16 @@ int x22i_hash( void *output, const void *input, 
int thrid ) luffa_full( &ctx.luffa, hash, 512, hash, 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, hash, hash, 64 ); - + cubehash_full( &ctx.cube, hash, 512, hash, 64 ); + sph_shavite512_init(&ctx.shavite); sph_shavite512(&ctx.shavite, (const void*) hash, 64); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512_init(&ctx.simd ); - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - simd_full( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence*)hash, - (const BitSequence*)hash, 512 ); +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + echo_full( &ctx.echo, hash, 512, hash, 64 ); #else sph_echo512_init( &ctx.echo ); sph_echo512( &ctx.echo, hash, 64 ); @@ -192,8 +178,8 @@ int x22i_hash( void *output, const void *input, int thrid ) int scanhash_x22i( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t edata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); + uint32_t edata[20] __attribute__((aligned(32))); + uint32_t hash64[8] __attribute__((aligned(32))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[19]; diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index 4defb6e..99827b5 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -5,24 +5,23 @@ #include "algo/blake/blake512-hash.h" #include "algo/bmw/sph_bmw.h" #if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/fugue/fugue-aesni.h" #else #include "algo/groestl/sph_groestl.h" - #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + #include "algo/echo/aes_ni/hash_api.h" +#else + #include 
"algo/echo/sph_echo.h" +#endif #include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" @@ -44,12 +43,15 @@ union _x25x_context_overlay sph_bmw512_context bmw; #if defined(__AES__) hashState_groestl groestl; - hashState_echo echo; hashState_fugue fugue; #else sph_groestl512_context groestl; - sph_echo512_context echo; sph_fugue512_context fugue; +#endif +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + hashState_echo echo; +#else + sph_echo512_context echo; #endif sph_jh512_context jh; sph_keccak512_context keccak; @@ -57,11 +59,7 @@ union _x25x_context_overlay hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; @@ -89,9 +87,7 @@ int x25x_hash( void *output, const void *input, int thrid ) sph_bmw512_close(&ctx.bmw, &hash[1]); #if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)&hash[2], - (const char*)&hash[1], 512 ); + groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 ); #else sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, &hash[1], 64 ); @@ -112,28 +108,18 @@ int x25x_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; - init_luffa( &ctx.luffa, 512 ); - luffa_full( &ctx.luffa, &hash[6], 512, &hash[5], 64 ); + luffa_full( &ctx.luffa, (void*)&hash[6], 512, (const void*)&hash[5], 64 ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - 
cubehashUpdateDigest( &ctx.cube, &hash[7], &hash[6], 64 ); + cubehash_full( &ctx.cube, (void*)&hash[7], 512, (const void*)&hash[6], 64 ); sph_shavite512_init(&ctx.shavite); sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64); sph_shavite512_close(&ctx.shavite, &hash[8]); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) &hash[8], 64); - sph_simd512_close(&ctx.simd, &hash[9] ); -#else - update_final_sd( &ctx.simd, (BitSequence *)&hash[9], - (const BitSequence *)&hash[8], 512 ); -#endif + simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 ); -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence*)&hash[10], - (const BitSequence*)&hash[9], 512 ); +#if defined(__AES__) || defined(__ARM_FEATURE_AES) + echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 ); #else sph_echo512_init( &ctx.echo ); sph_echo512( &ctx.echo, &hash[9], 64 ); @@ -227,8 +213,8 @@ int x25x_hash( void *output, const void *input, int thrid ) int scanhash_x25x( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t edata[20] __attribute__((aligned(64))); - uint32_t hash64[8] __attribute__((aligned(64))); + uint32_t edata[20] __attribute__((aligned(32))); + uint32_t hash64[8] __attribute__((aligned(32))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[19]; @@ -245,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce, do { edata[19] = n; - if ( x25x_hash( hash64, edata, thr_id ) ) + if ( x25x_hash( hash64, edata, thr_id ) ) if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) ) { pdata[19] = bswap_32( n ); diff --git a/configure b/configure index 1dc92ef..60e7ff2 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.10. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.11. 
# # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.10' -PACKAGE_STRING='cpuminer-opt 23.10' +PACKAGE_VERSION='23.11' +PACKAGE_STRING='cpuminer-opt 23.11' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.10 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 23.11 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.10:";; + short | recursive ) echo "Configuration of cpuminer-opt 23.11:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.10 +cpuminer-opt configure 23.11 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.10, which was +It was created by cpuminer-opt $as_me 23.11, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.10' + VERSION='23.11' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. 
ac_log=" -This file was extended by cpuminer-opt $as_me 23.10, which was +This file was extended by cpuminer-opt $as_me 23.11, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 23.10 +cpuminer-opt config.status 23.11 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 63be358..48b3cd9 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [23.10]) +AC_INIT([cpuminer-opt], [23.11]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index 94a7819..1dc92ef 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.9. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.10. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.9' -PACKAGE_STRING='cpuminer-opt 23.9' +PACKAGE_VERSION='23.10' +PACKAGE_STRING='cpuminer-opt 23.10' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.9 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 23.10 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.9:";; + short | recursive ) echo "Configuration of cpuminer-opt 23.10:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.9 +cpuminer-opt configure 23.10 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.9, which was +It was created by cpuminer-opt $as_me 23.10, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.9' + VERSION='23.10' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 23.9, which was +This file was extended by cpuminer-opt $as_me 23.10, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 23.9 +cpuminer-opt config.status 23.10 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\"