From a90d75b8f529bea860113861c56314353d108795 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Tue, 16 Jan 2018 15:11:44 -0500 Subject: [PATCH] v3.7.10 --- Makefile.am | 19 +- RELEASE_NOTES | 7 + algo/blake/blake-4way.c | 91 ++- algo/blake/blake-hash-4way.c | 205 +++---- algo/blake/blake-hash-4way.h | 44 +- algo/blake/blakecoin-4way.c | 106 ++++ algo/blake/blakecoin-gate.c | 71 +++ algo/blake/blakecoin-gate.h | 21 + algo/blake/blakecoin.c | 6 +- algo/blake/decred-4way.c | 82 +-- algo/blake/pentablake-4way.c | 5 +- algo/blake/pentablake-gate.h | 2 +- algo/bmw/bmw-hash-4way.c | 791 ++++++++++++++++----------- algo/bmw/bmw-hash-4way.h | 73 +-- algo/jh/jha-4way.c | 136 ++--- algo/lyra2/lyra2h-4way.c | 128 +++++ algo/lyra2/lyra2h-gate.c | 25 + algo/lyra2/lyra2h-gate.h | 32 ++ algo/lyra2/lyra2h.c | 22 +- algo/lyra2/lyra2re.c | 1 + algo/lyra2/lyra2rev2-4way.c | 177 ++++++ algo/lyra2/lyra2rev2-gate.c | 38 ++ algo/lyra2/lyra2rev2-gate.h | 35 ++ algo/lyra2/lyra2rev2.c | 40 +- algo/lyra2/lyra2z-4way.c | 2 +- algo/lyra2/lyra2z.c | 38 -- algo/m7m.c | 1 + algo/nist5/nist5.c | 1 + algo/nist5/zr5.c | 1 + algo/quark/quark-4way.c | 207 +++++++ algo/quark/quark-gate.c | 17 + algo/quark/quark-gate.h | 32 ++ algo/quark/quark.c | 16 +- algo/qubit/deep.c | 1 + algo/qubit/qubit.c | 1 + algo/scrypt.c | 1 + algo/scryptjane/scrypt-jane-chacha.h | 4 +- algo/scryptjane/scrypt-jane-hash.h | 4 +- algo/skein/skein-hash-4way.c | 11 - algo/skein/skein-hash-4way.h | 24 +- algo/sm3/sm3-hash-4way.c | 231 ++++++++ algo/sm3/sm3-hash-4way.h | 89 +++ algo/sm3/sm3.c | 2 +- algo/whirlpool/sph_whirlpool.c | 7 +- algo/whirlpool/whirlpool-gate.h | 1 + algo/whirlpool/whirlpool-hash-4way.c | 6 +- algo/x11/c11.c | 3 +- algo/x11/timetravel-4way.c | 274 ++++++++++ algo/x11/timetravel-gate.c | 78 +++ algo/x11/timetravel-gate.h | 40 ++ algo/{ => x11}/timetravel.c | 99 +--- algo/x11/timetravel10-4way.c | 316 +++++++++++ algo/x11/timetravel10-gate.c | 78 +++ algo/x11/timetravel10-gate.h | 39 ++ algo/{ => x11}/timetravel10.c | 99 +--- algo/x11/x11.c | 12 +- algo/x11/x11evo-4way.c | 340 ++++++++++++ algo/x11/x11evo-gate.c | 95 ++++ algo/x11/x11evo-gate.h | 39 ++ algo/x11/x11evo.c | 112 +--- algo/x11/x11gost.c | 1 + algo/x13/phi1612.c | 1 + algo/x13/skunk.c | 1 + algo/x13/x13.c | 1 + algo/x13/x13sm3-4way.c | 33 +- algo/x13/x13sm3.c | 1 + algo/x14/axiom.c | 1 + algo/x14/polytimos-gate.h | 2 +- algo/x14/x14.c | 1 + algo/x15/x15-4way.c | 1 - algo/x15/x15.c | 1 + algo/x17/x17.c | 11 +- avxdefs.h | 24 +- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 41 +- miner.h | 2 +- 77 files changed, 3408 insertions(+), 1214 deletions(-) create mode 100644 algo/blake/blakecoin-4way.c create mode 100644 algo/blake/blakecoin-gate.c create mode 100644 algo/blake/blakecoin-gate.h create mode 100644 algo/lyra2/lyra2h-4way.c create mode 100644 algo/lyra2/lyra2h-gate.c create mode 100644 algo/lyra2/lyra2h-gate.h create mode 100644 algo/lyra2/lyra2rev2-4way.c create mode 100644 algo/lyra2/lyra2rev2-gate.c create mode 100644 algo/lyra2/lyra2rev2-gate.h create mode 100644 algo/quark/quark-4way.c create mode 100644 algo/quark/quark-gate.c create mode 100644 algo/quark/quark-gate.h create mode 100644 algo/sm3/sm3-hash-4way.c create mode 100644 algo/sm3/sm3-hash-4way.h create mode 100644 algo/x11/timetravel-4way.c create mode 100644 algo/x11/timetravel-gate.c create mode 100644 algo/x11/timetravel-gate.h rename algo/{ => x11}/timetravel.c (82%) create mode 100644 algo/x11/timetravel10-4way.c create mode 100644 algo/x11/timetravel10-gate.c create mode 100644 
algo/x11/timetravel10-gate.h rename algo/{ => x11}/timetravel10.c (84%) create mode 100644 algo/x11/x11evo-4way.c create mode 100644 algo/x11/x11evo-gate.c create mode 100644 algo/x11/x11evo-gate.h diff --git a/Makefile.am b/Makefile.am index e8cf3f1..05062ca 100644 --- a/Makefile.am +++ b/Makefile.am @@ -46,8 +46,10 @@ cpuminer_SOURCES = \ algo/blake/sph_blake2b.c \ algo/blake/blake2b.c \ algo/blake/blake2s.c \ + algo/blake/blakecoin-gate.c \ algo/blake/mod_blakecoin.c \ algo/blake/blakecoin.c \ + algo/blake/blakecoin-4way.c \ algo/blake/decred-gate.c \ algo/blake/decred.c \ algo/blake/decred-4way.c \ @@ -99,13 +101,17 @@ cpuminer_SOURCES = \ algo/luffa/sse2/luffa_for_sse2.c \ algo/lyra2/lyra2.c \ algo/lyra2/sponge.c \ + algo/lyra2/lyra2rev2-gate.c \ algo/lyra2/lyra2rev2.c \ + algo/lyra2/lyra2rev2-4way.c \ algo/lyra2/lyra2re.c \ algo/lyra2/lyra2z-gate.c \ algo/lyra2/lyra2z.c \ algo/lyra2/lyra2z-4way.c \ algo/lyra2/lyra2z330.c \ + algo/lyra2/lyra2h-gate.c \ algo/lyra2/lyra2h.c \ + algo/lyra2/lyra2h-4way.c \ algo/m7m.c \ algo/neoscrypt/neoscrypt.c \ algo/nist5/nist5-gate.c \ @@ -113,7 +119,9 @@ cpuminer_SOURCES = \ algo/nist5/nist5.c \ algo/nist5/zr5.c \ algo/pluck.c \ + algo/quark/quark-gate.c \ algo/quark/quark.c \ + algo/quark/quark-4way.c \ algo/qubit/qubit.c \ algo/qubit/deep.c \ algo/ripemd/sph_ripemd.c \ @@ -140,9 +148,8 @@ cpuminer_SOURCES = \ algo/skein/skein2-4way.c \ algo/skein/skein2-gate.c \ algo/sm3/sm3.c \ + algo/sm3/sm3-hash-4way.c \ algo/tiger/sph_tiger.c \ - algo/timetravel.c \ - algo/timetravel10.c \ algo/whirlpool/sph_whirlpool.c \ algo/whirlpool/whirlpool-hash-4way.c \ algo/whirlpool/whirlpool-gate.c \ @@ -161,8 +168,16 @@ cpuminer_SOURCES = \ algo/x11/tribus-gate.c \ algo/x11/tribus.c \ algo/x11/tribus-4way.c \ + algo/x11/timetravel-gate.c \ + algo/x11/timetravel.c \ + algo/x11/timetravel-4way.c \ + algo/x11/timetravel10-gate.c \ + algo/x11/timetravel10.c \ + algo/x11/timetravel10-4way.c \ algo/x11/fresh.c \ algo/x11/x11evo.c \ + algo/x11/x11evo-4way.c \ + algo/x11/x11evo-gate.c \ algo/x13/x13-gate.c \ algo/x13/x13.c \ algo/x13/x13-4way.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 5caedd1..bc2010d 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -165,6 +165,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble. Change Log ---------- +v3.7.10 + +4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10 + x11evo, blakecoin. +Faster x13sm3 (hsr). +Added share difficulty to accepted message. + v3.7.9 Partial 4way optimizations for veltor, skunk, polytimos, lyra2z. 
diff --git a/algo/blake/blake-4way.c b/algo/blake/blake-4way.c index 70b51a3..34f0e92 100644 --- a/algo/blake/blake-4way.c +++ b/algo/blake/blake-4way.c @@ -1,31 +1,22 @@ #include "blake-gate.h" -#include "sph_blake.h" + +#if defined (__AVX__) + #include "blake-hash-4way.h" #include #include #include -#if defined (BLAKE_4WAY) +blake256r14_4way_context blake_ctx; void blakehash_4way(void *state, const void *input) { - uint32_t vhash[4*4] __attribute__ ((aligned (64))); - uint32_t hash0[4] __attribute__ ((aligned (32))); - uint32_t hash1[4] __attribute__ ((aligned (32))); - uint32_t hash2[4] __attribute__ ((aligned (32))); - uint32_t hash3[4] __attribute__ ((aligned (32))); - blake256_4way_context ctx; - - blake256_4way_init( &ctx ); - blake256_4way( &ctx, input, 16 ); - blake256_4way_close( &ctx, vhash ); - - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash1, 32 ); - memcpy( state+96, hash1, 32 ); + uint32_t vhash[8*4] __attribute__ ((aligned (64))); + blake256r14_4way_context ctx; + memcpy( &ctx, &blake_ctx, sizeof ctx ); + blake256r14_4way( &ctx, input + (64<<2), 16 ); + blake256r14_4way_close( &ctx, vhash ); + mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 ); } int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, @@ -36,21 +27,24 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; -// uint32_t HTarget = ptarget[7]; + uint32_t HTarget = ptarget[7]; uint32_t _ALIGN(32) edata[20]; uint32_t n = first_nonce; uint32_t *nonces = work->nonces; bool *found = work->nfound; int num_found = 0; -// if (opt_benchmark) -// HTarget = 0x7f; + if (opt_benchmark) + HTarget = 0x7f; // we need big endian data... 
swab32_array( edata, pdata, 20 ); mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 ); + blake256r14_4way_init( &blake_ctx ); + blake256r14_4way( &blake_ctx, vdata, 64 ); + uint32_t *noncep = vdata + 76; // 19*4 do { found[0] = found[1] = found[2] = found[3] = false; @@ -61,45 +55,36 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, blakehash_4way( hash, vdata ); - if ( hash[7] == 0 ) + if ( hash[7] <= HTarget && fulltest( hash, ptarget ) ) { - if ( fulltest( hash, ptarget ) ) - { - found[0] = true; - num_found++; - nonces[0] = n; - pdata[19] = n; - } + found[0] = true; + num_found++; + nonces[0] = n; + pdata[19] = n; + work_set_target_ratio( work, hash ); } - if ( (hash+8)[7] == 0 ) + if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) ) { - if ( fulltest( hash+8, ptarget ) ) - { - found[1] = true; - num_found++; - nonces[1] = n+1; - } + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); } - if ( (hash+16)[7] == 0 ) + if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) ) { - if ( fulltest( hash+8, ptarget ) ) - { - found[2] = true; - num_found++; - nonces[2] = n+2; - } + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); } - if ( (hash+24)[7] == 0 ) + if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) ) { - if ( fulltest( hash+8, ptarget ) ) - { - found[3] = true; - num_found++; - nonces[3] = n+3; - } + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); } - n += 4; - *hashes_done = n - first_nonce + 1; + n += 4; } while ( (num_found == 0) && (n < max_nonce) && !work_restart[thr_id].restart ); diff --git a/algo/blake/blake-hash-4way.c b/algo/blake/blake-hash-4way.c index ee0ecd5..fab394a 100644 --- a/algo/blake/blake-hash-4way.c +++ b/algo/blake/blake-hash-4way.c @@ -491,14 +491,9 @@ do { \ (state)->T1 = T1; \ } while (0) -//#define BLAKE32_ROUNDS 8 -#ifndef BLAKE32_ROUNDS -#define BLAKE32_ROUNDS 14 -#endif - #if SPH_COMPACT_BLAKE_32 -#define COMPRESS32_4WAY do { \ +#define COMPRESS32_4WAY( rounds ) do { \ __m128i M[16]; \ __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ __m128i V8, V9, VA, VB, VC, VD, VE, VF; \ @@ -539,7 +534,7 @@ do { \ M[0xD] = mm_byteswap_32( *(buf + 13) ); \ M[0xE] = mm_byteswap_32( *(buf + 14) ); \ M[0xF] = mm_byteswap_32( *(buf + 15) ); \ - for (r = 0; r < BLAKE32_ROUNDS; r ++) \ + for (r = 0; r < rounds; r ++) \ ROUND_S_4WAY(r); \ H0 = _mm_xor_si128( _mm_xor_si128( \ _mm_xor_si128( S0, V0 ), V8 ), H0 ); \ @@ -563,80 +558,70 @@ do { \ // current impl -#define COMPRESS32_4WAY do { \ - __m128i M0, M1, M2, M3, M4, M5, M6, M7; \ - __m128i M8, M9, MA, MB, MC, MD, ME, MF; \ - __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ - __m128i V8, V9, VA, VB, VC, VD, VE, VF; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \ - V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \ - VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \ - VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \ - VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \ - _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \ - VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \ - _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \ - VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \ - _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \ - VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \ - _mm_set_epi32( 
CS7, CS7, CS7, CS7 ) ); \ - M0 = mm_byteswap_32( * buf ); \ - M1 = mm_byteswap_32( *(buf+1) ); \ - M2 = mm_byteswap_32( *(buf+2) ); \ - M3 = mm_byteswap_32( *(buf+3) ); \ - M4 = mm_byteswap_32( *(buf+4) ); \ - M5 = mm_byteswap_32( *(buf+5) ); \ - M6 = mm_byteswap_32( *(buf+6) ); \ - M7 = mm_byteswap_32( *(buf+7) ); \ - M8 = mm_byteswap_32( *(buf+8) ); \ - M9 = mm_byteswap_32( *(buf+9) ); \ - MA = mm_byteswap_32( *(buf+10) ); \ - MB = mm_byteswap_32( *(buf+11) ); \ - MC = mm_byteswap_32( *(buf+12) ); \ - MD = mm_byteswap_32( *(buf+13) ); \ - ME = mm_byteswap_32( *(buf+14) ); \ - MF = mm_byteswap_32( *(buf+15) ); \ - ROUND_S_4WAY(0); \ - ROUND_S_4WAY(1); \ - ROUND_S_4WAY(2); \ - ROUND_S_4WAY(3); \ - ROUND_S_4WAY(4); \ - ROUND_S_4WAY(5); \ - ROUND_S_4WAY(6); \ - ROUND_S_4WAY(7); \ - if (BLAKE32_ROUNDS == 14) { \ - ROUND_S_4WAY(8); \ - ROUND_S_4WAY(9); \ - ROUND_S_4WAY(0); \ - ROUND_S_4WAY(1); \ - ROUND_S_4WAY(2); \ - ROUND_S_4WAY(3); \ - } \ - H0 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( V8, V0 ), S0 ), H0 ); \ - H1 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( V9, V1 ), S1 ), H1 ); \ - H2 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( VA, V2 ), S2 ), H2 ); \ - H3 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( VB, V3 ), S3 ), H3 ); \ - H4 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( VC, V4 ), S0 ), H4 ); \ - H5 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( VD, V5 ), S1 ), H5 ); \ - H6 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( VE, V6 ), S2 ), H6 ); \ - H7 = _mm_xor_si128( _mm_xor_si128( \ - _mm_xor_si128( VF, V7 ), S3 ), H7 ); \ - } while (0) +#define COMPRESS32_4WAY( rounds ) \ +do { \ + __m128i M0, M1, M2, M3, M4, M5, M6, M7; \ + __m128i M8, M9, MA, MB, MC, MD, ME, MF; \ + __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ + __m128i V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \ + V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \ + VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \ + VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \ + VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \ + VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \ + VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \ + VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \ + M0 = mm_byteswap_32( * buf ); \ + M1 = mm_byteswap_32( *(buf+1) ); \ + M2 = mm_byteswap_32( *(buf+2) ); \ + M3 = mm_byteswap_32( *(buf+3) ); \ + M4 = mm_byteswap_32( *(buf+4) ); \ + M5 = mm_byteswap_32( *(buf+5) ); \ + M6 = mm_byteswap_32( *(buf+6) ); \ + M7 = mm_byteswap_32( *(buf+7) ); \ + M8 = mm_byteswap_32( *(buf+8) ); \ + M9 = mm_byteswap_32( *(buf+9) ); \ + MA = mm_byteswap_32( *(buf+10) ); \ + MB = mm_byteswap_32( *(buf+11) ); \ + MC = mm_byteswap_32( *(buf+12) ); \ + MD = mm_byteswap_32( *(buf+13) ); \ + ME = mm_byteswap_32( *(buf+14) ); \ + MF = mm_byteswap_32( *(buf+15) ); \ + ROUND_S_4WAY(0); \ + ROUND_S_4WAY(1); \ + ROUND_S_4WAY(2); \ + ROUND_S_4WAY(3); \ + ROUND_S_4WAY(4); \ + ROUND_S_4WAY(5); \ + ROUND_S_4WAY(6); \ + ROUND_S_4WAY(7); \ + if (rounds == 14) \ + { \ + ROUND_S_4WAY(8); \ + ROUND_S_4WAY(9); \ + ROUND_S_4WAY(0); \ + ROUND_S_4WAY(1); \ + ROUND_S_4WAY(2); \ + ROUND_S_4WAY(3); \ + } \ + H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \ + H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 
); \ + H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \ + H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \ + H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \ + H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \ + H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \ + H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \ +} while (0) #endif @@ -832,15 +817,16 @@ static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 }; static void blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv, - const sph_u32 *salt) + const sph_u32 *salt, int rounds ) { - int i; - for ( i = 0; i < 8; i++ ) - sc->H[i] = _mm_set1_epi32( iv[i] ); - for ( i = 0; i < 4; i++ ) - sc->S[i] = _mm_set1_epi32( salt[i] ); - sc->T0 = sc->T1 = 0; - sc->ptr = 0; + int i; + for ( i = 0; i < 8; i++ ) + sc->H[i] = _mm_set1_epi32( iv[i] ); + for ( i = 0; i < 4; i++ ) + sc->S[i] = _mm_set1_epi32( salt[i] ); + sc->T0 = sc->T1 = 0; + sc->ptr = 0; + sc->rounds = rounds; } static void @@ -878,7 +864,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len ) { if ( ( T0 = SPH_T32(T0 + 512) ) < 512 ) T1 = SPH_T32(T1 + 1); - COMPRESS32_4WAY; + COMPRESS32_4WAY( sc->rounds ); ptr = 0; } } @@ -1079,10 +1065,11 @@ blake64_4way_close( blake_4way_big_context *sc, #endif +// default 14 rounds, backward compatibility void blake256_4way_init(void *cc) { - blake32_4way_init(cc, IV256, salt_zero_small); + blake32_4way_init( cc, IV256, salt_zero_small, 14 ); } void @@ -1094,13 +1081,43 @@ blake256_4way(void *cc, const void *data, size_t len) void blake256_4way_close(void *cc, void *dst) { - blake256_4way_addbits_and_close(cc, 0, 0, dst); + blake32_4way_close(cc, 0, 0, dst, 8); +} + +// 14 rounds blake, decred +void blake256r14_4way_init(void *cc) +{ + blake32_4way_init( cc, IV256, salt_zero_small, 14 ); } void -blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +blake256r14_4way(void *cc, const void *data, size_t len) { - blake32_4way_close(cc, ub, n, dst, 8); + blake32_4way(cc, data, len); +} + +void +blake256r14_4way_close(void *cc, void *dst) +{ + blake32_4way_close(cc, 0, 0, dst, 8); +} + +// 8 rounds blakecoin, vanilla +void blake256r8_4way_init(void *cc) +{ + blake32_4way_init( cc, IV256, salt_zero_small, 8 ); +} + +void +blake256r8_4way(void *cc, const void *data, size_t len) +{ + blake32_4way(cc, data, len); +} + +void +blake256r8_4way_close(void *cc, void *dst) +{ + blake32_4way_close(cc, 0, 0, dst, 8); } #if defined (__AVX2__) diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index f670446..003b39a 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -35,7 +35,9 @@ */ #ifndef __BLAKE_HASH_4WAY__ -#define __BLAKE_HASH_4WAY___ +#define __BLAKE_HASH_4WAY__ + +#ifdef __AVX__ #ifdef __cplusplus extern "C"{ @@ -45,38 +47,36 @@ extern "C"{ #include "algo/sha/sph_types.h" #include "avxdefs.h" -/** - * Output size (in bits) for BLAKE-256. - */ #define SPH_SIZE_blake256 256 -#if SPH_64 - -/** - * Output size (in bits) for BLAKE-512.
- */ #define SPH_SIZE_blake512 512 -#endif - -#ifdef __AVX__ typedef struct { - __m128i buf[16] __attribute__ ((aligned (64))); - __m128i H[8]; - __m128i S[4]; - size_t ptr; - sph_u32 T0, T1; + __m128i buf[16] __attribute__ ((aligned (64))); + __m128i H[8]; + __m128i S[4]; + size_t ptr; + sph_u32 T0, T1; + int rounds; // 14 for blake, 8 for blakecoin & vanilla } blake_4way_small_context; +// Default 14 rounds typedef blake_4way_small_context blake256_4way_context; - void blake256_4way_init(void *cc); void blake256_4way(void *cc, const void *data, size_t len); void blake256_4way_close(void *cc, void *dst); -void blake256_4way_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); -#endif +// 14 rounds, blake, decred +typedef blake_4way_small_context blake256r14_4way_context; +void blake256r14_4way_init(void *cc); +void blake256r14_4way(void *cc, const void *data, size_t len); +void blake256r14_4way_close(void *cc, void *dst); + +// 8 rounds, blakecoin, vanilla +typedef blake_4way_small_context blake256r8_4way_context; +void blake256r8_4way_init(void *cc); +void blake256r8_4way(void *cc, const void *data, size_t len); +void blake256r8_4way_close(void *cc, void *dst); #ifdef __AVX2__ @@ -103,3 +103,5 @@ void blake512_4way_addbits_and_close( #endif #endif + +#endif diff --git a/algo/blake/blakecoin-4way.c b/algo/blake/blakecoin-4way.c new file mode 100644 index 0000000..0abd85f --- /dev/null +++ b/algo/blake/blakecoin-4way.c @@ -0,0 +1,106 @@ +#include "blakecoin-gate.h" + +#if defined (__AVX__) + +#include "blake-hash-4way.h" +#include +#include +#include + +blake256r8_4way_context blakecoin_ctx; + +void blakecoin_4way_hash(void *state, const void *input) +{ + uint32_t vhash[8*4] __attribute__ ((aligned (64))); + blake256r8_4way_context ctx; + memcpy( &ctx, &blakecoin_ctx, sizeof ctx ); + blake256r8_4way( &ctx, input + (64<<2), 16 ); + blake256r8_4way_close( &ctx, vhash ); + mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 ); +} + +int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t hash[8*4] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t HTarget = ptarget[7]; + uint32_t _ALIGN(32) edata[20]; + uint32_t n = first_nonce; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + + if (opt_benchmark) + HTarget = 0x7f; + + // we need big endian data... 
+ swab32_array( edata, pdata, 20 ); + + mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 ); + + blake256r8_4way_init( &blakecoin_ctx ); + blake256r8_4way( &blakecoin_ctx, vdata, 64 ); + + uint32_t *noncep = vdata + 76; // 19*4 + do { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep, n ); + be32enc( noncep +1, n+1 ); + be32enc( noncep +2, n+2 ); + be32enc( noncep +3, n+3 ); + + blakecoin_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( hash[7] <= HTarget && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + + } while ( (num_found == 0) && (n < max_nonce) + && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce + 1; + + // workaround to prevent a flood of hash reports when the nonce range is + // exhausted and the thread is spinning waiting for new work + if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) ) + { + *hashes_done = 0; + sleep(1); + } + + return num_found; +} + +#endif + diff --git a/algo/blake/blakecoin-gate.c b/algo/blake/blakecoin-gate.c new file mode 100644 index 0000000..9b83c58 --- /dev/null +++ b/algo/blake/blakecoin-gate.c @@ -0,0 +1,71 @@ +#include "blakecoin-gate.h" +#include + +// changed to get_max64_0x3fffffLL in cpuminer-multi-decred +int64_t blakecoin_get_max64 () +{ + return 0x7ffffLL; +// return 0x3fffffLL; +} + +// Blakecoin 4 way hashes so fast it runs out of nonces. +// This is an attempt to solve this but the result may be +// to rehash old nonces until new work is received.
+void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id, + uint32_t *end_nonce_ptr, bool clean_job ) +{ + uint32_t *nonceptr = algo_gate.get_nonceptr( work->data ); +// +// if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) ) +// algo_gate.stratum_gen_work( &stratum, g_work ); + + if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) + || ( *nonceptr >= *end_nonce_ptr ) + || ( work->job_id != g_work->job_id ) && clean_job ) +/* + if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) + && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) + || ( work->job_id != g_work->job_id ) ) ) +*/ + { + work_free( work ); + work_copy( work, g_work ); + *nonceptr = 0xffffffffU / opt_n_threads * thr_id; + if ( opt_randomize ) + *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads; + *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; +// try incrementing the xnonce to change the data +// for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ ); + } + else + ++(*nonceptr); +} + + +// vanilla uses default gen merkle root, otherwise identical to blakecoin +bool register_vanilla_algo( algo_gate_t* gate ) +{ +#if defined(BLAKECOIN_4WAY) +// four_way_not_tested(); + gate->optimizations = FOUR_WAY_OPT; + gate->scanhash = (void*)&scanhash_blakecoin_4way; + gate->hash = (void*)&blakecoin_4way_hash; +// gate->get_new_work = (void*)&bc4w_get_new_work; +// blakecoin_4way_init( &blake_4way_init_ctx ); +#else + gate->scanhash = (void*)&scanhash_blakecoin; + gate->hash = (void*)&blakecoinhash; +// blakecoin_init( &blake_init_ctx ); +#endif + gate->optimizations = AVX2_OPT | FOUR_WAY_OPT; + gate->get_max64 = (void*)&blakecoin_get_max64; + return true; +} + +bool register_blakecoin_algo( algo_gate_t* gate ) +{ + register_vanilla_algo( gate ); + gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; + return true; +} + diff --git a/algo/blake/blakecoin-gate.h b/algo/blake/blakecoin-gate.h new file mode 100644 index 0000000..f7c7b4f --- /dev/null +++ b/algo/blake/blakecoin-gate.h @@ -0,0 +1,21 @@ +#ifndef __BLAKECOIN_GATE_H__ +#define __BLAKECOIN_GATE_H__ + +#include "algo-gate-api.h" +#include + +#if defined(FOUR_WAY) && defined(__AVX__) + #define BLAKECOIN_4WAY +#endif + +#if defined (BLAKECOIN_4WAY) +void blakecoin_4way_hash(void *state, const void *input); +int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); +#endif + +void blakecoinhash( void *state, const void *input ); +int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +#endif diff --git a/algo/blake/blakecoin.c b/algo/blake/blakecoin.c index a06d5ee..4eda458 100644 --- a/algo/blake/blakecoin.c +++ b/algo/blake/blakecoin.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "blakecoin-gate.h" #define BLAKE32_ROUNDS 8 #include "sph_blake.h" @@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx ) SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root ); } */ - +/* // changed to get_max64_0x3fffffLL in cpuminer-multi-decred int64_t blakecoin_get_max64 () { @@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate ) gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; return true; } - +*/ diff --git a/algo/blake/decred-4way.c b/algo/blake/decred-4way.c index de65b72..11111d8 100644 --- a/algo/blake/decred-4way.c +++ b/algo/blake/decred-4way.c @@ -1,5 +1,4 @@ #include "decred-gate.h" -#include "sph_blake.h"
#include "blake-hash-4way.h" #include #include @@ -9,7 +8,6 @@ #if defined (DECRED_4WAY) static __thread blake256_4way_context blake_mid; -static __thread bool ctx_midstate_done = false; void decred_hash_4way( void *state, const void *input ) { @@ -18,50 +16,14 @@ void decred_hash_4way( void *state, const void *input ) uint32_t hash1[8] __attribute__ ((aligned (32))); uint32_t hash2[8] __attribute__ ((aligned (32))); uint32_t hash3[8] __attribute__ ((aligned (32))); - blake256_4way_context ctx __attribute__ ((aligned (64))); - - sph_blake256_context ctx2 __attribute__ ((aligned (64))); - uint32_t hash[16] __attribute__ ((aligned (64))); - uint32_t sin0[45], sin1[45], sin2[45], sin3[45]; - - mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 ); - void *tail = input + ( DECRED_MIDSTATE_LEN << 2 ); int tail_len = 180 - DECRED_MIDSTATE_LEN; + blake256_4way_context ctx __attribute__ ((aligned (64))); memcpy( &ctx, &blake_mid, sizeof(blake_mid) ); blake256_4way( &ctx, tail, tail_len ); blake256_4way_close( &ctx, vhash ); -/* - sph_blake256_init( &ctx2 ); - sph_blake256( &ctx2, sin0, 180 ); - sph_blake256_close( &ctx2, hash ); -*/ -/* - blake256_4way_init( &ctx ); - blake256_4way( &ctx, input, 180 ); - blake256_4way_close( &ctx, vhash ); -*/ - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); -/* - for ( int i = 0; i < 8; i++ ) - if ( hash[i] != hash0[i] ) - printf(" hash mismatch, i = %u\n",i); - -printf("hash: %08lx %08lx %08lx %08lx\n", *hash, *(hash+1), - *(hash+2), *(hash+3) ); -printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1), - *(hash0+2), *(hash0+3) ); -printf("\n"); -*/ - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); - -// memcpy( state, hash, 32 ); - + mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 ); } int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce, @@ -69,21 +31,21 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce, { uint32_t vdata[48*4] __attribute__ ((aligned (64))); uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t _ALIGN(64) edata[48]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX]; - uint32_t n = first_nonce; - const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7]; + uint32_t _ALIGN(64) edata[48]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX]; + uint32_t n = first_nonce; + const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7]; uint32_t *nonces = work->nonces; bool *found = work->nfound; int num_found = 0; - ctx_midstate_done = false; - memcpy( edata, pdata, 180 ); + // copy to buffer guaranteed to be aligned. + memcpy( edata, pdata, 180 ); // use the old way until new way updated for size. 
- mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 ); + mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 ); blake256_4way_init( &blake_mid ); blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN ); @@ -106,22 +68,13 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce, nonces[0] = n; pdata[DECRED_NONCE_INDEX] = n; } -/* if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) ) { -printf("found 1\n"); - -printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] ); -printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] ); -printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] ); -printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] ); - work_set_target_ratio( work, hash+8 ); found[1] = true; num_found++; nonces[1] = n+1; } -*/ if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) ) { work_set_target_ratio( work, hash+16 ); @@ -129,24 +82,15 @@ printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[ num_found++; nonces[2] = n+2; } -/* + if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) ) { -printf("found 3\n"); - -printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] ); -printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] ); -printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] ); -printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] ); - work_set_target_ratio( work, hash+24 ); found[3] = true; num_found++; nonces[3] = n+3; } -*/ - n += 2; -// n += 4; + n += 4; } while ( (num_found == 0) && (n < max_nonce) && !work_restart[thr_id].restart ); diff --git a/algo/blake/pentablake-4way.c b/algo/blake/pentablake-4way.c index c9a64f4..05a52bd 100644 --- a/algo/blake/pentablake-4way.c +++ b/algo/blake/pentablake-4way.c @@ -1,4 +1,7 @@ #include "pentablake-gate.h" + +#ifdef __AVX2__ + #include #include #include @@ -9,8 +12,6 @@ //#define DEBUG_ALGO -#ifdef PENTABLAKE_4WAY - extern void pentablakehash_4way( void *output, const void *input ) { unsigned char _ALIGN(32) hash[128]; diff --git a/algo/blake/pentablake-gate.h b/algo/blake/pentablake-gate.h index daac37c..08ac744 100644 --- a/algo/blake/pentablake-gate.h +++ b/algo/blake/pentablake-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(FOUR_WAY) && defined(__AVX__) +#if defined(FOUR_WAY) && defined(__AVX2__) #define PENTABLAKE_4WAY #endif diff --git a/algo/bmw/bmw-hash-4way.c b/algo/bmw/bmw-hash-4way.c index 651f090..2e35e70 100644 --- a/algo/bmw/bmw-hash-4way.c +++ b/algo/bmw/bmw-hash-4way.c @@ -41,19 +41,13 @@ extern "C"{ #endif -//#include "sph_bmw.h" - -//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW -#define SPH_SMALL_FOOTPRINT_BMW 1 -//#endif - #ifdef _MSC_VER #pragma warning (disable: 4146) #endif -//#undef SPH_ROTL64 -//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n)))) -//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n) +#define LPAR ( + +// BMW256 static const sph_u32 IV256[] = { SPH_C32(0x40414243), SPH_C32(0x44454647), @@ -66,8 +60,7 @@ static const sph_u32 IV256[] = { SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F) }; -#if SPH_64 - +// BMW512 static const sph_u64 IV512[] = { SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), @@ -79,74 +72,113 @@ static const sph_u64 IV512[] = { SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) }; 
-#endif +// BMW256 -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y +#define ss0(x) \ + _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \ + _mm_slli_epi32( (x), 3) ), \ + _mm_xor_si128( mm_rotl_32( (x), 4), \ + mm_rotl_32( (x), 19) ) ) -#define LPAR ( +#define ss1(x) \ + _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \ + _mm_slli_epi32( (x), 2) ), \ + _mm_xor_si128( mm_rotl_32( (x), 8), \ + mm_rotl_32( (x), 23) ) ) -/* -#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \ - ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19)) -#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \ - ^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23)) -#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \ - ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25)) -#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \ - ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29)) -#define ss4(x) (((x) >> 1) ^ (x)) -#define ss5(x) (((x) >> 2) ^ (x)) -#define rs1(x) SPH_ROTL32(x, 3) -#define rs2(x) SPH_ROTL32(x, 7) -#define rs3(x) SPH_ROTL32(x, 13) -#define rs4(x) SPH_ROTL32(x, 16) -#define rs5(x) SPH_ROTL32(x, 19) -#define rs6(x) SPH_ROTL32(x, 23) -#define rs7(x) SPH_ROTL32(x, 27) +#define ss2(x) \ + _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \ + _mm_slli_epi32( (x), 1) ), \ + _mm_xor_si128( mm_rotl_32( (x), 12), \ + mm_rotl_32( (x), 25) ) ) -#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555)) +#define ss3(x) \ + _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \ + _mm_slli_epi32( (x), 2) ), \ + _mm_xor_si128( mm_rotl_32( (x), 15), \ + mm_rotl_32( (x), 29) ) ) -#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ - (SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \ - - SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m)) +#define ss4(x) \ + _mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) ) -#define expand1s_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \ - + ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \ - + ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \ - + ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \ - + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) +#define ss5(x) \ + _mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) ) -#define expand1s(qf, mf, hf, i16) \ - expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand1s_(qf, mf, hf, i16, ix, iy) \ - expand1s_inner LPAR qf, mf, hf, i16, ix, iy) +#define rs1(x) mm_rotl_32( x, 3 ) +#define rs2(x) mm_rotl_32( x, 7 ) +#define rs3(x) mm_rotl_32( x, 13 ) +#define rs4(x) mm_rotl_32( x, 16 ) +#define rs5(x) mm_rotl_32( x, 19 ) +#define rs6(x) mm_rotl_32( x, 23 ) +#define rs7(x) mm_rotl_32( x, 27 ) -#define expand2s_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \ - + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \ - + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \ - + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \ - + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) +#define rol_off_32( M, j, off ) \ + mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \ + ( ( (j) + (off) ) & 0xF ) + 1 ) -#define expand2s(qf, mf, hf, i16) \ - expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand2s_(qf, mf, hf, i16, ix, iy) \ - expand2s_inner LPAR qf, mf, hf, i16, ix, iy) -*/ -#if SPH_64 +#define add_elt_s( M, 
H, j ) \ + _mm_xor_si128( \ + _mm_add_epi32( \ + _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \ + rol_off_32( M, j, 3 ) ), \ + rol_off_32( M, j, 10 ) ), \ + _mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \ + H[ ( (j)+7 ) & 0xF ] ) + + +#define expand1s( qt, M, H, i ) \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( ss1( qt[ (i)-16 ] ), \ + ss2( qt[ (i)-15 ] ) ), \ + _mm_add_epi32( ss3( qt[ (i)-14 ] ), \ + ss0( qt[ (i)-13 ] ) ) ), \ + _mm_add_epi32( \ + _mm_add_epi32( ss1( qt[ (i)-12 ] ), \ + ss2( qt[ (i)-11 ] ) ), \ + _mm_add_epi32( ss3( qt[ (i)-10 ] ), \ + ss0( qt[ (i)- 9 ] ) ) ) ), \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( ss1( qt[ (i)- 8 ] ), \ + ss2( qt[ (i)- 7 ] ) ), \ + _mm_add_epi32( ss3( qt[ (i)- 6 ] ), \ + ss0( qt[ (i)- 5 ] ) ) ), \ + _mm_add_epi32( \ + _mm_add_epi32( ss1( qt[ (i)- 4 ] ), \ + ss2( qt[ (i)- 3 ] ) ), \ + _mm_add_epi32( ss3( qt[ (i)- 2 ] ), \ + ss0( qt[ (i)- 1 ] ) ) ) ) ), \ + add_elt_s( M, H, (i)-16 ) ) + +// all BMW256 lanes are 32 bits wide, so every add is a 32 bit lane add +#define expand2s( qt, M, H, i) \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \ + _mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \ + _mm_add_epi32( \ + _mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \ + _mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \ + _mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \ + _mm_add_epi32( \ + _mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \ + _mm_add_epi32( ss4( qt[ (i)- 2 ] ), \ + ss5( qt[ (i)- 1 ] ) ) ) ) ), \ + add_elt_s( M, H, (i)-16 ) ) + +// BMW512 #define sb0(x) \ _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \ _mm256_slli_epi64( (x), 3) ), \ - _mm256_xor_si256( mm256_rotl_64( (x), 4), \ + _mm256_xor_si256( mm256_rotl_64( (x), 4), \ mm256_rotl_64( (x), 37) ) ) #define sb1(x) \ @@ -181,18 +213,18 @@ static const sph_u64 IV512[] = { #define rb6(x) mm256_rotl_64( x, 43 ) #define rb7(x) mm256_rotl_64( x, 53 ) -#define rol_off( M, j, off ) \ - mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \ - ( ( (j) + (off) ) & 15 ) + 1 ) +#define rol_off_64( M, j, off ) \ + mm256_rotl_64( M[ ( (j) + (off) ) & 0xF ] , \ + ( ( (j) + (off) ) & 0xF ) + 1 ) #define add_elt_b( M, H, j ) \ _mm256_xor_si256( \ _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \ - rol_off( M, j, 3 ) ), \ - rol_off( M, j, 10 ) ), \ + _mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \ + rol_off_64( M, j, 3 ) ), \ + rol_off_64( M, j, 10 ) ), \ _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 15 ] ) + H[ ( (j)+7 ) & 0xF ] ) #define expand1b( qt, M, H, i ) \ _mm256_add_epi64( \ @@ -241,132 +273,301 @@ static const sph_u64 IV512[] = { sb5( qt[ (i)- 1 ] ) ) ) ) ), \ add_elt_b( M, H, (i)-16 ) ) -#endif +// BMW256 -/* -#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \ - ((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \ - op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4))) -*/ -/* -#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) -#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) -#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, 
+, 15) -#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13) -#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14) -#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15) -#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13) -#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14) -#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15) -#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14) -#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15) -#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9) -#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10) -#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11) -#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12) -#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13) +#define Ws1 \ + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \ + _mm_xor_si128( M[ 8], H[ 8] ) ), \ + _mm_xor_si128( M[11], H[11] ) ), \ + _mm_xor_si128( M[14], H[14] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) -#if SPH_SMALL_FOOTPRINT_BMW +#define Ws2 \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_xor_si128( M[12], H[12] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) -#define MAKE_Qas do { \ - unsigned u; \ - sph_u32 Ws[16]; \ - Ws[ 0] = Ws0; \ - Ws[ 1] = Ws1; \ - Ws[ 2] = Ws2; \ - Ws[ 3] = Ws3; \ - Ws[ 4] = Ws4; \ - Ws[ 5] = Ws5; \ - Ws[ 6] = Ws6; \ - Ws[ 7] = Ws7; \ - Ws[ 8] = Ws8; \ - Ws[ 9] = Ws9; \ - Ws[10] = Ws10; \ - Ws[11] = Ws11; \ - Ws[12] = Ws12; \ - Ws[13] = Ws13; \ - Ws[14] = Ws14; \ - Ws[15] = Ws15; \ - for (u = 0; u < 15; u += 5) { \ - qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \ - qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \ - qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \ - qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \ - qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \ - } \ - qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \ - } while (0) +#define Ws3 \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ + _mm_xor_si128( M[ 1], H[ 1] ) ), \ + _mm_xor_si128( M[ 8], H[ 8] ) ), \ + _mm_xor_si128( M[10], H[10] ) ), \ + _mm_xor_si128( M[13], H[13] ) ) -#define MAKE_Qbs do { \ - qt[16] = expand1s(Qs, M, H, 16); \ - qt[17] = expand1s(Qs, M, H, 17); \ - qt[18] = expand2s(Qs, M, H, 18); \ - qt[19] = expand2s(Qs, M, H, 19); \ - qt[20] = expand2s(Qs, M, H, 20); \ - qt[21] = expand2s(Qs, M, H, 21); \ - qt[22] = expand2s(Qs, M, H, 22); \ - qt[23] = expand2s(Qs, M, H, 23); \ - qt[24] = expand2s(Qs, M, H, 24); \ - qt[25] = expand2s(Qs, M, H, 25); \ - qt[26] = expand2s(Qs, M, H, 26); \ - qt[27] = expand2s(Qs, M, H, 27); \ - qt[28] = expand2s(Qs, M, H, 28); \ - qt[29] = expand2s(Qs, M, H, 29); \ - qt[30] = expand2s(Qs, M, H, 30); \ - qt[31] = expand2s(Qs, M, H, 31); \ - } while (0) +#define Ws4 \ + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ + _mm_xor_si128( M[ 2], H[ 2] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_xor_si128( M[11], H[11] ) ), \ + _mm_xor_si128( M[14], H[14] ) ) -#else +#define Ws5 \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ + _mm_xor_si128( M[ 2], H[ 2] ) ), \ + _mm_xor_si128( M[10], H[10] ) ), \ + _mm_xor_si128( M[12], H[12] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) -#define MAKE_Qas do { \ - qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \ - qt[ 1] = 
SPH_T32(ss1(Ws1 ) + H( 2)); \ - qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \ - qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \ - qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \ - qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \ - qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \ - qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \ - qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \ - qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \ - qt[10] = SPH_T32(ss0(Ws10) + H(11)); \ - qt[11] = SPH_T32(ss1(Ws11) + H(12)); \ - qt[12] = SPH_T32(ss2(Ws12) + H(13)); \ - qt[13] = SPH_T32(ss3(Ws13) + H(14)); \ - qt[14] = SPH_T32(ss4(Ws14) + H(15)); \ - qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \ - } while (0) +#define Ws6 \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \ + _mm_xor_si128( M[ 0], H[ 0] ) ), \ + _mm_xor_si128( M[ 3], H[ 3] ) ), \ + _mm_xor_si128( M[11], H[11] ) ), \ + _mm_xor_si128( M[13], H[13] ) ) -#define MAKE_Qbs do { \ - qt[16] = expand1s(Qs, M, H, 16); \ - qt[17] = expand1s(Qs, M, H, 17); \ - qt[18] = expand2s(Qs, M, H, 18); \ - qt[19] = expand2s(Qs, M, H, 19); \ - qt[20] = expand2s(Qs, M, H, 20); \ - qt[21] = expand2s(Qs, M, H, 21); \ - qt[22] = expand2s(Qs, M, H, 22); \ - qt[23] = expand2s(Qs, M, H, 23); \ - qt[24] = expand2s(Qs, M, H, 24); \ - qt[25] = expand2s(Qs, M, H, 25); \ - qt[26] = expand2s(Qs, M, H, 26); \ - qt[27] = expand2s(Qs, M, H, 27); \ - qt[28] = expand2s(Qs, M, H, 28); \ - qt[29] = expand2s(Qs, M, H, 29); \ - qt[30] = expand2s(Qs, M, H, 30); \ - qt[31] = expand2s(Qs, M, H, 31); \ - } while (0) +#define Ws7 \ + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[12], H[12] ) ), \ + _mm_xor_si128( M[14], H[14] ) ) -#endif +#define Ws8 \ + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_xor_si128( M[13], H[13] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) -#define MAKE_Qs do { \ - MAKE_Qas; \ - MAKE_Qbs; \ - } while (0) +#define Ws9 \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ + _mm_xor_si128( M[ 3], H[ 3] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[14], H[14] ) ) -#define Qs(j) (qt[j]) -*/ -#if SPH_64 +#define Ws10 \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ + _mm_xor_si128( M[ 1], H[ 1] ) ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) + +#define Ws11 \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ + _mm_xor_si128( M[ 0], H[ 0] ) ), \ + _mm_xor_si128( M[ 2], H[ 2] ) ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ) + +#define Ws12 \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ + _mm_xor_si128( M[ 3], H[ 3] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_xor_si128( M[10], H[10] ) ) + +#define Ws13 \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[10], H[10] ) ), \ + _mm_xor_si128( M[11], H[11] ) ) + +#define Ws14 \ + _mm_sub_epi32( \ + 
_mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[ 8], H[ 8] ) ), \ + _mm_xor_si128( M[11], H[11] ) ), \ + _mm_xor_si128( M[12], H[12] ) ) + +#define Ws15 \ + _mm_add_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_xor_si128( M[13], H[13] ) ) + + +void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] ) +{ + __m128i qt[32], xl, xh; + + // explicit 32 bit lane adds; the C + operator on __m128i adds 64 bit lanes + qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] ); + qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] ); + qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] ); + qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] ); + qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] ); + qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] ); + qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] ); + qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] ); + qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] ); + qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] ); + qt[10] = _mm_add_epi32( ss0( Ws10), H[11] ); + qt[11] = _mm_add_epi32( ss1( Ws11), H[12] ); + qt[12] = _mm_add_epi32( ss2( Ws12), H[13] ); + qt[13] = _mm_add_epi32( ss3( Ws13), H[14] ); + qt[14] = _mm_add_epi32( ss4( Ws14), H[15] ); + qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] ); + qt[16] = expand1s( qt, M, H, 16 ); + qt[17] = expand1s( qt, M, H, 17 ); + qt[18] = expand2s( qt, M, H, 18 ); + qt[19] = expand2s( qt, M, H, 19 ); + qt[20] = expand2s( qt, M, H, 20 ); + qt[21] = expand2s( qt, M, H, 21 ); + qt[22] = expand2s( qt, M, H, 22 ); + qt[23] = expand2s( qt, M, H, 23 ); + qt[24] = expand2s( qt, M, H, 24 ); + qt[25] = expand2s( qt, M, H, 25 ); + qt[26] = expand2s( qt, M, H, 26 ); + qt[27] = expand2s( qt, M, H, 27 ); + qt[28] = expand2s( qt, M, H, 28 ); + qt[29] = expand2s( qt, M, H, 29 ); + qt[30] = expand2s( qt, M, H, 30 ); + qt[31] = expand2s( qt, M, H, 31 ); + + xl = _mm_xor_si128( + _mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ), + _mm_xor_si128( qt[18], qt[19] ) ), + _mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ), + _mm_xor_si128( qt[22], qt[23] ) ) ); + xh = _mm_xor_si128( xl, + _mm_xor_si128( + _mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ), + _mm_xor_si128( qt[26], qt[27] ) ), + _mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ), + _mm_xor_si128( qt[30], qt[31] ) ))); + + dH[ 0] = _mm_add_epi32( + _mm_xor_si128( M[0], + _mm_xor_si128( _mm_slli_epi32( xh, 5 ), + _mm_srli_epi32( qt[16], 5 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] )); + dH[ 1] = _mm_add_epi32( + _mm_xor_si128( M[1], + _mm_xor_si128( _mm_srli_epi32( xh, 7 ), + _mm_slli_epi32( qt[17], 8 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] )); + dH[ 2] = _mm_add_epi32( + _mm_xor_si128( M[2], + _mm_xor_si128( _mm_srli_epi32( xh, 5 ), + _mm_slli_epi32( qt[18], 5 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] )); + dH[ 3] = _mm_add_epi32( + _mm_xor_si128( M[3], + _mm_xor_si128( _mm_srli_epi32( xh, 1 ), + _mm_slli_epi32( qt[19], 5 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] )); + dH[ 4] = _mm_add_epi32( + _mm_xor_si128( M[4], + _mm_xor_si128( _mm_srli_epi32( xh, 3 ), + _mm_slli_epi32( qt[20], 0 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] )); + dH[ 5] = _mm_add_epi32( + _mm_xor_si128( M[5], + _mm_xor_si128( _mm_slli_epi32( xh, 6 ), + _mm_srli_epi32( qt[21], 6 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] )); + dH[ 6] = _mm_add_epi32( + _mm_xor_si128( M[6], + _mm_xor_si128( _mm_srli_epi32( xh, 4 ), + _mm_slli_epi32( qt[22], 6 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] )); + dH[ 7] = _mm_add_epi32( + _mm_xor_si128( M[7], + _mm_xor_si128( _mm_srli_epi32( xh, 11 ), + _mm_slli_epi32( qt[23], 2 ) ) ), + _mm_xor_si128( 
_mm_xor_si128( xl, qt[31] ), qt[ 7] )); + dH[ 8] = _mm_add_epi32( _mm_add_epi32( + mm_rotl_32( dH[4], 9 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )), + _mm_xor_si128( _mm_slli_epi32( xl, 8 ), + _mm_xor_si128( qt[23], qt[ 8] ) ) ); + dH[ 9] = _mm_add_epi32( _mm_add_epi32( + mm_rotl_32( dH[5], 10 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )), + _mm_xor_si128( _mm_srli_epi32( xl, 6 ), + _mm_xor_si128( qt[16], qt[ 9] ) ) ); + dH[10] = _mm_add_epi32( _mm_add_epi32( + mm_rotl_32( dH[6], 11 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )), + _mm_xor_si128( _mm_slli_epi32( xl, 6 ), + _mm_xor_si128( qt[17], qt[10] ) ) ); + dH[11] = _mm_add_epi32( _mm_add_epi32( + mm_rotl_32( dH[7], 12 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )), + _mm_xor_si128( _mm_slli_epi32( xl, 4 ), + _mm_xor_si128( qt[18], qt[11] ) ) ); + dH[12] = _mm_add_epi32( _mm_add_epi32( + mm_rotl_32( dH[0], 13 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )), + _mm_xor_si128( _mm_srli_epi32( xl, 3 ), + _mm_xor_si128( qt[19], qt[12] ) ) ); + dH[13] = _mm_add_epi32( _mm_add_epi32( + mm_rotl_32( dH[1], 14 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )), + _mm_xor_si128( _mm_srli_epi32( xl, 4 ), + _mm_xor_si128( qt[20], qt[13] ) ) ); + dH[14] = _mm_add_epi32( _mm_add_epi32( + mm_rotl_32( dH[2], 15 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )), + _mm_xor_si128( _mm_srli_epi32( xl, 7 ), + _mm_xor_si128( qt[21], qt[14] ) ) ); + dH[15] = _mm_add_epi32( _mm_add_epi32( + mm_rotl_32( dH[3], 16 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )), + _mm_xor_si128( _mm_srli_epi32( xl, 2 ), + _mm_xor_si128( qt[22], qt[15] ) ) ); +} + +// BMW512 #define Wb0 \ _mm256_add_epi64( \ @@ -564,6 +765,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) qt[29] = expand2b( qt, M, H, 29 ); qt[30] = expand2b( qt, M, H, 30 ); qt[31] = expand2b( qt, M, H, 31 ); + xl = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ), _mm256_xor_si256( qt[18], qt[19] ) ), @@ -575,6 +777,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) _mm256_xor_si256( qt[26], qt[27] ) ), _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ), _mm256_xor_si256( qt[30], qt[31] ) ))); + dH[ 0] = _mm256_add_epi64( _mm256_xor_si256( M[0], _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ), @@ -657,137 +860,113 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) _mm256_xor_si256( qt[22], qt[15] ) ) ); } -#endif // 64 +// BMW256 -//#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH) - - -/* -static void -compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16]) +static const __m128i final_s[16] = { -#define M(x) sph_dec32le_aligned(data + 4 * (x)) -#define H(x) (h[x]) -#define dH(x) (dh[x]) - - FOLDs; - -#undef M -#undef H -#undef dH -} - -static const sph_u32 final_s[16] = { - SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2), - SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5), - SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8), - SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab), - SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae), - SPH_C32(0xaaaaaaaf) + { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 }, + { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 }, + { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 }, + { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 }, + { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 }, + { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 }, + { 
0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 }, + { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 }, + { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 }, + { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 }, + { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }, + { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab }, + { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac }, + { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad }, + { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae }, + { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf } }; static void bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv) { - memcpy(sc->H, iv, sizeof sc->H); - sc->ptr = 0; -#if SPH_64 - sc->bit_count = 0; -#else - sc->bit_count_high = 0; - sc->bit_count_low = 0; -#endif + for ( int i = 0; i < 16; i++ ) + sc->H[i] = _mm_set1_epi32( iv[i] ); + sc->ptr = 0; + sc->bit_count = 0; } static void bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) { - unsigned char *buf; - size_t ptr; - sph_u32 htmp[16]; - sph_u32 *h1, *h2; -#if !SPH_64 - sph_u32 tmp; -#endif + __m128i *vdata = (__m128i*)data; + __m128i *buf; + __m128i htmp[16]; + __m128i *h1, *h2; + size_t ptr; + const int buf_size = 64; // bytes of one lane, compatible with len -#if SPH_64 - sc->bit_count += (sph_u64)len << 3; -#else - tmp = sc->bit_count_low; - sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3)); - if (sc->bit_count_low < tmp) - sc->bit_count_high ++; - sc->bit_count_high += len >> 29; -#endif - buf = sc->buf; - ptr = sc->ptr; - h1 = sc->H; - h2 = htmp; - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - data = (const unsigned char *)data + clen; - len -= clen; - ptr += clen; - if (ptr == sizeof sc->buf) { - sph_u32 *ht; - - compress_small(buf, h1, h2); - ht = h1; - h1 = h2; - h2 = ht; - ptr = 0; - } - } - sc->ptr = ptr; - if (h1 != sc->H) - memcpy(sc->H, h1, sizeof sc->H); + sc->bit_count += (sph_u64)len << 3; + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_128( buf + (ptr>>2), vdata, clen >> 2 ); + vdata += ( clen >> 2 ); + len -= clen; + ptr += clen; + if ( ptr == buf_size ) + { + __m128i *ht; + compress_small( buf, h1, h2 ); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if ( h1 != sc->H ) + memcpy_128( sc->H, h1, 16 ); } static void bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, void *dst, size_t out_size_w32) { - unsigned char *buf, *out; - size_t ptr, u, v; - unsigned z; - sph_u32 h1[16], h2[16], *h; + __m128i *buf; + __m128i h1[16], h2[16], *h; + size_t ptr, u, v; + unsigned z; + const int buf_size = 64; // bytes of one lane, compatible with len - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - buf[ptr ++] = ((ub & -z) | z) & 0xFF; - h = sc->H; - if (ptr > (sizeof sc->buf) - 8) { - memset(buf + ptr, 0, (sizeof sc->buf) - ptr); - compress_small(buf, h, h1); - ptr = 0; - h = h1; - } - memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr); -#if SPH_64 - sph_enc64le_aligned(buf + (sizeof sc->buf) - 8, - SPH_T64(sc->bit_count + n)); -#else - sph_enc32le_aligned(buf + (sizeof sc->buf) - 8, - sc->bit_count_low + n); - sph_enc32le_aligned(buf + (sizeof sc->buf) - 4, - SPH_T32(sc->bit_count_high)); -#endif - compress_small(buf, h, h2); - for (u = 0; u < 16; u ++) - sph_enc32le_aligned(buf + 4 * u, h2[u]); - compress_small(buf, final_s, h1); - out = dst; - for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) - sph_enc32le(out + 4 * u, h1[v]); + buf 
= sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ ptr>>2 ] = _mm_set1_epi32( z ); + ptr += 4; + h = sc->H; + + // assume bit_count fits in 32 bits + if ( ptr > buf_size - 4 ) + { + memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 ); + compress_small( buf, h, h1 ); + ptr = 0; + h = h1; + } + memset_zero_128( buf + (ptr>>2), (buf_size - 4 - ptr) >> 2 ); + buf[ (buf_size - 4) >> 2 ] = _mm_set1_epi32( sc->bit_count + n ); + compress_small( buf, h, h2 ); + for ( u = 0; u < 16; u ++ ) + buf[u] = h2[u]; + compress_small( buf, final_s, h1 ); + for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) + casti_m128i( dst, u ) = h1[v]; } -*/ -#if SPH_64 + +// BMW512 static const __m256i final_b[16] = { @@ -908,33 +1087,33 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n, casti_m256i(dst,u) = h1[v]; } -#endif +// BMW256 void bmw256_4way_init(void *cc) { -// bmw32_4way_init(cc, IV256); + bmw32_4way_init(cc, IV256); } void bmw256_4way(void *cc, const void *data, size_t len) { -// bmw32_4way(cc, data, len); + bmw32_4way(cc, data, len); } void bmw256_4way_close(void *cc, void *dst) { -// bmw256_4way_addbits_and_close(cc, 0, 0, dst); + bmw256_4way_addbits_and_close(cc, 0, 0, dst); } void bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { -// bmw32_4way_close(cc, ub, n, dst, 8); + bmw32_4way_close(cc, ub, n, dst, 8); } -#if SPH_64 +// BMW512 void bmw512_4way_init(void *cc) @@ -960,10 +1139,8 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) bmw64_4way_close(cc, ub, n, dst, 8); } -#endif - #ifdef __cplusplus } #endif -#endif +#endif // __AVX2__ diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h index f22a30a..1bd3098 100644 --- a/algo/bmw/bmw-hash-4way.h +++ b/algo/bmw/bmw-hash-4way.h @@ -46,94 +46,37 @@ extern "C"{ #include "algo/sha/sph_types.h" #include "avxdefs.h" -/** - * Output size (in bits) for BMW-224. - */ -#define SPH_SIZE_bmw224 224 - -/** - * Output size (in bits) for BMW-256. - */ #define SPH_SIZE_bmw256 256 -#if SPH_64 - -/** - * Output size (in bits) for BMW-384. - */ -#define SPH_SIZE_bmw384 384 - -/** - * Output size (in bits) for BMW-512. - */ #define SPH_SIZE_bmw512 512 -#endif - -/** - * This structure is a context for BMW-224 and BMW-256 computations: - * it contains the intermediate values and some data from the last - * entered block. Once a BMW computation has been performed, the - * context can be reused for another computation. - * - * The contents of this structure are private. A running BMW - * computation can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - size_t ptr; - sph_u32 H[16]; -#if SPH_64 - sph_u64 bit_count; -#else - sph_u32 bit_count_high, bit_count_low; -#endif -#endif + __m128i buf[64]; + __m128i H[16]; + size_t ptr; + sph_u32 bit_count; // assume bit_count fits in 32 bits } bmw_4way_small_context; typedef bmw_4way_small_context bmw256_4way_context; -#if SPH_64 - -/** - * This structure is a context for BMW-384 and BMW-512 computations: - * it contains the intermediate values and some data from the last - * entered block. Once a BMW computation has been performed, the - * context can be reused for another computation. - * - * The contents of this structure are private. A running BMW - * computation can be cloned by copying the context (e.g. with a simple - * memcpy()). 
- */ typedef struct { -#ifndef DOXYGEN_IGNORE __m256i buf[16]; __m256i H[16]; - -// unsigned char buf[128]; /* first field, for alignment */ - size_t ptr; -// sph_u64 H[16]; - sph_u64 bit_count; -#endif + size_t ptr; + sph_u64 bit_count; } bmw_4way_big_context; typedef bmw_4way_big_context bmw512_4way_context; -#endif - void bmw256_4way_init(void *cc); void bmw256_4way(void *cc, const void *data, size_t len); void bmw256_4way_close(void *cc, void *dst); -void bmw256_addbits_and_close( +void bmw256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); -#if SPH_64 - void bmw512_4way_init(void *cc); void bmw512_4way(void *cc, const void *data, size_t len); @@ -150,5 +93,3 @@ void bmw512_4way_addbits_and_close( #endif #endif - -#endif diff --git a/algo/jh/jha-4way.c b/algo/jh/jha-4way.c index 6f4dea2..83029a2 100644 --- a/algo/jh/jha-4way.c +++ b/algo/jh/jha-4way.c @@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhash0[8*4] __attribute__ ((aligned (64))); - uint64_t vhash1[8*4] __attribute__ ((aligned (64))); - __m256i mask, mask0, mask1; - __m256i* vh = (__m256i*)vhash; - __m256i* vh0 = (__m256i*)vhash0; - __m256i* vh1 = (__m256i*)vhash1; + uint64_t vhashA[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*4] __attribute__ ((aligned (64))); + __m256i* vh = (__m256i*)vhash; + __m256i* vhA = (__m256i*)vhashA; + __m256i* vhB = (__m256i*)vhashB; + __m256i vh_mask; blake512_4way_context ctx_blake; hashState_groestl ctx_groestl; @@ -40,127 +40,69 @@ void jha_hash_4way( void *out, const void *input ) keccak512_4way( &ctx_keccak, input, 80 ); keccak512_4way_close( &ctx_keccak, vhash ); -// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid ); -// keccak512_4way( &ctx_keccak, input + (64<<2), 16 ); -// keccak512_4way_close( &ctx_keccak, vhash ); - // Heavy & Light Pair Loop for ( int round = 0; round < 3; round++ ) { - // select next function based on bit 0 of previous hash. - // Specutively execute both functions and use mask to - // select results from correct function for each lane. - // hash = mask : vhash0 ? 
vhash1
-      mask = mm256_negate_64(
-                  _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
-
-// second version
-// mask0 = mask
-// mask1 = mm256_not( mask );
-
-// first version
-//    mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
-//         _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
-
-      // groestl (serial) vs skein
+      // Speculatively run both candidate functions (Groestl and Skein) on
+      // every lane, then blend: the mask is all ones where bit 0 of the
+      // lane's previous hash is clear, and those lanes take the Skein
+      // result while the rest take Groestl.
+      vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
+                        vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash0,
-                                 (char*)hash0, 512 );
+                                (char*)hash0, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash1,
-                                 (char*)hash1, 512 );
+                                (char*)hash1, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash2,
-                                 (char*)hash2, 512 );
+                                (char*)hash2, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash3,
-                                 (char*)hash3, 512 );
-
-      mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
-
-      // skein
+                                (char*)hash3, 512 );
+      mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

       skein512_4way_init( &ctx_skein );
       skein512_4way( &ctx_skein, vhash, 64 );
-      skein512_4way_close( &ctx_skein, vhash1 );
+      skein512_4way_close( &ctx_skein, vhashB );

-      // merge vectored hash
       for ( int i = 0; i < 8; i++ )
-      {
-         // blend should be faster
-         vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );
-
-// second version
-//       vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-//                                _mm256_and_si256( vh1[i], mask1 ) );
-
-// first version
-/*
-         vh0[i] = _mm256_maskload_epi64(
-                      vhash0 + i*4, mm256_not( mask ) );
-         vh1[i] = _mm256_maskload_epi64(
-                      vhash1 + i*4, mask );
-         vh[i] = _mm256_or_si256( vh0[i], vh1[i] );
-*/
-      }
-
-      // blake v jh
+         vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );

       blake512_4way_init( &ctx_blake );
       blake512_4way( &ctx_blake, vhash, 64 );
-      blake512_4way_close( &ctx_blake, vhash0 );
+      blake512_4way_close( &ctx_blake, vhashA );

       jh512_4way_init( &ctx_jh );
       jh512_4way( &ctx_jh, vhash, 64 );
-      jh512_4way_close( &ctx_jh, vhash1 );
+      jh512_4way_close( &ctx_jh, vhashB );

-      // merge hash
       for ( int i = 0; i < 8; i++ )
-      {
-         vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-                                  _mm256_and_si256( vh1[i], mask1 ) );
-/*
-         vha256[i] = _mm256_maskload_epi64(
-                        vhasha + i*4, mm256_not( mask ) );
-         vhb256[i] = _mm256_maskload_epi64(
-                        vhashb + i*4, mask );
-         vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
-*/
-      }
+         vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    }

    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
-
-// memcpy( output, hash0, 32 );
-// memcpy( output+32, hash1, 32 );
-// memcpy( output+64, hash2, 32 );
-// memcpy( output+96, hash3, 32 );
-
 }

 int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done )
 {
-   uint32_t hash[8*4] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t endiandata[20] __attribute__((aligned(64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
-   uint32_t n = pdata[19];
-   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
-   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
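+   // hash[] holds the four 256-bit lane results back to back; vdata is the
+   // 80-byte block header interleaved 4x64 for the vectored hash functions.
+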
uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + uint32_t n = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; - uint64_t htmax[] = { + uint64_t htmax[] = { 0, 0xF, 0xFF, @@ -168,7 +110,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, @@ -177,17 +119,12 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce, 0 }; - // we need bigendian data... for ( int i=0; i < 19; i++ ) be32enc( &endiandata[i], pdata[i] ); uint64_t *edata = (uint64_t*)endiandata; mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - // precalc midstate for keccak -// keccak512_4way_init( &jha_kec_mid ); -// keccak512_4way( &jha_kec_mid, vdata, 64 ); - for ( int m = 0; m < 6; m++ ) { if ( Htarg <= htmax[m] ) @@ -201,7 +138,6 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce, be32enc( noncep3, n+3 ); jha_hash_4way( hash, vdata ); - pdata[19] = n; if ( ( !(hash[7] & mask) ) @@ -239,11 +175,9 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce, n += 4; } while ( ( num_found == 0 ) && ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; } } - *hashes_done = n - first_nonce + 1; return num_found; } diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c new file mode 100644 index 0000000..324b31c --- /dev/null +++ b/algo/lyra2/lyra2h-4way.c @@ -0,0 +1,128 @@ +#include "lyra2h-gate.h" + +#ifdef LYRA2H_4WAY + +#include +#include +#include "lyra2.h" +#include "algo/blake/sph_blake.h" +#include "algo/blake/blake-hash-4way.h" + +__thread uint64_t* lyra2h_4way_matrix; + +bool lyra2h_4way_thread_init() +{ + return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) ); +} + +static __thread blake256_4way_context l2h_4way_blake_mid; + +void lyra2h_4way_midstate( const void* input ) +{ + blake256_4way_init( &l2h_4way_blake_mid ); + blake256_4way( &l2h_4way_blake_mid, input, 64 ); +} + +void lyra2h_4way_hash( void *state, const void *input ) +{ + uint32_t hash0[8] __attribute__ ((aligned (64))); + uint32_t hash1[8] __attribute__ ((aligned (64))); + uint32_t hash2[8] __attribute__ ((aligned (64))); + uint32_t hash3[8] __attribute__ ((aligned (64))); + uint32_t vhash[8*4] __attribute__ ((aligned (64))); + blake256_4way_context ctx_blake __attribute__ ((aligned (64))); + + memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid ); + blake256_4way( &ctx_blake, input + (64*4), 16 ); + blake256_4way_close( &ctx_blake, vhash ); + + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); + + LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 ); + LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 ); + LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 ); + LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_lyra2h_4way( int thr_id, struct work 
*work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 76;   // 19*4
+   uint32_t *noncep1 = vdata + 77;
+   uint32_t *noncep2 = vdata + 78;
+   uint32_t *noncep3 = vdata + 79;
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+
+   for ( int i=0; i < 19; i++ )
+      be32enc( &edata[i], pdata[i] );
+
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+
+   lyra2h_4way_midstate( vdata );
+
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+
+      lyra2h_4way_hash( hash, vdata );
+
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      {
+         found[0] = true;
+         num_found++;
+         nonces[0] = pdata[19] = n;
+         work_set_target_ratio( work, hash );
+      }
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+      {
+         found[1] = true;
+         num_found++;
+         nonces[1] = n+1;
+         work_set_target_ratio( work, hash+8 );
+      }
+      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
+      {
+         found[2] = true;
+         num_found++;
+         nonces[2] = n+2;
+         work_set_target_ratio( work, hash+16 );
+      }
+      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
+      {
+         found[3] = true;
+         num_found++;
+         nonces[3] = n+3;
+         work_set_target_ratio( work, hash+24 );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+             && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
diff --git a/algo/lyra2/lyra2h-gate.c b/algo/lyra2/lyra2h-gate.c
new file mode 100644
index 0000000..4aaca82
--- /dev/null
+++ b/algo/lyra2/lyra2h-gate.c
@@ -0,0 +1,25 @@
+#include "lyra2h-gate.h"
+#include "lyra2.h"
+
+void lyra2h_set_target( struct work* work, double job_diff )
+{
+   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool register_lyra2h_algo( algo_gate_t* gate )
+{
+#ifdef LYRA2H_4WAY
+  gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2h_4way;
+  gate->hash       = (void*)&lyra2h_4way_hash;
+#else
+  gate->miner_thread_init = (void*)&lyra2h_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2h;
+  gate->hash       = (void*)&lyra2h_hash;
+#endif
+  gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  gate->set_target = (void*)&lyra2h_set_target;
+  return true;
+};
+
diff --git a/algo/lyra2/lyra2h-gate.h b/algo/lyra2/lyra2h-gate.h
new file mode 100644
index 0000000..f51c3bf
--- /dev/null
+++ b/algo/lyra2/lyra2h-gate.h
@@ -0,0 +1,32 @@
+#ifndef LYRA2H_GATE_H__
+#define LYRA2H_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY)
+  #define LYRA2H_4WAY
+#endif
+
+#define LYRA2H_MATRIX_SIZE ( BLOCK_LEN_INT64 * 16 * 16 * 8 )
+
+#if defined(LYRA2H_4WAY)
+
+void lyra2h_4way_hash( void *state, const void *input );
+
+int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+
+bool lyra2h_4way_thread_init();
+
+#endif
+
+void lyra2h_hash( void *state, const void *input );
+
+int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t
*hashes_done ); + +bool lyra2h_thread_init(); + +#endif + diff --git a/algo/lyra2/lyra2h.c b/algo/lyra2/lyra2h.c index 51579c3..2c0daf0 100644 --- a/algo/lyra2/lyra2h.c +++ b/algo/lyra2/lyra2h.c @@ -1,6 +1,6 @@ +#include "lyra2h-gate.h" #include #include -#include "algo-gate-api.h" #include "lyra2.h" #include "algo/blake/sph_blake.h" @@ -8,8 +8,7 @@ __thread uint64_t* lyra2h_matrix; bool lyra2h_thread_init() { - const int i = 16 * 16 * 96; - lyra2h_matrix = _mm_malloc( i, 64 ); + lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ); return lyra2h_matrix; } @@ -74,20 +73,3 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce, *hashes_done = pdata[19] - first_nonce + 1; return 0; } - -void lyra2h_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -bool register_lyra2h_algo( algo_gate_t* gate ) -{ - gate->optimizations = AVX_OPT | AVX2_OPT; - gate->miner_thread_init = (void*)&lyra2h_thread_init; - gate->scanhash = (void*)&scanhash_lyra2h; - gate->hash = (void*)&lyra2h_hash; - gate->get_max64 = (void*)&get_max64_0xffffLL; - gate->set_target = (void*)&lyra2h_set_target; - return true; -}; - diff --git a/algo/lyra2/lyra2re.c b/algo/lyra2/lyra2re.c index 4b4a59b..bba19b9 100644 --- a/algo/lyra2/lyra2re.c +++ b/algo/lyra2/lyra2re.c @@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work, { pdata[19] = nonce; *hashes_done = pdata[19] - first_nonce; + work_set_target_ratio( work, hash ); return 1; } } diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c new file mode 100644 index 0000000..95113e7 --- /dev/null +++ b/algo/lyra2/lyra2rev2-4way.c @@ -0,0 +1,177 @@ +#include "lyra2rev2-gate.h" +#include + +#ifdef __AVX2__ + +#include "algo/blake/blake-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/bmw/bmw-hash-4way.h" + +#include "algo/cubehash/sph_cubehash.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" + +typedef struct { + blake256_4way_context blake; + keccak256_4way_context keccak; + cubehashParam cube; + skein256_4way_context skein; + sph_bmw256_context bmw; + +} lyra2v2_4way_ctx_holder; + +static lyra2v2_4way_ctx_holder l2v2_4way_ctx; + +void init_lyra2rev2_4way_ctx() +{ +// blake256_4way_init( &l2v2_4way_ctx.blake ); + keccak256_4way_init( &l2v2_4way_ctx.keccak ); + cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 ); + skein256_4way_init( &l2v2_4way_ctx.skein ); + sph_bmw256_init( &l2v2_4way_ctx.bmw ); +} + +void lyra2rev2_4way_hash( void *state, const void *input ) +{ + uint32_t hash0[8] __attribute__ ((aligned (64))); + uint32_t hash1[8] __attribute__ ((aligned (32))); + uint32_t hash2[8] __attribute__ ((aligned (32))); + uint32_t hash3[8] __attribute__ ((aligned (32))); + uint32_t vhash[8*4] __attribute__ ((aligned (64))); + uint64_t vhash64[4*4] __attribute__ ((aligned (64))); + lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64))); + memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) ); + + blake256_4way( &ctx.blake, input + (64<<2), 16 ); +// blake256_4way( &ctx.blake, input, 80 ); + blake256_4way_close( &ctx.blake, vhash ); + + mm256_reinterleave_4x64( vhash64, vhash, 256 ); + keccak256_4way( &ctx.keccak, vhash64, 32 ); + keccak256_4way_close( &ctx.keccak, vhash64 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); + + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 ); + memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof 
ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 ); + memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 ); + memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 ); + + LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); + LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 ); + LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 ); + LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); + + mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 ); + skein256_4way( &ctx.skein, vhash64, 32 ); + skein256_4way_close( &ctx.skein, vhash64 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); + + memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 ); + memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 ); + memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 ); + memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 ); + + + sph_bmw256( &ctx.bmw, hash0, 32 ); + sph_bmw256_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw ); + sph_bmw256( &ctx.bmw, hash1, 32 ); + sph_bmw256_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw ); + sph_bmw256( &ctx.bmw, hash2, 32 ); + sph_bmw256_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw ); + sph_bmw256( &ctx.bmw, hash3, 32 ); + sph_bmw256_close( &ctx.bmw, hash3 ); + + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[8*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t _ALIGN(64) edata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t Htarg = ptarget[7]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 76; // 19*4 + uint32_t *noncep1 = vdata + 77; + uint32_t *noncep2 = vdata + 78; + uint32_t *noncep3 = vdata + 79; + + if ( opt_benchmark ) + ( (uint32_t*)ptarget )[7] = 0x0000ff; + + swab32_array( edata, pdata, 20 ); + + mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 ); + + blake256_4way_init( &l2v2_4way_ctx.blake ); + blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 ); + + do { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + lyra2rev2_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( hash[7] <= Htarg && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = pdata[19] = n; + work_set_target_ratio( work, hash ); + } + if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + 
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( (num_found == 0) && (n < max_nonce-4) + && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/lyra2/lyra2rev2-gate.c b/algo/lyra2/lyra2rev2-gate.c new file mode 100644 index 0000000..a07b721 --- /dev/null +++ b/algo/lyra2/lyra2rev2-gate.c @@ -0,0 +1,38 @@ +#include "lyra2rev2-gate.h" + +__thread uint64_t* l2v2_wholeMatrix; + +void lyra2rev2_set_target( struct work* work, double job_diff ) +{ + work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); +} + +bool lyra2rev2_thread_init() +{ + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + + int i = (int64_t)ROW_LEN_BYTES * 4; // nRows; + l2v2_wholeMatrix = _mm_malloc( i, 64 ); + + return l2v2_wholeMatrix; +} + +bool register_lyra2rev2_algo( algo_gate_t* gate ) +{ +#if defined (LYRA2REV2_4WAY) + init_lyra2rev2_4way_ctx(); + gate->scanhash = (void*)&scanhash_lyra2rev2_4way; + gate->hash = (void*)&lyra2rev2_4way_hash; +#else + init_lyra2rev2_ctx(); + gate->scanhash = (void*)&scanhash_lyra2rev2; + gate->hash = (void*)&lyra2rev2_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->miner_thread_init = (void*)&lyra2rev2_thread_init; + gate->set_target = (void*)&lyra2rev2_set_target; + return true; +}; + + diff --git a/algo/lyra2/lyra2rev2-gate.h b/algo/lyra2/lyra2rev2-gate.h new file mode 100644 index 0000000..8af8290 --- /dev/null +++ b/algo/lyra2/lyra2rev2-gate.h @@ -0,0 +1,35 @@ +#ifndef LYRA2REV2_GATE_H__ +#define LYRA2REV2_GATE_H__ 1 + +#include "algo-gate-api.h" +#include +#include "lyra2.h" + +#if defined(HASH_4WAY) + #define LYRA2REV2_4WAY +#endif + +extern __thread uint64_t* l2v2_wholeMatrix; + +bool register_lyra2rev2_algo( algo_gate_t* gate ); + +#if defined(LYRA2REV2_4WAY) + +void lyra2rev2_4way_hash( void *state, const void *input ); + +int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_lyra2rev2_4way_ctx(); + +#endif + +void lyra2rev2_hash( void *state, const void *input ); + +int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_lyra2rev2_ctx(); + +#endif + diff --git a/algo/lyra2/lyra2rev2.c b/algo/lyra2/lyra2rev2.c index 1a8a482..658b96d 100644 --- a/algo/lyra2/lyra2rev2.c +++ b/algo/lyra2/lyra2rev2.c @@ -1,20 +1,12 @@ +#include "lyra2rev2-gate.h" #include - -#include "algo-gate-api.h" - #include "algo/blake/sph_blake.h" #include "algo/cubehash/sph_cubehash.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/bmw/sph_bmw.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "lyra2.h" -#include "avxdefs.h" - -// This gets allocated when miner_thread starts up and is never freed. -// It's not a leak because the only way to allocate it again is to exit -// the thread and that only occurs when the entire program exits. 
-__thread uint64_t* l2v2_wholeMatrix; +//#include "lyra2.h" typedef struct { cubehashParam cube1; @@ -106,6 +98,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work, if( fulltest(hash, ptarget) ) { pdata[19] = nonce; + work_set_target_ratio( work, hash ); *hashes_done = pdata[19] - first_nonce; return 1; } @@ -119,30 +112,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work, return 0; } -void lyra2rev2_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -bool lyra2rev2_thread_init() -{ - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - - int i = (int64_t)ROW_LEN_BYTES * 4; // nRows; - l2v2_wholeMatrix = _mm_malloc( i, 64 ); - - return l2v2_wholeMatrix; -} - -bool register_lyra2rev2_algo( algo_gate_t* gate ) -{ - init_lyra2rev2_ctx(); - gate->optimizations = AVX_OPT | AVX2_OPT; - gate->miner_thread_init = (void*)&lyra2rev2_thread_init; - gate->scanhash = (void*)&scanhash_lyra2rev2; - gate->hash = (void*)&lyra2rev2_hash; - gate->set_target = (void*)&lyra2rev2_set_target; - return true; -}; - diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 9dc67df..1e67a71 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -85,8 +85,8 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce, be32enc( noncep2, n+2 ); be32enc( noncep3, n+3 ); - be32enc( &edata[19], n ); lyra2z_4way_hash( hash, vdata ); + pdata[19] = n; if ( hash[7] <= Htarg && fulltest( hash, ptarget ) ) { diff --git a/algo/lyra2/lyra2z.c b/algo/lyra2/lyra2z.c index 59374a1..23c7a85 100644 --- a/algo/lyra2/lyra2z.c +++ b/algo/lyra2/lyra2z.c @@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce, return 0; } -/* -//int64_t get_max64_0xffffLL() { return 0xffffLL; }; - -void lyra2z_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx ) -{ - work->height = sctx->bloc_height; - return false; -} - - -bool lyra2z_thread_init() -{ - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; - - int i = (int64_t)ROW_LEN_BYTES * 8; // nRows; - lyra2z_wholeMatrix = _mm_malloc( i, 64 ); - - return lyra2z_wholeMatrix; -} - -bool register_lyra2z_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - gate->miner_thread_init = (void*)&lyra2z_thread_init; - gate->scanhash = (void*)&scanhash_lyra2z; - gate->hash = (void*)&lyra2z_hash; - gate->get_max64 = (void*)&get_max64_0xffffLL; - gate->set_target = (void*)&lyra2z_set_target; -// gate->prevent_dupes = (void*)&zcoin_get_work_height; - return true; -}; -*/ diff --git a/algo/m7m.c b/algo/m7m.c index c24b246..287e944 100644 --- a/algo/m7m.c +++ b/algo/m7m.c @@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work, hash_str, target_str); } + work_set_target_ratio( work, hash ); pdata[19] = data[19]; goto out; } diff --git a/algo/nist5/nist5.c b/algo/nist5/nist5.c index ef69801..0bbc9a9 100644 --- a/algo/nist5/nist5.c +++ b/algo/nist5/nist5.c @@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work, if (!(hash64[7] & mask)) { printf("[%d]",thr_id); if (fulltest(hash64, ptarget)) { + work_set_target_ratio( work, hash64 ); *hashes_done = n - first_nonce + 1; return true; } diff --git a/algo/nist5/zr5.c 
b/algo/nist5/zr5.c index 7b1705d..32aa022 100644 --- a/algo/nist5/zr5.c +++ b/algo/nist5/zr5.c @@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work, pdata[0] = tmpdata[0]; pdata[19] = nonce; *hashes_done = pdata[19] - first_nonce + 1; + work_set_target_ratio( work, hash ); if (opt_debug) applog(LOG_INFO, "found nonce %x", nonce); return 1; diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c new file mode 100644 index 0000000..1a92e98 --- /dev/null +++ b/algo/quark/quark-4way.c @@ -0,0 +1,207 @@ +#include "cpuminer-config.h" +#include "quark-gate.h" + +#if defined (__AVX2__) && defined (__AES__) + +#include +#include +#include + +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/bmw-hash-4way.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/groestl/aes_ni/hash-groestl.h" + +typedef struct { + blake512_4way_context blake; + bmw512_4way_context bmw; + hashState_groestl groestl; + jh512_4way_context jh; + skein512_4way_context skein; + keccak512_4way_context keccak; +} quark_4way_ctx_holder; + +quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64))); + +void init_quark_4way_ctx() +{ + blake512_4way_init( &quark_4way_ctx.blake ); + bmw512_4way_init( &quark_4way_ctx.bmw ); + init_groestl( &quark_4way_ctx.groestl, 64 ); + skein512_4way_init( &quark_4way_ctx.skein ); + jh512_4way_init( &quark_4way_ctx.jh ); + keccak512_4way_init( &quark_4way_ctx.keccak ); +} + +void quark_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + uint64_t vhashA[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*4] __attribute__ ((aligned (64))); + __m256i* vh = (__m256i*)vhash; + __m256i* vhA = (__m256i*)vhashA; + __m256i* vhB = (__m256i*)vhashB; + __m256i vh_mask; + __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 ); + int i; + quark_4way_ctx_holder ctx; + memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) ); + + blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_close( &ctx.blake, vhash ); + + bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_close( &ctx.bmw, vhash ); + + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), + mm256_zero ); + + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (char*)hash0, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (char*)hash1, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (char*)hash2, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (char*)hash3, 512 ); + mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); + + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhashB ); + + for ( i = 0; i < 8; i++ ) + vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); + + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + reinit_groestl( &ctx.groestl ); + 
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+   reinit_groestl( &ctx.groestl );
+   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+   mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+   jh512_4way( &ctx.jh, vhash, 64 );
+   jh512_4way_close( &ctx.jh, vhash );
+
+   // All-ones lanes in the mask mark hashes with bit 3 clear; blendv then
+   // picks vhashB for those lanes and vhashA for the rest.
+   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                 mm256_zero );
+
+   blake512_4way_init( &ctx.blake );
+   blake512_4way( &ctx.blake, vhash, 64 );
+   blake512_4way_close( &ctx.blake, vhashA );
+
+   bmw512_4way_init( &ctx.bmw );
+   bmw512_4way( &ctx.bmw, vhash, 64 );
+   bmw512_4way_close( &ctx.bmw, vhashB );
+
+   for ( i = 0; i < 8; i++ )
+      vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
+   keccak512_4way( &ctx.keccak, vhash, 64 );
+   keccak512_4way_close( &ctx.keccak, vhash );
+
+   skein512_4way_init( &ctx.skein );
+   skein512_4way( &ctx.skein, vhash, 64 );
+   skein512_4way_close( &ctx.skein, vhash );
+
+   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                 mm256_zero );
+
+   keccak512_4way_init( &ctx.keccak );
+   keccak512_4way( &ctx.keccak, vhash, 64 );
+   keccak512_4way_close( &ctx.keccak, vhashA );
+
+   jh512_4way_init( &ctx.jh );
+   jh512_4way( &ctx.jh, vhash, 64 );
+   jh512_4way_close( &ctx.jh, vhashB );
+
+   for ( i = 0; i < 8; i++ )
+      vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
+   mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+}
+
+int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+   uint32_t *noncep1 = vdata + 75;
+   uint32_t *noncep2 = vdata + 77;
+   uint32_t *noncep3 = vdata + 79;
+
+   swab32_array( endiandata, pdata, 20 );
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do
+   {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+
+      quark_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) )
+      {
+         found[0] = true;
+         num_found++;
+         nonces[0] = n;
+         work_set_target_ratio( work, hash );
+      }
+      if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
+      {
+         found[1] = true;
+         num_found++;
+         nonces[1] = n+1;
+         work_set_target_ratio( work, hash+8 );
+      }
+      if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
+      {
+         found[2] = true;
+         num_found++;
+         nonces[2] = n+2;
+         work_set_target_ratio( work, hash+16 );
+      }
+      if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
+      {
+         found[3] = true;
+         num_found++;
+         nonces[3] = n+3;
+         work_set_target_ratio( work, hash+24 );
+      }
+      n += 4;
+   } while ( ( num_found == 0 ) && ( n < max_nonce )
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
diff --git a/algo/quark/quark-gate.c b/algo/quark/quark-gate.c
new file mode 100644
index 0000000..9356cdb
--- /dev/null
+++ b/algo/quark/quark-gate.c
@@ -0,0 +1,17 @@
+#include "quark-gate.h"
+
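+// Select the 4-way implementation when the build enables it (HASH_4WAY
+// with AES-NI); otherwise fall back to the single-lane reference code.
+bool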
register_quark_algo( algo_gate_t* gate ) +{ +#if defined (QUARK_4WAY) + init_quark_4way_ctx(); + gate->scanhash = (void*)&scanhash_quark_4way; + gate->hash = (void*)&quark_4way_hash; +#else + init_quark_ctx(); + gate->scanhash = (void*)&scanhash_quark; + gate->hash = (void*)&quark_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + return true; +}; + diff --git a/algo/quark/quark-gate.h b/algo/quark/quark-gate.h new file mode 100644 index 0000000..676c6b5 --- /dev/null +++ b/algo/quark/quark-gate.h @@ -0,0 +1,32 @@ +#ifndef QUARK_GATE_H__ +#define QUARK_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define QUARK_4WAY +#endif + +bool register_quark_algo( algo_gate_t* gate ); + +#if defined(QUARK_4WAY) + +void quark_4way_hash( void *state, const void *input ); + +int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_quark_4way_ctx(); + +#endif + +void quark_hash( void *state, const void *input ); + +int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_quark_ctx(); + +#endif + diff --git a/algo/quark/quark.c b/algo/quark/quark.c index 7d9d21e..debad08 100644 --- a/algo/quark/quark.c +++ b/algo/quark/quark.c @@ -1,5 +1,5 @@ #include "cpuminer-config.h" -#include "algo-gate-api.h" +#include "quark-gate.h" #include #include @@ -47,7 +47,7 @@ void init_quark_ctx() #endif } -inline static void quarkhash(void *state, const void *input) +void quark_hash(void *state, const void *input) { unsigned char hashbuf[128]; size_t hashptr; @@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce, do { pdata[19] = ++n; be32enc(&endiandata[19], n); - quarkhash(hash64, &endiandata); + quark_hash(hash64, &endiandata); if ((hash64[7]&0xFFFFFF00)==0) { if (fulltest(hash64, ptarget)) { + work_set_target_ratio( work, hash64 ); *hashes_done = n - first_nonce + 1; return true; } @@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce, return 0; } -bool register_quark_algo( algo_gate_t* gate ) -{ - init_quark_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT; - gate->scanhash = (void*)&scanhash_quark; - gate->hash = (void*)&quarkhash; - return true; -}; - diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c index ed395d9..6c82aad 100644 --- a/algo/qubit/deep.c +++ b/algo/qubit/deep.c @@ -122,6 +122,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce, if (!(hash64[7] & mask)) { printf("[%d]",thr_id); if (fulltest(hash64, ptarget)) { + work_set_target_ratio( work, hash64 ); *hashes_done = n - first_nonce + 1; return true; } diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c index 74d528e..4310f9a 100644 --- a/algo/qubit/qubit.c +++ b/algo/qubit/qubit.c @@ -134,6 +134,7 @@ int scanhash_qubit(int thr_id, struct work *work, if (!(hash64[7] & mask)) { printf("[%d]",thr_id); if (fulltest(hash64, ptarget)) { + work_set_target_ratio( work, hash64 ); *hashes_done = n - first_nonce + 1; return true; } diff --git a/algo/scrypt.c b/algo/scrypt.c index 88066a0..0e268e7 100644 --- a/algo/scrypt.c +++ b/algo/scrypt.c @@ -754,6 +754,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce, if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) { *hashes_done = n - pdata[19] + 1; pdata[19] = data[i * 20 + 19]; + work_set_target_ratio( work, hash ); return 1; } } diff --git 
a/algo/scryptjane/scrypt-jane-chacha.h b/algo/scryptjane/scrypt-jane-chacha.h index c4d44c2..128e347 100644 --- a/algo/scryptjane/scrypt-jane-chacha.h +++ b/algo/scryptjane/scrypt-jane-chacha.h @@ -114,7 +114,7 @@ available_implementations() { return flags; } #endif - +/* static int scrypt_test_mix() { static const uint8_t expected[16] = { @@ -145,4 +145,4 @@ scrypt_test_mix() { return ret; } - +*/ diff --git a/algo/scryptjane/scrypt-jane-hash.h b/algo/scryptjane/scrypt-jane-hash.h index db5c1db..264eb48 100644 --- a/algo/scryptjane/scrypt-jane-hash.h +++ b/algo/scryptjane/scrypt-jane-hash.h @@ -26,7 +26,7 @@ #include "scrypt-jane-pbkdf2.h" #define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ - +/* static int scrypt_test_hash() { scrypt_hash_state st; @@ -45,4 +45,4 @@ scrypt_test_hash() { scrypt_hash_finish(&st, final); return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); } - +*/ diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index ae1bf98..8f48fa5 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -342,17 +342,6 @@ do { \ do { \ sph_u64 t0, t1, t2; \ __m256i h8; \ -/* can LE be assumed? \ - dec64le does nothing when SPH_LITTLE endian is set, as it is. \ - __m256i m0 = _mm256_dec64le( buf ); \ - __m256i m1 = _mm256_dec64le( buf + 8*4 ); \ - __m256i m2 = _mm256_dec64le( buf + 16*4 ); \ - __m256i m3 = _mm256_dec64le( buf + 24*4 ); \ - __m256i m4 = _mm256_dec64le( buf + 32*4 ); \ - __m256i m5 = _mm256_dec64le( buf + 40*4 ); \ - __m256i m6 = _mm256_dec64le( buf + 48*4 ); \ - __m256i m7 = _mm256_dec64le( buf + 56*4 ); \ -*/ \ __m256i m0 = buf[0]; \ __m256i m1 = buf[1]; \ __m256i m2 = buf[2]; \ diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index 2d858f3..8dba423 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -39,7 +39,9 @@ */ #ifndef __SKEIN_HASH_4WAY_H__ -#define __SKEIN_HASH_4WAY_H__ +#define __SKEIN_HASH_4WAY_H__ 1 + +#ifdef __AVX2__ #ifdef __cplusplus extern "C"{ @@ -53,14 +55,15 @@ extern "C"{ #define SPH_SIZE_skein256 256 #define SPH_SIZE_skein512 512 -#ifdef __AVX2__ - typedef struct { __m256i buf[8] __attribute__ ((aligned (32))); __m256i h0, h1, h2, h3, h4, h5, h6, h7; size_t ptr; sph_u64 bcount; -} skein512_4way_context; +} sph_skein_4way_big_context; + +typedef sph_skein_4way_big_context skein512_4way_context; +typedef sph_skein_4way_big_context skein256_4way_context; void skein512_4way_init(void *cc); void skein512_4way(void *cc, const void *data, size_t len); @@ -68,26 +71,15 @@ void skein512_4way_close(void *cc, void *dst); //void sph_skein512_addbits_and_close( // void *cc, unsigned ub, unsigned n, void *dst); -#endif - -#ifdef __AVX__ - -typedef struct { - __m128i buf[8] __attribute__ ((aligned (32))); - __m128i h0, h1, h2, h3, h4, h5, h6, h7; - size_t ptr; - sph_u64 bcount; -} skein256_4way_context; - void skein256_4way_init(void *cc); void skein256_4way(void *cc, const void *data, size_t len); void skein256_4way_close(void *cc, void *dst); //void sph_skein256_addbits_and_close( // void *cc, unsigned ub, unsigned n, void *dst); -#endif #ifdef __cplusplus } #endif #endif +#endif diff --git a/algo/sm3/sm3-hash-4way.c b/algo/sm3/sm3-hash-4way.c new file mode 100644 index 0000000..c970d54 --- /dev/null +++ b/algo/sm3/sm3-hash-4way.c @@ -0,0 +1,231 @@ +/* ==================================================================== + * Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the GmSSL Project. + * (http://gmssl.org/)" + * + * 4. The name "GmSSL Project" must not be used to endorse or promote + * products derived from this software without prior written + * permission. For written permission, please contact + * guanzhi1980@gmail.com. + * + * 5. Products derived from this software may not be called "GmSSL" + * nor may "GmSSL" appear in their names without prior written + * permission of the GmSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the GmSSL Project + * (http://gmssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ====================================================================
+ */
+
+#include <string.h>
+#include "sm3-hash-4way.h"
+
+#ifdef __AVX__
+
+void sm3_4way_init( sm3_4way_ctx_t *ctx )
+{
+   ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
+   ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
+   ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
+   ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
+   ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
+   ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
+   ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
+   ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
+   ctx->nblocks = 0;
+   ctx->num = 0;
+}
+
+void sm3_4way( void *cc, const void *data, size_t len )
+{
+   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
+   __m128i *block = (__m128i*)ctx->block;
+   __m128i *vdata = (__m128i*)data;
+
+   if ( ctx->num )
+   {
+      unsigned int left = SM3_BLOCK_SIZE - ctx->num;
+      if ( len < left )
+      {
+         memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
+         ctx->num += len;
+         return;
+      }
+      else
+      {
+         memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
+         sm3_4way_compress( ctx->digest, block );
+         ctx->nblocks++;
+         vdata += left>>2;
+         len -= left;
+      }
+   }
+   while ( len >= SM3_BLOCK_SIZE )
+   {
+      sm3_4way_compress( ctx->digest, vdata );
+      ctx->nblocks++;
+      vdata += SM3_BLOCK_SIZE>>2;
+      len -= SM3_BLOCK_SIZE;
+   }
+   ctx->num = len;
+   if ( len )
+      memcpy_128( block, vdata, len>>2 );
+}
+
+void sm3_4way_close( void *cc, void *dst )
+{
+   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
+   __m128i *hash = (__m128i*)dst;
+   __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
+   __m128i *block = (__m128i*)ctx->block;
+   int i;
+
+   // pad with a 0x80 byte: block[] is indexed in 32-bit words, num in bytes
+   block[ctx->num >> 2] = _mm_set1_epi32( 0x80 );
+
+   if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
+   {
+      memset_zero_128( block + (ctx->num >> 2) + 1,
+                       ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
+   }
+   else
+   {
+      memset_zero_128( block + (ctx->num >> 2) + 1,
+                       ( (SM3_BLOCK_SIZE >> 2) - (ctx->num >> 2) - 1 ) );
+      sm3_4way_compress( ctx->digest, block );
+      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
+   }
+
+   // 64-bit bit count, big endian, in the last two words of the block
+   count[0] = mm_byteswap_32(
+                    _mm_set1_epi32( ctx->nblocks >> 23 ) );
+   count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
+                                              ( ctx->num << 3 ) ) );
+   sm3_4way_compress( ctx->digest, block );
+
+   for ( i = 0; i < 8 ; i++ )
+      hash[i] = mm_byteswap_32( ctx->digest[i] );
+}
+
+#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x,  9 ), \
+                                               mm_rotl_32( x, 17 ) ) )
+#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 15 ), \
+                                               mm_rotl_32( x, 23 ) ) )
+
+#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
+#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
+                                               _mm_and_si128( x, z ) ), \
+                                 _mm_and_si128( y, z ) )
+
+#define GG0(x,y,z) FF0(x,y,z)
+#define GG1(x,y,z) _mm_or_si128( _mm_and_si128( x, y ), \
+                                 _mm_andnot_si128( x, z ) )
+
+
+void sm3_4way_compress( __m128i *digest, __m128i *block )
+{
+   __m128i W[68], W1[64];
+   __m128i A = digest[ 0 ];
+   __m128i B = digest[ 1 ];
+   __m128i C = digest[ 2 ];
+   __m128i D = digest[ 3 ];
+   __m128i E = digest[ 4 ];
+   __m128i F = digest[ 5 ];
+   __m128i G = digest[ 6 ];
+   __m128i H = digest[ 7 ];
+   __m128i SS1, SS2, TT1, TT2, T;
+   int j;
+
+   for ( j = 0; j < 16; j++ )
+      W[j] = mm_byteswap_32( block[j] );
+
+   for ( j = 16; j < 68; j++ )
+      W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
+                                                              W[ j-9 ] ),
+                                               mm_rotl_32( W[ j-3 ], 15 ) ) ),
+                            _mm_xor_si128( mm_rotl_32( W[ j-13 ], 7 ),
+                                           W[ j-6 ] ) );
+
+   for( j = 0; j < 64; j++ )
+      W1[j] = _mm_xor_si128( W[j], W[j+4] );
+
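+   // Rounds 0-15 use round constant 0x79CC4519 with FF0/GG0; rounds 16-63
+   // switch to 0x7A879D8A with FF1/GG1, per the SM3 specification.
+   T =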
_mm_set1_epi32( 0x79CC4519UL ); + for( j =0; j < 16; j++ ) + { + SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ), + mm_rotl_32( T, j ) ), 7 ); + SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) ); + TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ), + SS2 ), W1[j] ); + TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ), + SS1 ), W[j] ); + D = C; + C = mm_rotl_32( B, 9 ); + B = A; + A = TT1; + H = G; + G = mm_rotl_32( F, 19 ); + F = E; + E = P0( TT2 ); + } + + T = _mm_set1_epi32( 0x7A879D8AUL ); + for( j =16; j < 64; j++ ) + { + SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ), + mm_rotl_32( T, j&31 ) ), 7 ); + SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) ); + TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ), + SS2 ), W1[j] ); + TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ), + SS1 ), W[j] ); + D = C; + C = mm_rotl_32( B, 9 ); + B = A; + A = TT1; + H = G; + G = mm_rotl_32( F, 19 ); + F = E; + E = P0( TT2 ); + } + + digest[0] = _mm_xor_si128( digest[0], A ); + digest[1] = _mm_xor_si128( digest[1], B ); + digest[2] = _mm_xor_si128( digest[2], C ); + digest[3] = _mm_xor_si128( digest[3], D ); + digest[4] = _mm_xor_si128( digest[4], E ); + digest[5] = _mm_xor_si128( digest[5], F ); + digest[6] = _mm_xor_si128( digest[6], G ); + digest[7] = _mm_xor_si128( digest[7], H ); +} + +#endif + diff --git a/algo/sm3/sm3-hash-4way.h b/algo/sm3/sm3-hash-4way.h new file mode 100644 index 0000000..423bda7 --- /dev/null +++ b/algo/sm3/sm3-hash-4way.h @@ -0,0 +1,89 @@ +/* ==================================================================== + * Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the GmSSL Project. + * (http://gmssl.org/)" + * + * 4. The name "GmSSL Project" must not be used to endorse or promote + * products derived from this software without prior written + * permission. For written permission, please contact + * guanzhi1980@gmail.com. + * + * 5. Products derived from this software may not be called "GmSSL" + * nor may "GmSSL" appear in their names without prior written + * permission of the GmSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the GmSSL Project + * (http://gmssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE GmSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#ifndef SPH_SM3_HASH_4WAY_H +#define SPH_SM3_HASH_4WAY_H + +#define SM3_DIGEST_LENGTH 32 +#define SM3_BLOCK_SIZE 64 +#define SM3_CBLOCK (SM3_BLOCK_SIZE) +#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH) + + +#include +#include +#include +#include "avxdefs.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef struct { + __m128i block[16] __attribute__ ((aligned (64))); + __m128i digest[8]; + uint32_t nblocks; + uint32_t num; +} sm3_4way_ctx_t; + +void sm3_4way_init( sm3_4way_ctx_t *ctx ); +//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data, +// size_t data_len ); +//void sm3_4way_final( sm3_4way_ctx_t *ctx, +// unsigned char digest[SM3_DIGEST_LENGTH] ); +void sm3_4way_compress( __m128i *digest, __m128i *block ); + +void sm3_4way(void *cc, const void *data, size_t len); +void sm3_4way_close(void *cc, void *dst); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/algo/sm3/sm3.c b/algo/sm3/sm3.c index e5c5805..aea56cb 100644 --- a/algo/sm3/sm3.c +++ b/algo/sm3/sm3.c @@ -189,7 +189,7 @@ void sm3_compress(uint32_t digest[8], const unsigned char block[64]) for(j =16; j < 64; j++) { T[j] = 0x7A879D8A; - SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7); + SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j&31)), 7); SS2 = SS1 ^ ROTATELEFT(A,12); TT1 = FF1(A,B,C) + D + SS2 + W1[j]; TT2 = GG1(E,F,G) + H + SS1 + W[j]; diff --git a/algo/whirlpool/sph_whirlpool.c b/algo/whirlpool/sph_whirlpool.c index 22d5e64..e48536f 100644 --- a/algo/whirlpool/sph_whirlpool.c +++ b/algo/whirlpool/sph_whirlpool.c @@ -3468,9 +3468,10 @@ sph_ ## name ## _close(void *cc, void *dst) \ for (i = 0; i < 8; i ++) \ sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \ } -// sph_ ## name ## _init(cc); \ -//} - +/* + sph_ ## name ## _init(cc); \ +} +*/ MAKE_CLOSE(whirlpool) MAKE_CLOSE(whirlpool0) MAKE_CLOSE(whirlpool1) diff --git a/algo/whirlpool/whirlpool-gate.h b/algo/whirlpool/whirlpool-gate.h index 3d187bb..adf2994 100644 --- a/algo/whirlpool/whirlpool-gate.h +++ b/algo/whirlpool/whirlpool-gate.h @@ -22,6 +22,7 @@ void whirlpool_hash( void *state, const void *input ); int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done ); +void init_whirlpool_ctx(); #endif #endif diff --git a/algo/whirlpool/whirlpool-hash-4way.c b/algo/whirlpool/whirlpool-hash-4way.c index 81327fa..ae40d20 100644 --- a/algo/whirlpool/whirlpool-hash-4way.c +++ b/algo/whirlpool/whirlpool-hash-4way.c @@ -3345,8 +3345,10 @@ do { \ #define READ_STATE MUL8(READ_STATE_W) #define ROUND0 MUL8(ROUND0_W) #define UPDATE_STATE MUL8(UPDATE_STATE_W) -//#define BYTE(x, n) \ -// _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) ) +/* +#define BYTE(x, n) \ + _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) ) +*/ #define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF) diff --git a/algo/x11/c11.c b/algo/x11/c11.c 
index 07dc774..b26791d 100644
--- a/algo/x11/c11.c
+++ b/algo/x11/c11.c
@@ -162,7 +162,8 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
 {
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce;
-   return 1;
+   work_set_target_ratio( work, hash );
+   return 1;
 }
 nonce++;
 } while ( nonce < max_nonce && !(*restart) );
diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c
new file mode 100644
index 0000000..ea7dc6c
--- /dev/null
+++ b/algo/x11/timetravel-4way.c
@@ -0,0 +1,274 @@
+#include "timetravel-gate.h"
+
+#if defined(__AVX2__) && defined(__AES__)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
+
+typedef struct {
+   blake512_4way_context blake;
+   bmw512_4way_context bmw;
+   hashState_groestl groestl;
+   skein512_4way_context skein;
+   jh512_4way_context jh;
+   keccak512_4way_context keccak;
+   hashState_luffa luffa;
+   cubehashParam cube;
+} tt8_4way_ctx_holder;
+
+tt8_4way_ctx_holder tt8_4way_ctx __attribute__ ((aligned (64)));
+
+void init_tt8_4way_ctx()
+{
+   blake512_4way_init( &tt8_4way_ctx.blake );
+   bmw512_4way_init( &tt8_4way_ctx.bmw );
+   init_groestl( &tt8_4way_ctx.groestl, 64 );
+   skein512_4way_init( &tt8_4way_ctx.skein );
+   jh512_4way_init( &tt8_4way_ctx.jh );
+   keccak512_4way_init( &tt8_4way_ctx.keccak );
+   init_luffa( &tt8_4way_ctx.luffa, 512 );
+   cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
+};
+
+void timetravel_4way_hash(void *output, const void *input)
+{
+   uint64_t hash0[8] __attribute__ ((aligned (64)));
+   uint64_t hash1[8] __attribute__ ((aligned (64)));
+   uint64_t hash2[8] __attribute__ ((aligned (64)));
+   uint64_t hash3[8] __attribute__ ((aligned (64)));
+   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
+   uint64_t *vhashA, *vhashB;
+   tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+   uint32_t dataLen = 64;
+   int i;
+
+   memcpy( &ctx, &tt8_4way_ctx, sizeof(tt8_4way_ctx) );
+
+   for ( i = 0; i < TT8_FUNC_COUNT; i++ )
+   {
+      if (i == 0)
+      {
+         dataLen = 80;
+         vhashA = (uint64_t*)input;
+         vhashB = vhashX;
+      }
+      else
+      {
+         dataLen = 64;
+         if ( i % 2 == 0 )
+         {
+            vhashA = vhashY;
+            vhashB = vhashX;
+         }
+         else
+         {
+            vhashA = vhashX;
+            vhashB = vhashY;
+         }
+      }
+
+      switch ( permutation[i] )
+      {
+       case 0:
+         blake512_4way( &ctx.blake, vhashA, dataLen );
+         blake512_4way_close( &ctx.blake, vhashB );
+         if ( i == 7 )
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                     vhashB, dataLen<<3 );
+         break;
+       case 1:
+         bmw512_4way( &ctx.bmw, vhashA, dataLen );
+         bmw512_4way_close( &ctx.bmw, vhashB );
+         if ( i == 7 )
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                     vhashB, dataLen<<3 );
+         break;
+       case 2:
+         mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                  vhashA, dataLen<<3 );
+         update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                   (char*)hash0, dataLen<<3 );
+         reinit_groestl( &ctx.groestl );
+         update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                   (char*)hash1, dataLen<<3 );
+         reinit_groestl( &ctx.groestl );
+         update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                   (char*)hash2, dataLen<<3 );
+         reinit_groestl( &ctx.groestl );
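+         /* Serial fallback: Groestl (like Luffa and Cubehash below) has
+            no 4-way SIMD port, so each lane is de-interleaved and hashed
+            with the scalar AES-NI core; reinit_groestl() resets the state
+            between lanes, which is cheaper than the full-context memcpy
+            it replaces elsewhere in this patch (see x13sm3-4way.c). */
+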
update_and_final_groestl( &ctx.groestl, (char*)hash3, + (char*)hash3, dataLen<<3 ); + if ( i != 7 ) + mm256_interleave_4x64( vhashB, + hash0, hash1, hash2, hash3, dataLen<<3 ); + break; + case 3: + skein512_4way( &ctx.skein, vhashA, dataLen ); + skein512_4way_close( &ctx.skein, vhashB ); + if ( i == 7 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashB, dataLen<<3 ); + break; + case 4: + jh512_4way( &ctx.jh, vhashA, dataLen ); + jh512_4way_close( &ctx.jh, vhashB ); + if ( i == 7 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashB, dataLen<<3 ); + break; + case 5: + keccak512_4way( &ctx.keccak, vhashA, dataLen ); + keccak512_4way_close( &ctx.keccak, vhashB ); + if ( i == 7 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashB, dataLen<<3 ); + break; + case 6: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashA, dataLen<<3 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence *)hash0, dataLen ); + memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, dataLen ); + memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, dataLen ); + memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, dataLen ); + if ( i != 7 ) + mm256_interleave_4x64( vhashB, + hash0, hash1, hash2, hash3, dataLen<<3 ); + break; + case 7: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashA, dataLen<<3 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)hash0, dataLen ); + memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)hash1, dataLen ); + memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)hash2, dataLen ); + memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)hash3, dataLen ); + if ( i != 7 ) + mm256_interleave_4x64( vhashB, + hash0, hash1, hash2, hash3, dataLen<<3 ); + break; + default: + applog(LOG_ERR,"SWERR: timetravel invalid permutation"); + break; + } + } + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + memcpy( output+64, hash2, 32 ); + memcpy( output+96, hash3, 32 ); +} + +int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + int i; + + if ( opt_benchmark ) + ptarget[7] = 0x0cff; + + for ( int k = 0; k < 19; k++ ) + be32enc( &endiandata[k], pdata[k] ); + + const uint32_t timestamp = endiandata[17]; + if ( timestamp != s_ntime ) + { + const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP 
) + % TT8_FUNC_COUNT_PERMUTATIONS; + for ( i = 0; i < TT8_FUNC_COUNT; i++ ) + permutation[i] = i; + for ( i = 0; i < steps; i++ ) + tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT ); + s_ntime = timestamp; + } + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + timetravel_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( hash[7] <= Htarg && fulltest( hash, ptarget) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) ); + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x11/timetravel-gate.c b/algo/x11/timetravel-gate.c new file mode 100644 index 0000000..bee194c --- /dev/null +++ b/algo/x11/timetravel-gate.c @@ -0,0 +1,78 @@ +#include "timetravel-gate.h" + +void tt8_set_target( struct work* work, double job_diff ) +{ + work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); +} + +bool register_timetravel_algo( algo_gate_t* gate ) +{ +#ifdef TIMETRAVEL_4WAY + init_tt8_4way_ctx(); + gate->scanhash = (void*)&scanhash_timetravel_4way; + gate->hash = (void*)&timetravel_4way_hash; +#else + init_tt8_ctx(); + gate->scanhash = (void*)&scanhash_timetravel; + gate->hash = (void*)&timetravel_hash; +#endif + gate->set_target = (void*)&tt8_set_target; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->get_max64 = (void*)&get_max64_0xffffLL; + return true; +}; + +inline void tt_swap( int *a, int *b ) +{ + int c = *a; + *a = *b; + *b = c; +} + +inline void reverse( int *pbegin, int *pend ) +{ + while ( (pbegin != pend) && (pbegin != --pend) ) + { + tt_swap( pbegin, pend ); + pbegin++; + } +} + +void tt8_next_permutation( int *pbegin, int *pend ) +{ + if ( pbegin == pend ) + return; + + int *i = pbegin; + ++i; + if ( i == pend ) + return; + + i = pend; + --i; + + while (1) + { + int *j = i; + --i; + + if ( *i < *j ) + { + int *k = pend; + + while ( !(*i < *--k) ) /* do nothing */ ; + + tt_swap( i, k ); + reverse(j, pend); + return; // true + } + + if ( i == pbegin ) + { + reverse(pbegin, pend); + return; // false + } + // else? 
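+      /* Example of how scanhash drives this: per block the hash order is
+            steps = ( ntime - TT8_FUNC_BASE_TIMESTAMP )
+                    % TT8_FUNC_COUNT_PERMUTATIONS;
+         then permutation[] starts from the identity { 0, 1, ..., 7 } and
+         tt8_next_permutation() is applied `steps` times, stepping through
+         the 8! = 40320 lexicographic orderings the same way the C++
+         std::next_permutation it was ported from does. The values here
+         restate the constants above, nothing new. */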
+   }
+}
+
diff --git a/algo/x11/timetravel-gate.h b/algo/x11/timetravel-gate.h
new file mode 100644
index 0000000..758b73d
--- /dev/null
+++ b/algo/x11/timetravel-gate.h
@@ -0,0 +1,40 @@
+#ifndef TIMETRAVEL_GATE_H__
+#define TIMETRAVEL_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define TIMETRAVEL_4WAY
+#endif
+
+// Machinecoin Genesis Timestamp
+#define TT8_FUNC_BASE_TIMESTAMP 1389040865
+
+#define TT8_FUNC_COUNT 8
+#define TT8_FUNC_COUNT_PERMUTATIONS 40320
+
+void tt8_next_permutation( int *pbegin, int *pend );
+
+bool register_timetravel_algo( algo_gate_t* gate );
+
+#if defined(TIMETRAVEL_4WAY)
+
+void timetravel_4way_hash( void *state, const void *input );
+
+int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                              uint64_t *hashes_done );
+
+void init_tt8_4way_ctx();
+
+#endif
+
+void timetravel_hash( void *state, const void *input );
+
+int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_tt8_ctx();
+
+#endif
+
diff --git a/algo/timetravel.c b/algo/x11/timetravel.c
similarity index 82%
rename from algo/timetravel.c
rename to algo/x11/timetravel.c
index bd41aa5..fdbfef1 100644
--- a/algo/timetravel.c
+++ b/algo/x11/timetravel.c
@@ -1,11 +1,9 @@
-#include "algo-gate-api.h"
+#include "timetravel-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "avxdefs.h"
-
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/jh/sph_jh.h"
@@ -13,75 +11,14 @@
 #include "algo/skein/sph_skein.h"
 #include "algo/luffa/sse2/luffa_for_sse2.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
-
 #ifdef NO_AES_NI
 #include "algo/groestl/sph_groestl.h"
 #else
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #endif
-// Machinecoin Genesis Timestamp
-#define HASH_FUNC_BASE_TIMESTAMP 1389040865
-
-#define HASH_FUNC_COUNT 8
-#define HASH_FUNC_COUNT_PERMUTATIONS 40320
-
 static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
-
-inline void tt_swap( int *a, int *b )
-{
-   int c = *a;
-   *a = *b;
-   *b = c;
-}
-
-inline void reverse( int *pbegin, int *pend )
-{
-   while ( (pbegin != pend) && (pbegin != --pend) )
-   {
-      tt_swap( pbegin, pend );
-      pbegin++;
-   }
-}
-
-static void next_permutation( int *pbegin, int *pend )
-{
-   if ( pbegin == pend )
-      return;
-
-   int *i = pbegin;
-   ++i;
-   if ( i == pend )
-      return;
-
-   i = pend;
-   --i;
-
-   while (1)
-   {
-      int *j = i;
-      --i;
-
-      if ( *i < *j )
-      {
-         int *k = pend;
-
-         while ( !(*i < *--k) ) /* do nothing */ ;
-
-         tt_swap( i, k );
-         reverse(j, pend);
-         return; // true
-      }
-
-      if ( i == pbegin )
-      {
-         reverse(pbegin, pend);
-         return; // false
-      }
-      // else?
-   }
-}
+static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
 typedef struct {
    sph_blake512_context blake;
@@ -101,7 +38,7 @@ typedef struct {
 tt_ctx_holder tt_ctx __attribute__ ((aligned (64)));
 __thread tt_ctx_holder tt_mid __attribute__ ((aligned (64)));
-void init_tt_ctx()
+void init_tt8_ctx()
 {
    sph_blake512_init( &tt_ctx.blake );
    sph_bmw512_init( &tt_ctx.bmw );
@@ -119,7 +56,7 @@ void init_tt_ctx()
 void timetravel_hash(void *output, const void *input)
 {
-   uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
+   uint32_t hash[ 16 * TT8_FUNC_COUNT ] __attribute__ ((aligned (64)));
    uint32_t *hashA, *hashB;
    tt_ctx_holder ctx __attribute__ ((aligned (64)));
    uint32_t dataLen = 64;
@@ -130,7 +67,7 @@ void timetravel_hash(void *output, const void *input)
    memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
-   for ( i = 0; i < HASH_FUNC_COUNT; i++ )
+   for ( i = 0; i < TT8_FUNC_COUNT; i++ )
    {
       if (i == 0)
      {
@@ -270,7 +207,7 @@
    }
 }
-   memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
+   memcpy(output, &hash[16 * (TT8_FUNC_COUNT - 1)], 32);
 }
 int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
@@ -296,12 +233,12 @@
 const uint32_t timestamp = endiandata[17];
 if ( timestamp != s_ntime )
 {
-   const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
-                     % HASH_FUNC_COUNT_PERMUTATIONS;
-   for ( i = 0; i < HASH_FUNC_COUNT; i++ )
+   const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
+                     % TT8_FUNC_COUNT_PERMUTATIONS;
+   for ( i = 0; i < TT8_FUNC_COUNT; i++ )
      permutation[i] = i;
   for ( i = 0; i < steps; i++ )
-     next_permutation( permutation, permutation + HASH_FUNC_COUNT );
+     tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
   s_ntime = timestamp;
   // do midstate precalc for first function
@@ -359,6 +296,7 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
    work_set_target_ratio( work, hash );
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce;
+   work_set_target_ratio( work, hash );
    return 1;
 }
 nonce++;
@@ -370,19 +308,4 @@
 return 0;
 }
-void timetravel_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool register_timetravel_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-   init_tt_ctx();
-   gate->scanhash = (void*)&scanhash_timetravel;
-   gate->hash = (void*)&timetravel_hash;
-   gate->set_target = (void*)&timetravel_set_target;
-   gate->get_max64 = (void*)&get_max64_0xffffLL;
-   return true;
-};
diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c
new file mode 100644
index 0000000..918cb2c
--- /dev/null
+++ b/algo/x11/timetravel10-4way.c
@@ -0,0 +1,316 @@
+#include "timetravel10-gate.h"
+
+#if defined(__AVX2__) && defined(__AES__)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/sse2/nist.h"
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
+
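+/* The 4-way hash below ping-pongs between two interleaved buffers:
+   vhashX and vhashY swap roles as source (vhashA) and destination
+   (vhashB) on every round of the permutation, so no copying is needed
+   between steps; round 0 reads the 80-byte block header directly from
+   the input and the final round de-interleaves into hash0..hash3. */
+typedef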
struct { + blake512_4way_context blake; + bmw512_4way_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; +} tt10_4way_ctx_holder; + +tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64))); + +void init_tt10_4way_ctx() +{ + blake512_4way_init( &tt10_4way_ctx.blake ); + bmw512_4way_init( &tt10_4way_ctx.bmw ); + init_groestl( &tt10_4way_ctx.groestl, 64 ); + skein512_4way_init( &tt10_4way_ctx.skein ); + jh512_4way_init( &tt10_4way_ctx.jh ); + keccak512_4way_init( &tt10_4way_ctx.keccak ); + init_luffa( &tt10_4way_ctx.luffa, 512 ); + cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &tt10_4way_ctx.shavite ); + init_sd( &tt10_4way_ctx.simd, 512 ); +}; + +void timetravel10_4way_hash(void *output, const void *input) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhashX[8*4] __attribute__ ((aligned (64))); + uint64_t vhashY[8*4] __attribute__ ((aligned (64))); + uint64_t *vhashA, *vhashB; + tt10_4way_ctx_holder ctx __attribute__ ((aligned (64))); + uint32_t dataLen = 64; + int i; + + memcpy( &ctx, &tt10_4way_ctx, sizeof(tt10_4way_ctx) ); + + for ( i = 0; i < TT10_FUNC_COUNT; i++ ) + { + if (i == 0) + { + dataLen = 80; + vhashA = (uint64_t*)input; + vhashB = vhashX; + } + else + { + dataLen = 64; + if ( i % 2 == 0 ) + { + vhashA = vhashY; + vhashB = vhashX; + } + else + { + vhashA = vhashX; + vhashB = vhashY; + } + } + + switch ( permutation[i] ) + { + case 0: + blake512_4way( &ctx.blake, vhashA, dataLen ); + blake512_4way_close( &ctx.blake, vhashB ); + if ( i == 9 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashB, dataLen<<3 ); + break; + case 1: + bmw512_4way( &ctx.bmw, vhashA, dataLen ); + bmw512_4way_close( &ctx.bmw, vhashB ); + if ( i == 9 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashB, dataLen<<3 ); + break; + case 2: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashA, dataLen<<3 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (char*)hash0, dataLen<<3 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (char*)hash1, dataLen<<3 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (char*)hash2, dataLen<<3 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (char*)hash3, dataLen<<3 ); + if ( i != 9 ) + mm256_interleave_4x64( vhashB, + hash0, hash1, hash2, hash3, dataLen<<3 ); + break; + case 3: + skein512_4way( &ctx.skein, vhashA, dataLen ); + skein512_4way_close( &ctx.skein, vhashB ); + if ( i == 9 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashB, dataLen<<3 ); + break; + case 4: + jh512_4way( &ctx.jh, vhashA, dataLen ); + jh512_4way_close( &ctx.jh, vhashB ); + if ( i == 9 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashB, dataLen<<3 ); + break; + case 5: + keccak512_4way( &ctx.keccak, vhashA, dataLen ); + keccak512_4way_close( &ctx.keccak, vhashB ); + if ( i == 9 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashB, dataLen<<3 ); + break; + case 6: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashA, dataLen<<3 ); + update_and_final_luffa( &ctx.luffa, 
(BitSequence*)hash0, + (const BitSequence *)hash0, dataLen ); + memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, dataLen ); + memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, dataLen ); + memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, dataLen ); + if ( i != 9 ) + mm256_interleave_4x64( vhashB, + hash0, hash1, hash2, hash3, dataLen<<3 ); + break; + case 7: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashA, dataLen<<3 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)hash0, dataLen ); + memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)hash1, dataLen ); + memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)hash2, dataLen ); + memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)hash3, dataLen ); + if ( i != 9 ) + mm256_interleave_4x64( vhashB, + hash0, hash1, hash2, hash3, dataLen<<3 ); + break; + case 8: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashA, dataLen<<3 ); + sph_shavite512( &ctx.shavite, hash0, dataLen ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, dataLen ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, dataLen ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, dataLen ); + sph_shavite512_close( &ctx.shavite, hash3 ); + if ( i != 9 ) + mm256_interleave_4x64( vhashB, + hash0, hash1, hash2, hash3, dataLen<<3 ); + break; + case 9: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhashA, dataLen<<3 ); + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, dataLen<<3 ); + memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, dataLen<<3 ); + memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, dataLen<<3 ); + memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, dataLen<<3 ); + if ( i != 9 ) + mm256_interleave_4x64( vhashB, + hash0, hash1, hash2, hash3, dataLen<<3 ); + break; + default: + applog(LOG_ERR,"SWERR: timetravel invalid permutation"); + break; + } + } + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + memcpy( output+64, hash2, 32 ); + memcpy( output+96, hash3, 32 ); +} + +int scanhash_timetravel10_4way( int thr_id, struct work *work, + uint32_t max_nonce, uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const 
uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + int i; + + if ( opt_benchmark ) + ptarget[7] = 0x0cff; + + for ( int k = 0; k < 19; k++ ) + be32enc( &endiandata[k], pdata[k] ); + + const uint32_t timestamp = endiandata[17]; + if ( timestamp != s_ntime ) + { + const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP ) + % TT10_FUNC_COUNT_PERMUTATIONS; + for ( i = 0; i < TT10_FUNC_COUNT; i++ ) + permutation[i] = i; + for ( i = 0; i < steps; i++ ) + tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT ); + s_ntime = timestamp; + } + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + timetravel10_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( hash[7] <= Htarg && fulltest( hash, ptarget) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) ); + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x11/timetravel10-gate.c b/algo/x11/timetravel10-gate.c new file mode 100644 index 0000000..91e27db --- /dev/null +++ b/algo/x11/timetravel10-gate.c @@ -0,0 +1,78 @@ +#include "timetravel10-gate.h" + +void tt10_set_target( struct work* work, double job_diff ) +{ + work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); +} + +bool register_timetravel10_algo( algo_gate_t* gate ) +{ +#ifdef TIMETRAVEL10_4WAY + init_tt10_4way_ctx(); + gate->scanhash = (void*)&scanhash_timetravel10_4way; + gate->hash = (void*)&timetravel10_4way_hash; +#else + init_tt10_ctx(); + gate->scanhash = (void*)&scanhash_timetravel10; + gate->hash = (void*)&timetravel10_hash; +#endif + gate->set_target = (void*)&tt10_set_target; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->get_max64 = (void*)&get_max64_0xffffLL; + return true; +}; + +inline void tt10_swap( int *a, int *b ) +{ + int c = *a; + *a = *b; + *b = c; +} + +inline void reverse( int *pbegin, int *pend ) +{ + while ( (pbegin != pend) && (pbegin != --pend) ) + { + tt10_swap( pbegin, pend ); + pbegin++; + } +} + +void tt10_next_permutation( int *pbegin, int *pend ) +{ + if ( pbegin == pend ) + return; + + int *i = pbegin; + ++i; + if ( i == pend ) + return; + + i = pend; + --i; + + while (1) + { + int *j = i; + --i; + + if ( *i < *j ) + { + int *k = pend; + + while ( !(*i < *--k) ) /* do nothing */ ; + + tt10_swap( i, k ); + reverse(j, pend); + return; // true + } + + if ( i == pbegin ) + { + reverse(pbegin, pend); + 
return; // false
+      }
+      // else?
+   }
+}
+
diff --git a/algo/x11/timetravel10-gate.h b/algo/x11/timetravel10-gate.h
new file mode 100644
index 0000000..6a7090a
--- /dev/null
+++ b/algo/x11/timetravel10-gate.h
@@ -0,0 +1,39 @@
+#ifndef TIMETRAVEL10_GATE_H__
+#define TIMETRAVEL10_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define TIMETRAVEL10_4WAY
+#endif
+
+// BitCore Genesis Timestamp
+#define TT10_FUNC_BASE_TIMESTAMP 1492973331U
+#define TT10_FUNC_COUNT 10
+#define TT10_FUNC_COUNT_PERMUTATIONS 40320
+
+void tt10_next_permutation( int *pbegin, int *pend );
+
+bool register_timetravel10_algo( algo_gate_t* gate );
+
+#if defined(TIMETRAVEL10_4WAY)
+
+void timetravel10_4way_hash( void *state, const void *input );
+
+int scanhash_timetravel10_4way( int thr_id, struct work *work,
+                                uint32_t max_nonce, uint64_t *hashes_done );
+
+void init_tt10_4way_ctx();
+
+#endif
+
+void timetravel10_hash( void *state, const void *input );
+
+int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done );
+
+void init_tt10_ctx();
+
+#endif
+
diff --git a/algo/timetravel10.c b/algo/x11/timetravel10.c
similarity index 84%
rename from algo/timetravel10.c
rename to algo/x11/timetravel10.c
index 27cf2db..888d53e 100644
--- a/algo/timetravel10.c
+++ b/algo/x11/timetravel10.c
@@ -1,11 +1,8 @@
-#include "algo-gate-api.h"
-
+#include "timetravel10-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "avxdefs.h"
-
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/jh/sph_jh.h"
@@ -22,68 +19,8 @@
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #endif
-// BitCore Genesis Timestamp
-#define HASH_FUNC_BASE_TIMESTAMP 1492973331U
-
-#define HASH_FUNC_COUNT 10
-#define HASH_FUNC_COUNT_PERMUTATIONS 40320
-
 static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
-
-inline void tt10_swap( int *a, int *b )
-{
-   int c = *a;
-   *a = *b;
-   *b = c;
-}
-
-inline void reverse( int *pbegin, int *pend )
-{
-   while ( (pbegin != pend) && (pbegin != --pend) )
-   {
-      tt10_swap( pbegin, pend );
-      pbegin++;
-   }
-}
-
-static void next_permutation( int *pbegin, int *pend )
-{
-   if ( pbegin == pend )
-      return;
-
-   int *i = pbegin;
-   ++i;
-   if ( i == pend )
-      return;
-
-   i = pend;
-   --i;
-
-   while (1)
-   {
-      int *j = i;
-      --i;
-
-      if ( *i < *j )
-      {
-         int *k = pend;
-
-         while ( !(*i < *--k) ) /* do nothing */ ;
-
-         tt10_swap( i, k );
-         reverse(j, pend);
-         return; // true
-      }
-
-      if ( i == pbegin )
-      {
-         reverse(pbegin, pend);
-         return; // false
-      }
-      // else?
- } -} +static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; typedef struct { sph_blake512_context blake; @@ -125,7 +62,7 @@ void init_tt10_ctx() void timetravel10_hash(void *output, const void *input) { - uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64))); + uint32_t hash[ 16 * TT10_FUNC_COUNT ] __attribute__ ((aligned (64))); uint32_t *hashA, *hashB; tt10_ctx_holder ctx __attribute__ ((aligned (64))); uint32_t dataLen = 64; @@ -136,7 +73,7 @@ void timetravel10_hash(void *output, const void *input) memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) ); - for ( i = 0; i < HASH_FUNC_COUNT; i++ ) + for ( i = 0; i < TT10_FUNC_COUNT; i++ ) { if (i == 0) { @@ -302,7 +239,7 @@ void timetravel10_hash(void *output, const void *input) } } - memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32); + memcpy(output, &hash[16 * (TT10_FUNC_COUNT - 1)], 32); } int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce, @@ -328,12 +265,12 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t timestamp = endiandata[17]; if ( timestamp != s_ntime ) { - const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP ) - % HASH_FUNC_COUNT_PERMUTATIONS; - for ( i = 0; i < HASH_FUNC_COUNT; i++ ) + const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP ) + % TT10_FUNC_COUNT_PERMUTATIONS; + for ( i = 0; i < TT10_FUNC_COUNT; i++ ) permutation[i] = i; for ( i = 0; i < steps; i++ ) - next_permutation( permutation, permutation + HASH_FUNC_COUNT ); + tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT ); s_ntime = timestamp; // do midstate precalc for first function @@ -398,6 +335,7 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce, { work_set_target_ratio( work, hash ); pdata[19] = nonce; + work_set_target_ratio( work, hash ); *hashes_done = pdata[19] - first_nonce; return 1; } @@ -409,20 +347,3 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce, *hashes_done = pdata[19] - first_nonce + 1; return 0; } - -void timetravel10_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -bool register_timetravel10_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_tt10_ctx(); - gate->scanhash = (void*)&scanhash_timetravel10; - gate->hash = (void*)&timetravel10_hash; - gate->set_target = (void*)&timetravel10_set_target; - gate->get_max64 = (void*)&get_max64_0xffffLL; - return true; -}; - diff --git a/algo/x11/x11.c b/algo/x11/x11.c index ec511ca..41e4c4f 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -179,6 +179,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce, if ( fulltest( hash64, ptarget ) ) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } @@ -189,14 +190,3 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce, pdata[19] = n; return 0; } -/* -bool register_x11_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_x11_ctx(); - gate->scanhash = (void*)&scanhash_x11; - gate->hash = (void*)&x11_hash; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; -*/ diff --git a/algo/x11/x11evo-4way.c b/algo/x11/x11evo-4way.c new file mode 100644 index 0000000..d1a3339 --- /dev/null +++ b/algo/x11/x11evo-4way.c @@ -0,0 +1,340 @@ +#include "cpuminer-config.h" +#include "x11evo-gate.h" + +#if defined(__AVX2__) && defined(__AES__) + 
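+/* x11evo re-orders the eleven X11 algorithms once per day (see
+   x11evo-gate.c below). In this 4-way path the algorithms with AVX2
+   ports (blake, bmw, skein, jh, keccak) hash four nonces in parallel on
+   interleaved 4x64-bit state, while the others (groestl, luffa,
+   cubehash, shavite, simd, echo) are run lane-by-lane with the scalar
+   code, de-interleaving and re-interleaving around each call. */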
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/sph_luffa.h"
+#include "algo/cubehash/sph_cubehash.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/sph_simd.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/simd/sse2/nist.h"
+
+typedef struct {
+   blake512_4way_context blake;
+   bmw512_4way_context bmw;
+   hashState_groestl groestl;
+   skein512_4way_context skein;
+   jh512_4way_context jh;
+   keccak512_4way_context keccak;
+   hashState_luffa luffa;
+   cubehashParam cube;
+   sph_shavite512_context shavite;
+   hashState_sd simd;
+   hashState_echo echo;
+} x11evo_4way_ctx_holder;
+
+static x11evo_4way_ctx_holder x11evo_4way_ctx __attribute__ ((aligned (64)));
+
+void init_x11evo_4way_ctx()
+{
+   blake512_4way_init( &x11evo_4way_ctx.blake );
+   bmw512_4way_init( &x11evo_4way_ctx.bmw );
+   init_groestl( &x11evo_4way_ctx.groestl, 64 );
+   skein512_4way_init( &x11evo_4way_ctx.skein );
+   jh512_4way_init( &x11evo_4way_ctx.jh );
+   keccak512_4way_init( &x11evo_4way_ctx.keccak );
+   init_luffa( &x11evo_4way_ctx.luffa, 512 );
+   cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
+   sph_shavite512_init( &x11evo_4way_ctx.shavite );
+   init_sd( &x11evo_4way_ctx.simd, 512 );
+   init_echo( &x11evo_4way_ctx.echo, 512 );
+}
+
+static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
+static __thread uint32_t s_ntime = UINT32_MAX;
+
+void x11evo_4way_hash( void *state, const void *input )
+{
+   uint32_t hash0[16] __attribute__ ((aligned (64)));
+   uint32_t hash1[16] __attribute__ ((aligned (64)));
+   uint32_t hash2[16] __attribute__ ((aligned (64)));
+   uint32_t hash3[16] __attribute__ ((aligned (64)));
+   uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+   x11evo_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &x11evo_4way_ctx, sizeof(x11evo_4way_ctx) );
+
+   if ( s_seq == -1 )
+   {
+      uint32_t *data = (uint32_t*) input;
+      const uint32_t ntime = data[17];
+      evo_twisted_code( ntime, hashOrder );
+   }
+
+   int i;
+   int len = strlen( hashOrder );
+   for ( i = 0; i < len; i++ )
+   {
+      char elem = hashOrder[i];
+      uint8_t idx;
+      if ( elem >= 'A' )
+         idx = elem - 'A' + 10;
+      else
+         idx = elem - '0';
+
+//      int size = 64;
+
+      switch ( idx )
+      {
+       case 0:
+         blake512_4way( &ctx.blake, input, 80 );
+         blake512_4way_close( &ctx.blake, vhash );
+         mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                  vhash, 64<<3 );
+         break;
+       case 1:
+         bmw512_4way( &ctx.bmw, vhash, 64 );
+         bmw512_4way_close( &ctx.bmw, vhash );
+         if ( i >= len-1 )
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                     vhash, 64<<3 );
+         break;
+       case 2:
+         mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                  vhash, 64<<3 );
+         update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                   (char*)hash0, 512 );
+         reinit_groestl( &ctx.groestl );
+         update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                   (char*)hash1, 512 );
+         reinit_groestl( &ctx.groestl );
+         update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                   (char*)hash2, 512 );
+         reinit_groestl( &ctx.groestl );
+         update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                   (char*)hash3, 512 );
+         if ( i < len-1 )
+            mm256_interleave_4x64( vhash,
+                                   hash0, hash1, hash2, hash3, 64<<3 );
+         break;
+       case 3:
+         skein512_4way( &ctx.skein, vhash, 64 );
+         skein512_4way_close(
&ctx.skein, vhash ); + if ( i >= len-1 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhash, 64<<3 ); + break; + case 4: + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + if ( i >= len-1 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhash, 64<<3 ); + break; + case 5: + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + if ( i >= len-1 ) + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhash, 64<<3 ); + break; + case 6: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhash, 64<<3 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, + sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, 64 ); + memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, + sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, + sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + if ( i < len-1 ) + mm256_interleave_4x64( vhash, + hash0, hash1, hash2, hash3, 64<<3 ); + break; + case 7: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhash, 64<<3 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*) hash3, 64 ); + if ( i < len-1 ) + mm256_interleave_4x64( vhash, + hash0, hash1, hash2, hash3, 64<<3 ); + break; + case 8: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhash, 64<<3 ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + if ( i < len-1 ) + mm256_interleave_4x64( vhash, + hash0, hash1, hash2, hash3, 64<<3 ); + break; + case 9: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhash, 64<<3 ); + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + if ( i < len-1 ) + mm256_interleave_4x64( vhash, + hash0, hash1, hash2, hash3, 64<<3 ); + break; + case 
10: + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, + vhash, 64<<3 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + if ( i < len-1 ) + mm256_interleave_4x64( vhash, + hash0, hash1, hash2, hash3, 64<<3 ); + break; + } + } + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +//static const uint32_t diff1targ = 0x0000ffff; + +int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + + swab32_array( endiandata, pdata, 20 ); + + int ntime = endiandata[17]; + if ( ntime != s_ntime || s_seq == -1 ) + { + evo_twisted_code( ntime, hashOrder ); + s_ntime = ntime; + } + + uint32_t hmask = 0xFFFFFFFF; + if ( Htarg > 0 ) + { + if ( Htarg <= 0xF ) + hmask = 0xFFFFFFF0; + else if ( Htarg <= 0xFF ) + hmask = 0xFFFFFF00; + else if ( Htarg <= 0xFFF ) + hmask = 0xFFFF000; + else if ( Htarg <= 0xFFFF ) + hmask = 0xFFFF000; + } + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + x11evo_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & hmask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & hmask ) == 0 && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] & hmask ) == 0 && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( (hash+24)[7] & hmask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x11/x11evo-gate.c b/algo/x11/x11evo-gate.c new file mode 100644 index 0000000..8f8841a --- /dev/null +++ b/algo/x11/x11evo-gate.c @@ -0,0 +1,95 @@ +#include "x11evo-gate.h" + +int s_seq = -1; + +static inline int getCurrentAlgoSeq( uint32_t current_time ) +{ + // change 
once per day
+   return (int) (current_time - X11EVO_INITIAL_DATE) / (60 * 60 * 24);
+}
+
+// swap_vars doesn't work here
+void evo_swap( uint8_t *a, uint8_t *b )
+{
+   uint8_t __tmp = *a;
+   *a = *b;
+   *b = __tmp;
+}
+
+void initPerm( uint8_t n[], uint8_t count )
+{
+   int i;
+   for ( i = 0; i<count; i++ )
+      n[i] = i;
+}
+
+int nextPerm( uint8_t n[], uint32_t count )
+{
+   uint32_t tail, i, j;
+
+   if ( count <= 1 )
+      return 0;
+
+   for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
+   tail = i;
+
+   if ( tail > 0 )
+      for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
+   evo_swap( &n[tail - 1], &n[j] );
+
+   for ( i = tail, j = count - 1; i<j; i++, j-- )
+      evo_swap( &n[i], &n[j] );
+
+   return ( tail != 0 );
+}
+
+void getAlgoString( char *str, uint32_t count )
+{
+   uint8_t algoList[X11EVO_FUNC_COUNT];
+   char *sptr;
+
+   initPerm( algoList, X11EVO_FUNC_COUNT );
+
+   uint32_t k;
+   for ( k = 0; k < count; k++ )
+      nextPerm( algoList, X11EVO_FUNC_COUNT );
+
+   sptr = str;
+   int j;
+   for ( j = 0; j < X11EVO_FUNC_COUNT; j++ )
+   {
+      if ( algoList[j] >= 10 )
+         sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
+      else
+         sprintf( sptr, "%u", algoList[j] );
+      sptr++;
+   }
+   *sptr = 0;
+
+   //applog(LOG_DEBUG, "nextPerm %s", str);
+}
+
+void evo_twisted_code( uint32_t ntime, char *permstr )
+{
+   int seq = getCurrentAlgoSeq( ntime );
+   if ( s_seq != seq )
+   {
+      getAlgoString( permstr, seq );
+      s_seq = seq;
+   }
+}
+
+bool register_x11evo_algo( algo_gate_t* gate )
+{
+#if defined (X11EVO_4WAY)
+  init_x11evo_4way_ctx();
+  gate->scanhash = (void*)&scanhash_x11evo_4way;
+  gate->hash = (void*)&x11evo_4way_hash;
+#else
+  init_x11evo_ctx();
+  gate->scanhash = (void*)&scanhash_x11evo;
+  gate->hash = (void*)&x11evo_hash;
+#endif
+ gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+ return true;
+};
+
diff --git a/algo/x11/x11evo-gate.h b/algo/x11/x11evo-gate.h
new file mode 100644
index 0000000..32bd551
--- /dev/null
+++ b/algo/x11/x11evo-gate.h
@@ -0,0 +1,39 @@
+#ifndef X11EVO_GATE_H__
+#define X11EVO_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define X11EVO_4WAY
+#endif
+
+#define X11EVO_INITIAL_DATE 1462060800
+#define X11EVO_FUNC_COUNT 11
+
+extern int s_seq;
+
+bool register_x11evo_algo( algo_gate_t* gate );
+
+#if defined(X11EVO_4WAY)
+
+void x11evo_4way_hash( void *state, const void *input );
+
+int scanhash_x11evo_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+
+void init_x11evo_4way_ctx();
+
+#endif
+
+void x11evo_hash( void *state, const void *input );
+
+int scanhash_x11evo( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+
+void init_x11evo_ctx();
+
+void evo_twisted_code( uint32_t ntime, char *permstr );
+
+#endif
+
diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c
index edbd41f..6b1f3f9 100644
--- a/algo/x11/x11evo.c
+++ b/algo/x11/x11evo.c
@@ -1,5 +1,5 @@
 #include "cpuminer-config.h"
-#include "algo-gate-api.h"
+#include "x11evo-gate.h"
 #include <string.h>
 #include <stdint.h>
@@ -26,9 +26,6 @@
 #include "algo/cubehash/sse2/cubehash_sse2.h"
 #include "algo/simd/sse2/nist.h"
-#define INITIAL_DATE 1462060800
-#define HASH_FUNC_COUNT 11
-
 typedef struct {
 #ifdef NO_AES_NI
    sph_groestl512_context groestl;
@@ -70,94 +67,10 @@ void init_x11evo_ctx()
    sph_shavite512_init( &x11evo_ctx.shavite );
 }
-/*
-uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
-{
-   return (current_time - base_time) / (60 * 60 * 24);
-}
-*/
-
-static inline int getCurrentAlgoSeq( uint32_t current_time )
-{
-   // change once per day
-   return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
-}
-
-// swap_vars doesn't work here
-void evo_swap( uint8_t *a, uint8_t *b )
-{
-   uint8_t __tmp = *a;
-   *a = *b;
-   *b = __tmp;
-}
-
-void initPerm( uint8_t n[], uint8_t count )
-{
-   int i;
-   for ( i = 0; i<count; i++ )
-      n[i] = i;
-}
-
-int nextPerm( uint8_t n[], uint32_t count )
-{
-   uint32_t tail, i, j;
-
-   if ( count <= 1 )
-      return 0;
-
-   for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
-   tail = i;
-
-   if ( tail > 0 )
-      for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
-   evo_swap( &n[tail - 1], &n[j] );
-
-   for ( i = tail, j = count - 1; i<j; i++, j-- )
-      evo_swap( &n[i], &n[j] );
-
-   return ( tail != 0 );
-}
-
-void getAlgoString( char *str, uint32_t count )
-{
-   uint8_t algoList[HASH_FUNC_COUNT];
-   char *sptr;
-
-   initPerm( algoList, HASH_FUNC_COUNT );
-
-   uint32_t k;
-   for ( k = 0; k < count; k++ )
-      nextPerm( algoList, HASH_FUNC_COUNT );
-
-   sptr = str;
-   int j;
-   for ( j = 0; j < HASH_FUNC_COUNT; j++ )
-   {
-      if ( algoList[j] >= 10 )
-         sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
-      else
-         sprintf( sptr, "%u", algoList[j] );
-      sptr++;
-   }
-   *sptr = 0;
-
-   //applog(LOG_DEBUG, "nextPerm %s", str);
-}
-
-static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
 static __thread uint32_t s_ntime = UINT32_MAX;
-static int s_seq = -1;
-static void evo_twisted_code(uint32_t ntime, char *permstr)
-{
-   int seq = getCurrentAlgoSeq(ntime);
-   if (s_seq != seq)
-   {
-      getAlgoString(permstr, seq);
-      s_seq = seq;
-   }
-}
-
-static inline void x11evo_hash( void *state, const void *input )
+void x11evo_hash( void *state, const void *input )
 {
    uint32_t hash[16] __attribute__ ((aligned (64)));
    x11evo_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -242,10 +155,10 @@ static inline void x11evo_hash( void *state, const void *input )
    memcpy( state, hash, 32 );
 }
-static const uint32_t diff1targ = 0x0000ffff;
+//static const uint32_t diff1targ = 0x0000ffff;
 int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
-                     unsigned long *hashes_done )
+                     uint64_t *hashes_done )
 {
    uint32_t endiandata[20] __attribute__((aligned(64)));
    uint32_t hash64[8] __attribute__((aligned(64)));
@@ -274,19 +187,20 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
    else if ( Htarg <= 0xFFF )
       hmask = 0xFFFF000;
    else if ( Htarg <= 0xFFFF )
-       hmask = 0xFFFF000;
+      hmask = 0xFFFF000;
 }
 do
 {
    pdata[19] = ++n;
    be32enc( &endiandata[19], n );
-   x11evo_hash( hash64, &endiandata );
+   x11evo_hash( hash64, endiandata );
    if ( ( hash64[7] & hmask ) == 0 )
    {
      if ( fulltest( hash64, ptarget ) )
      {
        *hashes_done = n - first_nonce + 1;
+       work_set_target_ratio( work, hash64 );
        return true;
      }
   }
@@ -296,13 +210,3 @@
 pdata[19] = n;
 return 0;
 }
-
-bool register_x11evo_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-   gate->scanhash = (void*)&scanhash_x11evo;
-   gate->hash = (void*)&x11evo_hash;
-   init_x11evo_ctx();
-   return true;
-};
-
diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c
index a77424e..31d391b 100644
--- a/algo/x11/x11gost.c
+++ b/algo/x11/x11gost.c
@@ -161,6 +161,7 @@ int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
 if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce;
+   work_set_target_ratio( work, hash );
    return 1;
 }
 nonce++;
diff --git a/algo/x13/phi1612.c b/algo/x13/phi1612.c
index 151fec5..aca38d1 100644
--- a/algo/x13/phi1612.c
+++ b/algo/x13/phi1612.c
@@ -116,6 +116,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
 if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
    pdata[19] = nonce;
+   work_set_target_ratio( work, hash );
    *hashes_done = pdata[19] - first_nonce;
    return 1;
 }
diff --git a/algo/x13/skunk.c b/algo/x13/skunk.c
index 31709ef..c736278 100644
--- a/algo/x13/skunk.c
+++ b/algo/x13/skunk.c
@@ -70,6 +70,7 @@ int scanhash_skunk( int thr_id, struct work *work, uint32_t max_nonce,
 {
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce;
+   work_set_target_ratio( work, hash );
    return 1;
 }
 nonce++;
diff --git a/algo/x13/x13.c b/algo/x13/x13.c
index e9acc77..8a052c3 100644
--- a/algo/x13/x13.c
+++ b/algo/x13/x13.c
@@ -234,6 +234,7 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash64 );
       *hashes_done = n - first_nonce + 1;
       return true;
    }
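The hunks above all follow the same pattern, so a hedged sketch of the common
submit path may help (names like hash, Htarg and nonce follow the local
conventions of these scanhash functions; this is not a literal file from the
patch). work_set_target_ratio() records the share's difficulty in the work
structure, which share_result() in cpu-miner.c (end of this patch) uses to
print the share difficulty with the accepted message:

   // Sketch: how each scanhash reports a found share in this release.
   if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
   {
      work_set_target_ratio( work, hash );  // saves share diff / ratio for
                                            // the "accepted" log message
      pdata[19] = nonce;
      *hashes_done = pdata[19] - first_nonce;
      return 1;                             // caller submits pdata[19]
   }

diff --git a/algo/x13/x13sm3-4way.c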
b/algo/x13/x13sm3-4way.c index 5b639ac..e7b5283 100644 --- a/algo/x13/x13sm3-4way.c +++ b/algo/x13/x13sm3-4way.c @@ -17,7 +17,7 @@ #include "algo/shavite/sph_shavite.h" #include "algo/simd/sse2/nist.h" #include "algo/echo/aes_ni/hash_api.h" -#include "algo/sm3/sph_sm3.h" +#include "algo/sm3/sm3-hash-4way.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" @@ -33,7 +33,7 @@ typedef struct { sph_shavite512_context shavite; hashState_sd simd; hashState_echo echo; - sm3_ctx_t sm3; + sm3_4way_ctx_t sm3; sph_hamsi512_context hamsi; sph_fugue512_context fugue; } x13sm3_4way_ctx_holder; @@ -54,7 +54,7 @@ void init_x13sm3_4way_ctx() sph_shavite512_init( &x13sm3_4way_ctx.shavite ); init_sd( &x13sm3_4way_ctx.simd, 512 ); init_echo( &x13sm3_4way_ctx.echo, 512 ); - sm3_init( &x13sm3_4way_ctx.sm3 ); + sm3_4way_init( &x13sm3_4way_ctx.sm3 ); sph_hamsi512_init( &x13sm3_4way_ctx.hamsi ); sph_fugue512_init( &x13sm3_4way_ctx.fugue ); }; @@ -85,14 +85,11 @@ void x13sm3_4way_hash( void *state, const void *input ) // Groestl update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, - sizeof(hashState_groestl) ); + reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, - sizeof(hashState_groestl) ); + reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, - sizeof(hashState_groestl) ); + reinit_groestl( &ctx.groestl ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); // Parallel 4way @@ -178,6 +175,8 @@ void x13sm3_4way_hash( void *state, const void *input ) (const BitSequence *) hash3, 512 ); // SM3 + uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64))); + memset( sm3_vhash, 0, sizeof sm3_vhash ); uint32_t sm3_hash0[32] __attribute__ ((aligned (32))); memset( sm3_hash0, 0, sizeof sm3_hash0 ); uint32_t sm3_hash1[32] __attribute__ ((aligned (32))); @@ -187,17 +186,11 @@ void x13sm3_4way_hash( void *state, const void *input ) uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); memset( sm3_hash3, 0, sizeof sm3_hash3 ); - sph_sm3( &ctx.sm3, hash0, 64 ); - sph_sm3_close( &ctx.sm3, sm3_hash0 ); - memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) ); - sph_sm3( &ctx.sm3, hash1, 64 ); - sph_sm3_close( &ctx.sm3, sm3_hash1 ); - memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) ); - sph_sm3( &ctx.sm3, hash2, 64 ); - sph_sm3_close( &ctx.sm3, sm3_hash2 ); - memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) ); - sph_sm3( &ctx.sm3, hash3, 64 ); - sph_sm3_close( &ctx.sm3, sm3_hash3 ); + mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + sm3_4way( &ctx.sm3, vhash, 64 ); + sm3_4way_close( &ctx.sm3, sm3_vhash ); + mm_deinterleave_4x32( sm3_hash0, sm3_hash1, sm3_hash2, sm3_hash3, + sm3_vhash, 1024 ); // Hamsi sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 ); diff --git a/algo/x13/x13sm3.c b/algo/x13/x13sm3.c index f07c204..8724cef 100644 --- a/algo/x13/x13sm3.c +++ b/algo/x13/x13sm3.c @@ -224,6 +224,7 @@ int scanhash_x13sm3( int thr_id, struct work *work, if (!(hash64[7] & mask)) { printf("[%d]",thr_id); if (fulltest(hash64, ptarget)) { + work_set_target_ratio( work, hash64 ); *hashes_done = n - first_nonce + 1; return true; } diff --git a/algo/x14/axiom.c b/algo/x14/axiom.c index 9f08359..f2bcec3 100644 --- a/algo/x14/axiom.c +++ b/algo/x14/axiom.c @@ -65,6 +65,7 @@ int 
scanhash_axiom(int thr_id, struct work *work, if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; pdata[19] = n; + work_set_target_ratio( work, hash64 ); return true; } n++; diff --git a/algo/x14/polytimos-gate.h b/algo/x14/polytimos-gate.h index 29b5248..11b4297 100644 --- a/algo/x14/polytimos-gate.h +++ b/algo/x14/polytimos-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define POLYTIMOS_4WAY #endif diff --git a/algo/x14/x14.c b/algo/x14/x14.c index d53f919..8d1c928 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -233,6 +233,7 @@ int scanhash_x14(int thr_id, struct work *work, if (!(hash64[7] & mask)) { printf("[%d]",thr_id); if (fulltest(hash64, ptarget)) { + work_set_target_ratio( work, hash64 ); *hashes_done = n - first_nonce + 1; return true; } diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index c47e289..af13d98 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -47,7 +47,6 @@ void init_x15_4way_ctx() { blake512_4way_init( &x15_4way_ctx.blake ); bmw512_4way_init( &x15_4way_ctx.bmw ); - sph_bmw512_init( &x15_4way_ctx.bmw ); init_groestl( &x15_4way_ctx.groestl, 64 ); skein512_4way_init( &x15_4way_ctx.skein ); jh512_4way_init( &x15_4way_ctx.jh ); diff --git a/algo/x15/x15.c b/algo/x15/x15.c index 5e4dd36..f96c684 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -245,6 +245,7 @@ int scanhash_x15(int thr_id, struct work *work, if (!(hash64[7] & mask)) { printf("[%d]",thr_id); if (fulltest(hash64, ptarget)) { + work_set_target_ratio( work, hash64 ); *hashes_done = n - first_nonce + 1; return true; } diff --git a/algo/x17/x17.c b/algo/x17/x17.c index a377492..fca8a72 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -266,6 +266,7 @@ int scanhash_x17(int thr_id, struct work *work, if (!(hash64[7] & mask)) { printf("[%d]",thr_id); if (fulltest(hash64, ptarget)) { + work_set_target_ratio( work, hash64 ); *hashes_done = n - first_nonce + 1; return true; } @@ -281,13 +282,3 @@ int scanhash_x17(int thr_id, struct work *work, pdata[19] = n; return 0; } -/* -bool register_x17_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_x17_ctx(); - gate->scanhash = (void*)&scanhash_x17; - gate->hash = (void*)&x17hash; - return true; -}; -*/ diff --git a/avxdefs.h b/avxdefs.h index cb42c25..cf273bb 100644 --- a/avxdefs.h +++ b/avxdefs.h @@ -35,10 +35,18 @@ #define mm_one_64 _mm_set1_epi64x( 1ULL ) #define mm_one_32 _mm_set1_epi32( 1UL ) #define mm_one_16 _mm_set1_epi16( 1U ) +#define mm_one_8 _mm_set1_epi8( 1U ) // Constant minus 1 #define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) +// Lane index, useful for byte rotate using shuffle +#define mm_lanex_64 _mm_set_epi64( 1ULL, 0ULL ); +#define mm_lanex_32 _mm_set_epi32( 3UL, 2UL, 1UL, 0UL ); +#define mm_lanex_16 _mm_set_epi16( 7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U ); +#define mm_lanex_8 _mm_set_epi8( 15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \ + 7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U ); + // // Basic operations without equivalent SIMD intrinsic @@ -327,6 +335,16 @@ inline __m128i mm_byteswap_16( __m128i x ) // Constant minus 1 #define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) +// Lane index, useful for rotate using permutevar +#define mm256_lane_64 _mm_set_epi64x( 3ULL, 2ULL, 1ULL, 0ULL ); +#define mm256_lane_32 _mm_set_epi32( 7UL, 6UL, 5UL, 4UL, 3UL, 2UL, 1UL, 0UL ); +#define mm256_lane_16 _mm_set_epi16( 15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \ + 
diff --git a/configure b/configure
index edaff47..e0ce335 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.9.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.10.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.7.9'
-PACKAGE_STRING='cpuminer-opt 3.7.9'
+PACKAGE_VERSION='3.7.10'
+PACKAGE_STRING='cpuminer-opt 3.7.10'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.7.9 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.7.10 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.7.9:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.7.10:";;
    esac
   cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.7.9
+cpuminer-opt configure 3.7.10
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt $as_me 3.7.9, which was
+It was created by cpuminer-opt $as_me 3.7.10, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2981,7 +2981,7 @@ fi
 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='3.7.9'
+ VERSION='3.7.10'
 
 cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.7.9, which was
+This file was extended by cpuminer-opt $as_me 3.7.10, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.7.9
+cpuminer-opt config.status 3.7.10
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index d99ed38..4b84491 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.7.9])
+AC_INIT([cpuminer-opt], [3.7.10])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index e26731c..0f4f793 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -346,6 +346,7 @@ static bool work_decode( const json_t *val, struct work *work )
    work->targetdiff = target_to_diff( work->target );
    // for api stats, on longpoll pools
    stratum_diff = work->targetdiff;
+   work->sharediff = 0;
    algo_gate.display_extra_data( work, &net_blocks );
    return true;
 }
@@ -755,6 +756,7 @@ static int share_result( int result, struct work *work, const char *reason )
    uint32_t total_submits;
    float rate;
    char rate_s[8] = {0};
+   double sharediff = work ? work->sharediff : stratum.sharediff;
    int i;
 
    pthread_mutex_lock(&stats_lock);
@@ -814,6 +816,8 @@ static int share_result( int result, struct work *work, const char *reason )
       sprintf(hr, "%.2f", hashrate );
    }
 
+   if ( sharediff == 0 )
+   {
 #if ((defined(_WIN64) || defined(__WINDOWS__)))
       applog( LOG_NOTICE, "%s %lu/%lu (%s%%), %s %sH, %s %sH/s",
               sres, ( result ? accepted_count : rejected_count ),
@@ -824,6 +828,20 @@ static int share_result( int result, struct work *work, const char *reason )
               total_submits, rate_s, hc, hc_units, hr, hr_units,
               (uint32_t)cpu_temp(0) );
 #endif
+   }
+   else
+   {
+#if ((defined(_WIN64) || defined(__WINDOWS__)))
+      applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s",
+              sres, ( result ? accepted_count : rejected_count ),
+              total_submits, rate_s, sharediff, hr, hr_units );
+#else
+      applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s, %dC",
+              sres, ( result ? accepted_count : rejected_count ),
+              total_submits, rate_s, sharediff, hr, hr_units,
+              (uint32_t)cpu_temp(0) );
+#endif
+   }
 
    if (reason)
    {
@@ -1026,6 +1044,7 @@ static bool submit_upstream_work( CURL *curl, struct work *work )
    }
    if ( have_stratum )
    {
+       stratum.sharediff = work->sharediff;
       algo_gate.build_stratum_request( req, work, &stratum );
       if ( unlikely( !stratum_send_line( &stratum, req ) ) )
       {
@@ -1569,7 +1588,7 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
                      uint32_t *end_nonce_ptr, bool clean_job )
 {
    uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
-   
+
    if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
       && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
          || ( work->job_id != g_work->job_id ) ) )
@@ -2984,25 +3003,22 @@ bool check_cpu_capability ()
      }
      if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
      {
-        if ( sw_has_4way && algo_has_4way )
-           printf( "A CPU with AES and AVX2 is required to use 4way!\n" );
-        else if ( algo_has_avx2 )
-           printf( "A CPU with AES and AVX2 is required!\n" );
+        printf( "The SW build requires a CPU with AES and AVX2!\n" );
         return false;
      }
-     if ( sw_has_avx && !( cpu_has_avx && cpu_has_aes ) )
+     if ( sw_has_avx && !cpu_has_avx )
      {
-        printf( "A CPU with AES and AVX2 is required!\n" );
+        printf( "The SW build requires a CPU with AVX!\n" );
        return false;
      }
-     if ( sw_has_aes && algo_has_aes && !cpu_has_aes )
+     if ( sw_has_aes && !cpu_has_aes )
      {
-        printf( "A CPU with AES is required!\n" );
+        printf( "The SW build requires a CPU with AES!\n" );
        return false;
      }
-     if ( sw_has_sha && algo_has_sha && !cpu_has_sha )
+     if ( sw_has_sha && !cpu_has_sha )
      {
-        printf( "A CPU with SHA is required!\n" );
+        printf( "The SW build requires a CPU with SHA!\n" );
        return false;
      }
@@ -3187,6 +3203,9 @@ int main(int argc, char *argv[])
    }
 #endif
 
+   if ( num_cpus != opt_n_threads )
+      applog( LOG_INFO, "%u CPU cores available, %u miner threads selected.",
+              num_cpus, opt_n_threads );
    if ( opt_affinity != -1 )
    {
       if ( num_cpus > 64 )
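On the sharediff plumbing above: the work_set_target_ratio() calls added to the scanhash routines earlier in this patch populate work->sharediff from the winning hash, submit_upstream_work() copies it into stratum.sharediff, and share_result() falls back to the stratum copy when no work context is available. The relationship being captured is that a share's difficulty scales with how far below the target its hash lands. A rough sketch under that assumption (hypothetical helper, not the patch's code; the real work_set_target_ratio() may compute this differently, e.g. over the full 256-bit values):

    #include <stdint.h>

    // Approximate a share's difficulty from the high 64 bits of its hash
    // and target; words 6..7 are most significant in this byte order,
    // matching the hash64[7] checks in the scanhash routines.
    static double approx_share_diff( const uint32_t *hash,
                                     const uint32_t *target,
                                     double targetdiff )
    {
       uint64_t h = ( (uint64_t)hash[7]   << 32 ) | hash[6];
       uint64_t t = ( (uint64_t)target[7] << 32 ) | target[6];
       if ( h == 0 ) h = 1;   // guard the (vanishingly unlikely) zero hash
       return targetdiff * ( (double)t / (double)h );
    }

A hash exactly on target gives a share difficulty equal to targetdiff, and a hash half the target doubles it; that value is what the new "diff %.3g" field in the accepted/rejected message reports.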
diff --git a/miner.h b/miner.h
index 248c76e..b753f68 100644
--- a/miner.h
+++ b/miner.h
@@ -736,7 +736,7 @@ Options:\n\
                           whirlpool\n\
                           whirlpoolx\n\
                           x11          Dash\n\
-                          x11evo       Revolvercoin\n\
+                          x11evo       Revolvercoin (XRE)\n\
                           x11gost      sib (SibCoin)\n\
                           x13          X13\n\
                           x13sm3       hsr (Hshare)\n\
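Finally, the register_x17_algo() stub removed from x17.c shows the algo-gate shape that this release's new 4way implementations plug into, selected at compile time by gates like POLYTIMOS_4WAY above. A sketch of the pattern for a hypothetical algorithm; every name containing "myalgo" is illustrative and not from this patch, and the scanhash signature is modeled on the scanhash_x17 fragment above:

    #include <stdbool.h>
    #include <stdint.h>
    #include "algo-gate-api.h"

    #if defined(__AVX2__) && defined(__AES__)
      #define MYALGO_4WAY
    #endif

    // Hypothetical entry points for the scalar and 4way code paths.
    int  scanhash_myalgo( int thr_id, struct work *work,
                          uint32_t max_nonce, uint64_t *hashes_done );
    void myalgo_hash( void *output, const void *input );
    #if defined(MYALGO_4WAY)
    int  scanhash_myalgo_4way( int thr_id, struct work *work,
                               uint32_t max_nonce, uint64_t *hashes_done );
    void myalgo_hash_4way( void *output, const void *input );
    #endif

    bool register_myalgo_algo( algo_gate_t* gate )
    {
       gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
    #if defined(MYALGO_4WAY)
       // 4way build: one pass hashes four interleaved nonce lanes.
       gate->scanhash = (void*)&scanhash_myalgo_4way;
       gate->hash     = (void*)&myalgo_hash_4way;
    #else
       gate->scanhash = (void*)&scanhash_myalgo;
       gate->hash     = (void*)&myalgo_hash;
    #endif
       return true;
    }

Resolving the function pointers once at registration keeps the per-nonce hot loop free of any CPU-feature branching.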