diff --git a/Makefile.am b/Makefile.am index 1dfadfb..13ee81e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -90,6 +90,7 @@ cpuminer_SOURCES = \ algo/hodl/hodl-wolf.c \ algo/hodl/sha512_avx.c \ algo/hodl/sha512_avx2.c \ + algo/jh/jha.c \ algo/lbry.c \ algo/luffa/luffa.c \ algo/luffa/sse2/luffa_for_sse2.c \ diff --git a/README.md b/README.md index a31c9ba..8c58055 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Supported Algorithms heavy Heavy hmq1725 Espers hodl Hodlcoin + jha Jackpotcoin keccak Keccak lbry LBC, LBRY Credits luffa Luffa @@ -59,6 +60,7 @@ Supported Algorithms skein Skein+Sha (Skeincoin) skein2 Double Skein (Woodcoin) timetravel Machinecoin (MAC) + timetravel10 Bitcore vanilla blake256r8vnl (VCash) veltor whirlpool diff --git a/RELEASE_NOTES b/RELEASE_NOTES index f00f6aa..4404ea7 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -6,6 +6,9 @@ compile flag. HW SHA support is only available when compiled from source, Windows binaries are not yet available. +cpuminer-opt is a console program; if you're using a mouse, you're doing it +wrong. + Compile Instructions -------------------- @@ -118,6 +121,11 @@ Support for even older x86_64 without AES_NI or SSE2 is not available. Change Log ---------- +v3.6.5 + +Cryptonight a little faster. +Added jha algo (Jackpotcoin) with AES optimizations. 
+ v3.6.4 Added support for Bitcore (BTX) using the timetravel10 algo, optimized for diff --git a/algo-gate-api.c b/algo-gate-api.c index 8919db6..0e77ca1 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -169,6 +169,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_HEAVY: register_heavy_algo ( gate ); break; case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break; case ALGO_HODL: register_hodl_algo ( gate ); break; + case ALGO_JHA: register_jha_algo ( gate ); break; case ALGO_KECCAK: register_keccak_algo ( gate ); break; case ALGO_LBRY: register_lbry_algo ( gate ); break; case ALGO_LUFFA: register_luffa_algo ( gate ); break; @@ -276,6 +277,7 @@ const char* const algo_alias_map[][2] = { "droplp", "drop" }, { "espers", "hmq1725" }, { "flax", "c11" }, + { "jackpot", "jha" }, { "jane", "scryptjane" }, { "lyra2", "lyra2re" }, { "lyra2v2", "lyra2rev2" }, diff --git a/algo/cryptonight/cryptonight-aesni.c b/algo/cryptonight/cryptonight-aesni.c index f906c17..2893703 100644 --- a/algo/cryptonight/cryptonight-aesni.c +++ b/algo/cryptonight/cryptonight-aesni.c @@ -109,43 +109,43 @@ static __thread cryptonight_ctx ctx; void cryptonight_hash_aes( void *restrict output, const void *input, int len ) { #ifndef NO_AES_NI - keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 ); + uint8_t ExpandedKey[256] __attribute__((aligned(64))); + __m128i *longoutput, *expkey, *xmminput; size_t i, j; - memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE); - memcpy(ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE); - ExpandAESKey256(ExpandedKey); + keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 ); + memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE ); + ExpandAESKey256( ExpandedKey ); + memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE ); - __m128i *longoutput, *expkey, *xmminput; - longoutput = (__m128i *)ctx.long_state; - expkey = (__m128i *)ExpandedKey; - xmminput = (__m128i *)ctx.text; + longoutput = (__m128i*)ctx.long_state; + xmminput = 
(__m128i*)ctx.text; + expkey = (__m128i*)ExpandedKey; - //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) - // aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey); - - // prefetch expkey, all of xmminput and enough longoutput for 4 loops + // prefetch expkey, xmminput and enough longoutput for 4 iterations _mm_prefetch( xmminput, _MM_HINT_T0 ); _mm_prefetch( xmminput + 4, _MM_HINT_T0 ); - for ( i = 0; i < 64; i += 16 ) - { - _mm_prefetch( longoutput + i, _MM_HINT_T0 ); - _mm_prefetch( longoutput + i + 4, _MM_HINT_T0 ); - _mm_prefetch( longoutput + i + 8, _MM_HINT_T0 ); - _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 ); - } _mm_prefetch( expkey, _MM_HINT_T0 ); _mm_prefetch( expkey + 4, _MM_HINT_T0 ); _mm_prefetch( expkey + 8, _MM_HINT_T0 ); - - for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I ) + for ( i = 0; i < 64; i += 16 ) { - // prefetch 4 loops ahead, + __builtin_prefetch( longoutput + i, 1, 0 ); + __builtin_prefetch( longoutput + i + 4, 1, 0 ); + __builtin_prefetch( longoutput + i + 8, 1, 0 ); + __builtin_prefetch( longoutput + i + 12, 1, 0 ); + } + + // n-4 iterations + for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I ); + i += INIT_SIZE_M128I ) + { + // prefetch 4 iterations ahead. 
__builtin_prefetch( longoutput + i + 64, 1, 0 ); __builtin_prefetch( longoutput + i + 68, 1, 0 ); - for (j = 0; j < 10; j++ ) + for ( j = 0; j < 10; j++ ) { xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] ); xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] ); @@ -165,84 +165,99 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) _mm_store_si128( &( longoutput[i+6] ), xmminput[6] ); _mm_store_si128( &( longoutput[i+7] ), xmminput[7] ); } + // last 4 iterations + for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I ) + { + for ( j = 0; j < 10; j++ ) + { + xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] ); + xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] ); + xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] ); + xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] ); + xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] ); + xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] ); + xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] ); + xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] ); + } + _mm_store_si128( &( longoutput[i ] ), xmminput[0] ); + _mm_store_si128( &( longoutput[i+1] ), xmminput[1] ); + _mm_store_si128( &( longoutput[i+2] ), xmminput[2] ); + _mm_store_si128( &( longoutput[i+3] ), xmminput[3] ); + _mm_store_si128( &( longoutput[i+4] ), xmminput[4] ); + _mm_store_si128( &( longoutput[i+5] ), xmminput[5] ); + _mm_store_si128( &( longoutput[i+6] ), xmminput[6] ); + _mm_store_si128( &( longoutput[i+7] ), xmminput[7] ); + } -// cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) , -// casti_m128i( ctx.state.k, 2 ) ); -// cast_m128i( ctx.b ) = _mm_xor_si128( casti_m128i( ctx.state.k, 1 ), -// casti_m128i( ctx.state.k, 3 ) ); + ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4]; + ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6]; + ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5]; + ctx.b[1] = 
((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7]; - ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4]; - ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6]; - ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5]; - ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7]; - -// for (i = 0; i < 2; i++) -// { -// ctx.a[i] = ((uint64_t *)ctx.state.k)[i] ^ ((uint64_t *)ctx.state.k)[i+4]; -// ctx.b[i] = ((uint64_t *)ctx.state.k)[i+2] ^ ((uint64_t *)ctx.state.k)[i+6]; -// } - - __m128i b_x = _mm_load_si128((__m128i *)ctx.b); - uint64_t a[2] __attribute((aligned(16))), b[2] __attribute((aligned(16))); + uint64_t a[2] __attribute((aligned(16))), + b[2] __attribute((aligned(16))), + c[2] __attribute((aligned(16))); a[0] = ctx.a[0]; a[1] = ctx.a[1]; - - for(i = 0; __builtin_expect(i < 0x80000, 1); i++) + __m128i b_x = _mm_load_si128( (__m128i*)ctx.b ); + __m128i a_x = _mm_load_si128( (__m128i*)a ); + __m128i* lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ]; + __m128i c_x = _mm_load_si128( lsa ); + uint64_t *nextblock; + uint64_t hi, lo; + + // n-1 iterations + for( i = 0; __builtin_expect( i < 0x7ffff, 1 ); i++ ) { - uint64_t c[2]; - __builtin_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], 0, 1 ); - - __m128i c_x = _mm_load_si128( - (__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]); - __m128i a_x = _mm_load_si128((__m128i *)a); - c_x = _mm_aesenc_si128(c_x, a_x); - _mm_store_si128((__m128i *)c, c_x); - - b_x = _mm_xor_si128(b_x, c_x); - _mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x); - - uint64_t *nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0]; -// uint64_t b[2]; + c_x = _mm_aesenc_si128( c_x, a_x ); + _mm_store_si128( (__m128i*)c, c_x ); + b_x = _mm_xor_si128( b_x, c_x ); + nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0]; + _mm_store_si128( lsa, b_x ); b[0] = nextblock[0]; b[1] = nextblock[1]; - { - uint64_t hi, lo; - // hi,lo = 64bit x 64bit multiply of c[0] 
and b[0] + // hi,lo = 64bit x 64bit multiply of c[0] and b[0] + __asm__( "mulq %3\n\t" + : "=d" ( hi ), + "=a" ( lo ) + : "%a" ( c[0] ), + "rm" ( b[0] ) + : "cc" ); - __asm__("mulq %3\n\t" - : "=d" (hi), - "=a" (lo) - : "%a" (c[0]), - "rm" (b[0]) - : "cc" ); - - a[0] += hi; - a[1] += lo; - } - uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0]; -// __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0]; - -// *dst = cast_m128i( a ); - dst[0] = a[0]; - dst[1] = a[1]; - -// cast_m128i( a ) = _mm_xor_si128( cast_m128i( a ), cast_m128i( b ) ); - a[0] ^= b[0]; - a[1] ^= b[1]; - b_x = c_x; - __builtin_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], 0, 3 ); + b_x = c_x; + nextblock[0] = a[0] + hi; + nextblock[1] = a[1] + lo; + a[0] = b[0] ^ nextblock[0]; + a[1] = b[1] ^ nextblock[1]; + lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ]; + a_x = _mm_load_si128( (__m128i*)a ); + c_x = _mm_load_si128( lsa ); } + // abreviated nth iteration + c_x = _mm_aesenc_si128( c_x, a_x ); + _mm_store_si128( (__m128i*)c, c_x ); + b_x = _mm_xor_si128( b_x, c_x ); + nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0]; + _mm_store_si128( lsa, b_x ); + b[0] = nextblock[0]; + b[1] = nextblock[1]; + + __asm__( "mulq %3\n\t" + : "=d" ( hi ), + "=a" ( lo ) + : "%a" ( c[0] ), + "rm" ( b[0] ) + : "cc" ); + + nextblock[0] = a[0] + hi; + nextblock[1] = a[1] + lo; - memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE ); memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE ); ExpandAESKey256( ExpandedKey ); - - //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) - // aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]); + memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE ); // prefetch expkey, all of xmminput and enough longoutput for 4 loops - _mm_prefetch( xmminput, _MM_HINT_T0 ); _mm_prefetch( xmminput + 4, _MM_HINT_T0 ); for ( i = 0; i < 64; i += 16 ) @@ -256,9 +271,11 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) _mm_prefetch( expkey + 
4, _MM_HINT_T0 ); _mm_prefetch( expkey + 8, _MM_HINT_T0 ); - for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I ) + // n-4 iterations + for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I ); + i += INIT_SIZE_M128I ) { - // stay 4 loops ahead, + // stay 4 iterations ahead. _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 ); _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 ); @@ -283,10 +300,34 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] ); } } - + // last 4 iterations + for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I ) + { + xmminput[0] = _mm_xor_si128( longoutput[i ], xmminput[0] ); + xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] ); + xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] ); + xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] ); + xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] ); + xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] ); + xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] ); + xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] ); + + for( j = 0; j < 10; j++ ) + { + xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] ); + xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] ); + xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] ); + xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] ); + xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] ); + xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] ); + xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] ); + xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] ); + } + } + memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE); keccakf( (uint64_t*)&ctx.state.hs.w, 24 ); - extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output); + #endif } diff --git a/algo/jh/jha.c b/algo/jh/jha.c new file mode 100644 index 0000000..62eccd5 --- /dev/null +++ b/algo/jh/jha.c @@ -0,0 +1,166 @@ +#include "miner.h" 
+#include "algo-gate-api.h" + +#include +#include +#include +#include + +#include "algo/blake/sph_blake.h" +#include "algo/jh/sph_jh.h" +#include "algo/keccak/sph_keccak.h" +#include "algo/skein/sph_skein.h" + +#ifdef NO_AES_NI + #include "algo/groestl/sph_groestl.h" +#else + #include "algo/groestl/aes_ni/hash-groestl.h" +#endif + +static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64))); + +void jha_kec_midstate( const void* input ) +{ + sph_keccak512_init( &jha_kec_mid ); + sph_keccak512( &jha_kec_mid, input, 64 ); +} + +void jha_hash(void *output, const void *input) +{ + uint8_t _ALIGN(128) hash[64]; + +#ifdef NO_AES_NI + sph_groestl512_context ctx_groestl; +#else + hashState_groestl ctx_groestl; +#endif + sph_blake512_context ctx_blake; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + + sph_keccak512_init(&ctx_keccak); + memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid ); + sph_keccak512(&ctx_keccak, input+64, 16 ); + sph_keccak512_close(&ctx_keccak, hash ); + + // Heavy & Light Pair Loop + for (int round = 0; round < 3; round++) + { + if (hash[0] & 0x01) + { +#ifdef NO_AES_NI + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, hash, 64 ); + sph_groestl512_close(&ctx_groestl, hash ); +#else + init_groestl( &ctx_groestl, 64 ); + update_and_final_groestl( &ctx_groestl, (char*)hash, + (char*)hash, 512 ); +#endif + } + else + { + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash ); + } + + if (hash[0] & 0x01) + { + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, hash, 64); + sph_blake512_close(&ctx_blake, hash ); + } + else + { + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, hash, 64 ); + sph_jh512_close(&ctx_jh, hash ); + } + } + + memcpy(output, hash, 32); +} + +int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) +{ + uint32_t _ALIGN(128) hash32[8]; + uint32_t 
_ALIGN(128) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + uint32_t n = pdata[19] - 1; + + uint64_t htmax[] = { + 0, + 0xF, + 0xFF, + 0xFFF, + 0xFFFF, + 0x10000000 + }; + uint32_t masks[] = { + 0xFFFFFFFF, + 0xFFFFFFF0, + 0xFFFFFF00, + 0xFFFFF000, + 0xFFFF0000, + 0 + }; + + // we need bigendian data... + for (int i=0; i < 19; i++) { + be32enc(&endiandata[i], pdata[i]); + } + + jha_kec_midstate( endiandata ); + +#ifdef DEBUG_ALGO + printf("[%d] Htarg=%X\n", thr_id, Htarg); +#endif + for (int m=0; m < 6; m++) { + if (Htarg <= htmax[m]) { + uint32_t mask = masks[m]; + do { + pdata[19] = ++n; + be32enc(&endiandata[19], n); + jha_hash(hash32, endiandata); +#ifndef DEBUG_ALGO + if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) { + work_set_target_ratio(work, hash32); + *hashes_done = n - first_nonce + 1; + return 1; + } +#else + if (!(n % 0x1000) && !thr_id) printf("."); + if (!(hash32[7] & mask)) { + printf("[%d]",thr_id); + if (fulltest(hash32, ptarget)) { + work_set_target_ratio(work, hash32); + *hashes_done = n - first_nonce + 1; + return 1; + } + } +#endif + } while (n < max_nonce && !work_restart[thr_id].restart); + // see blake.c if else to understand the loop on htmax => mask + break; + } + } + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +bool register_jha_algo( algo_gate_t* gate ) +{ + gate->optimizations = SSE2_OPT | AES_OPT; + gate->scanhash = (void*)&scanhash_jha; + gate->hash = (void*)&jha_hash; + gate->set_target = (void*)&scrypt_set_target; + return true; +}; + diff --git a/configure.ac b/configure.ac index 96cc8dd..39bc7c2 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.6.4]) +AC_INIT([cpuminer-opt], [3.6.5]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/miner.h b/miner.h index 8021503..d510f5e 100644 --- a/miner.h +++ b/miner.h @@ -495,6 +495,7 @@ enum algos 
{ ALGO_HEAVY, ALGO_HMQ1725, ALGO_HODL, + ALGO_JHA, ALGO_KECCAK, ALGO_LBRY, ALGO_LUFFA, @@ -558,6 +559,7 @@ static const char* const algo_names[] = { "heavy", "hmq1725", "hodl", + "jha", "keccak", "lbry", "luffa", @@ -675,6 +677,7 @@ Options:\n\ heavy Heavy\n\ hmq1725 Espers\n\ hodl Hodlcoin\n\ + jha jackpot (Jackpotcoin)\n\ keccak Keccak\n\ lbry LBC, LBRY Credits\n\ luffa Luffa\n\