diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..c8c1d9b --- /dev/null +++ b/README.txt @@ -0,0 +1,22 @@ +This file is included in the Windows binary package. Compile instructions +for Linux and Windows can be found in RELEASE_NOTES. + +Choose the exe that best matches your CPU's features or use trial and +error to find the fastest one that doesn't crash. Pay attention to +the features listed at cpuminer startup to ensure you are mining at +optimum speed using all the available features. + +Architecture names and compile options used are only provided for Intel +Core series. Pentium and Celeron often have fewer features. +AMD is YMMV, see previous paragraph. + +Exe name Compile opts Arch name + +cpuminer-sse2.exe -march=core2, Core2 +cpuminer-sse42.exe -march=corei7, Nehalem +cpuminer-aes-sse42.exe -maes -msse4.2 Westmere +cpuminer-aes-avx.exe -march=corei7-avx, Sandybridge, Ivybridge +cpuminer-aes-avx2.exe -march=core-avx2, Haswell, Broadwell, Skylake, Kabylake + + + diff --git a/RELEASE_NOTES b/RELEASE_NOTES index cd901a7..bc2da42 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -3,15 +3,19 @@ Compile instruction for Linux and Windows are at the bottom of this file. Change Log ---------- +v3.5.7 + +Cryptonight 5% faster + v3.5.6 -Updated Lyra2z for new zcoin algo post block 2050. -Cleanup up Lyra2 code and increased performance +Updated Lyra2z algo for new zcoin algo post block 2050. +Cleaned up Lyra2 code and increased performance + - Lyra2Z (zcoin) +12% - Lyra2REv2 +11% - Lyra2RE +6% - - Lyra2Z (zcoin) +12% -Fixed performance of x11evo on Windows to match Linux. -Timetravel 3% to 5% faster +Fixed x11evo algo performance on Windows. +Timetravel algo 3% to 5% faster Whirlpool algo 15% faster. Removed aclocal.m4 from .gitignore. 
diff --git a/algo/cryptonight/cryptonight-aesni.c b/algo/cryptonight/cryptonight-aesni.c index 4d7434f..dbbde3c 100644 --- a/algo/cryptonight/cryptonight-aesni.c +++ b/algo/cryptonight/cryptonight-aesni.c @@ -93,15 +93,15 @@ static inline void ExpandAESKey256(char *keybuf) #endif } +// align to 64 byte cache line typedef struct { - uint8_t long_state[MEMORY] __attribute((aligned(16))); + uint8_t long_state[MEMORY] __attribute((aligned(64))); union cn_slow_hash_state state; - uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(16))); - uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16))); - uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16))); - uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(16))); -// oaes_ctx* aes_ctx; + uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(64))); + uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64))); + uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64))); + uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(64))); } cryptonight_ctx; static __thread cryptonight_ctx ctx; @@ -110,7 +110,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) { #ifndef NO_AES_NI keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 ); - uint8_t ExpandedKey[256]; + uint8_t ExpandedKey[256] __attribute__((aligned(64))); size_t i, j; memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE); @@ -118,34 +118,53 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) ExpandAESKey256(ExpandedKey); __m128i *longoutput, *expkey, *xmminput; - longoutput = (__m128i *)ctx.long_state; - expkey = (__m128i *)ExpandedKey; - xmminput = (__m128i *)ctx.text; + longoutput = (__m128i *)ctx.long_state; + expkey = (__m128i *)ExpandedKey; + xmminput = (__m128i *)ctx.text; //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) // aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey); - for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) + // prefetch expkey, all of xmminput and 
enough longoutput for 4 loops + _mm_prefetch( expkey, _MM_HINT_T0 ); + _mm_prefetch( expkey + 4, _MM_HINT_T0 ); + _mm_prefetch( expkey + 8, _MM_HINT_T0 ); + _mm_prefetch( xmminput, _MM_HINT_T0 ); + _mm_prefetch( xmminput + 4, _MM_HINT_T0 ); + + for ( i = 0; i < 64; i += 8 ) { - for(j = 0; j < 10; j++) + _mm_prefetch( longoutput + i, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 4, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 8, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 ); + } + + for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I ) + { + // prefetch 4 loops ahead, + _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 ); + + for (j = 0; j < 10; j++ ) { - xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]); - xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]); - xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]); - xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]); - xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]); - xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]); - xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]); - xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]); + xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] ); + xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] ); + xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] ); + xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] ); + xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] ); + xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] ); + xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] ); + xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] ); } - _mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]); - _mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]); - _mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]); - _mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]); - _mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]); - 
_mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]); - _mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]); - _mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]); + _mm_store_si128( &( longoutput[i ] ), xmminput[0] ); + _mm_store_si128( &( longoutput[i+1] ), xmminput[1] ); + _mm_store_si128( &( longoutput[i+2] ), xmminput[2] ); + _mm_store_si128( &( longoutput[i+3] ), xmminput[3] ); + _mm_store_si128( &( longoutput[i+4] ), xmminput[4] ); + _mm_store_si128( &( longoutput[i+5] ), xmminput[5] ); + _mm_store_si128( &( longoutput[i+6] ), xmminput[6] ); + _mm_store_si128( &( longoutput[i+7] ), xmminput[7] ); } // cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) , @@ -171,13 +190,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) for(i = 0; __builtin_expect(i < 0x80000, 1); i++) { - __m128i c_x = _mm_load_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]); - __m128i a_x = _mm_load_si128((__m128i *)a); - uint64_t c[2]; - c_x = _mm_aesenc_si128(c_x, a_x); + uint64_t c[2]; + _mm_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], _MM_HINT_T0 ); + __m128i c_x = _mm_load_si128( + (__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]); + __m128i a_x = _mm_load_si128((__m128i *)a); + c_x = _mm_aesenc_si128(c_x, a_x); _mm_store_si128((__m128i *)c, c_x); - __builtin_prefetch(&ctx.long_state[c[0] & 0x1FFFF0], 0, 1); b_x = _mm_xor_si128(b_x, c_x); _mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x); @@ -202,8 +222,9 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) a[1] += lo; } uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0]; +// __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0]; -// cast_m128i( dst ) = cast_m128i( a ); +// *dst = cast_m128i( a ); dst[0] = a[0]; dst[1] = a[1]; @@ -211,41 +232,59 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) a[0] ^= b[0]; a[1] ^= b[1]; b_x = c_x; - __builtin_prefetch(&ctx.long_state[a[0] & 
0x1FFFF0], 0, 3); + _mm_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], _MM_HINT_T0 ); } - memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE); - memcpy(ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE); - ExpandAESKey256(ExpandedKey); + memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE ); + memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE ); + ExpandAESKey256( ExpandedKey ); //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) // aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]); - for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE) - { - xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]); - xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]); - xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]); - xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]); - xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]); - xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]); - xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]); - xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]); + // prefetch expkey, all of xmminput and enough longoutput for 4 loops + _mm_prefetch( xmminput, _MM_HINT_T0 ); + _mm_prefetch( xmminput + 4, _MM_HINT_T0 ); + for ( i = 0; i < 64; i += 16 ) + { + _mm_prefetch( longoutput + i, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 4, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 8, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 ); + } + _mm_prefetch( expkey, _MM_HINT_T0 ); + _mm_prefetch( expkey + 4, _MM_HINT_T0 ); + _mm_prefetch( expkey + 8, _MM_HINT_T0 ); + + for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I ) + { + // stay 4 loops ahead, + _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 ); + + xmminput[0] = _mm_xor_si128( longoutput[i ], xmminput[0] ); + xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] ); + xmminput[2] = 
_mm_xor_si128( longoutput[i+2], xmminput[2] ); + xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] ); + xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] ); + xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] ); + xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] ); + xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] ); - for(j = 0; j < 10; j++) - { - xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]); - xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]); - xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]); - xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]); - xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]); - xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]); - xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]); - xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]); - } + for( j = 0; j < 10; j++ ) + { + xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] ); + xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] ); + xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] ); + xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] ); + xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] ); + xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] ); + xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] ); + xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] ); + } } - memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE); + memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE); keccakf( (uint64_t*)&ctx.state.hs.w, 24 ); extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output); diff --git a/algo/cryptonight/cryptonight.h b/algo/cryptonight/cryptonight.h index 023a2e9..5124594 100644 --- a/algo/cryptonight/cryptonight.h +++ b/algo/cryptonight/cryptonight.h @@ -6,11 +6,13 @@ #include "miner.h" #define MEMORY (1 << 21) /* 2 MiB */ +#define MEMORY_M128I (MEMORY >> 4) // 2 MiB / 16 = 128 ki * __m128i #define ITER (1 << 20) #define AES_BLOCK_SIZE 16 #define AES_KEY_SIZE 32 /*16*/ #define 
INIT_SIZE_BLK 8 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 +#define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4) // 8 #pragma pack(push, 1) diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c index 87727d7..8323e37 100644 --- a/algo/lyra2/lyra2.c +++ b/algo/lyra2/lyra2.c @@ -133,6 +133,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block +// from here on it's all simd acces to state and matrix +// define vector pointers and adjust sizes and pointer offsets + //================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) diff --git a/configure.ac b/configure.ac index 81266e8..623d97a 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.5.6]) +AC_INIT([cpuminer-opt], [3.5.7]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM