From 4521b324e30b337e2605b8e824cec46e7657a428 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 16 Feb 2017 10:51:20 -0500 Subject: [PATCH] v3.5.8 --- RELEASE_NOTES | 6 ++++ algo/cryptonight/cryptonight-aesni.c | 18 +++++------ algo/cryptonight/cryptonight.c | 32 +++++++++++++++++++ algo/lyra2/lyra2.c | 46 +++++----------------------- algo/lyra2/lyra2re.c | 3 +- algo/lyra2/lyra2rev2.c | 3 ++ algo/veltor.c | 18 ++++++++++- algo/xevan.c | 18 +++++++++-- configure.ac | 2 +- 9 files changed, 94 insertions(+), 52 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index bc2da42..9c9713a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -2,6 +2,12 @@ Compile instruction for Linux and Windows are at the bottom of this file. Change Log ---------- +v3.5.8 + +Lyra2RE fixed on Windows, broken in v3.5.6. +Ported AES cryptonight optimizations from v3.5.7 to non-AES version + with little improvement. +Marginal improvements to xevan and veltor. v3.5.7 diff --git a/algo/cryptonight/cryptonight-aesni.c b/algo/cryptonight/cryptonight-aesni.c index dbbde3c..00145b1 100644 --- a/algo/cryptonight/cryptonight-aesni.c +++ b/algo/cryptonight/cryptonight-aesni.c @@ -126,25 +126,24 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) // aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey); // prefetch expkey, all of xmminput and enough longoutput for 4 loops - _mm_prefetch( expkey, _MM_HINT_T0 ); - _mm_prefetch( expkey + 4, _MM_HINT_T0 ); - _mm_prefetch( expkey + 8, _MM_HINT_T0 ); _mm_prefetch( xmminput, _MM_HINT_T0 ); _mm_prefetch( xmminput + 4, _MM_HINT_T0 ); - - for ( i = 0; i < 64; i += 8 ) + for ( i = 0; i < 64; i += 16 ) { _mm_prefetch( longoutput + i, _MM_HINT_T0 ); _mm_prefetch( longoutput + i + 4, _MM_HINT_T0 ); _mm_prefetch( longoutput + i + 8, _MM_HINT_T0 ); _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 ); } + _mm_prefetch( expkey, _MM_HINT_T0 ); + _mm_prefetch( expkey + 4, _MM_HINT_T0 ); + _mm_prefetch( expkey + 8, _MM_HINT_T0 ); for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I ) { // prefetch 4 loops ahead, - _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 ); - _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 ); + __builtin_prefetch( longoutput + i + 64, 1, 0 ); + __builtin_prefetch( longoutput + i + 68, 1, 0 ); for (j = 0; j < 10; j++ ) { @@ -191,7 +190,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) for(i = 0; __builtin_expect(i < 0x80000, 1); i++) { uint64_t c[2]; - _mm_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], _MM_HINT_T0 ); + __builtin_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], 0, 1 ); __m128i c_x = _mm_load_si128( (__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]); @@ -232,7 +231,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) a[0] ^= b[0]; a[1] ^= b[1]; b_x = c_x; - _mm_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], _MM_HINT_T0 ); + __builtin_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], 0, 3 ); } memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE ); @@ -243,6 +242,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) // aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]); // prefetch expkey, all of xmminput and enough longoutput for 4 loops + _mm_prefetch( xmminput, _MM_HINT_T0 ); _mm_prefetch( xmminput + 4, _MM_HINT_T0 ); for ( i = 0; i < 64; i += 16 ) diff --git a/algo/cryptonight/cryptonight.c b/algo/cryptonight/cryptonight.c index 411be75..fb0eefb 100644 --- a/algo/cryptonight/cryptonight.c +++ b/algo/cryptonight/cryptonight.c @@ -5,6 +5,7 @@ // Modified for CPUminer by Lucas Jones #include "miner.h" +#include #if defined(__arm__) || defined(_MSC_VER) #ifndef NOASM @@ -175,11 +176,27 @@ void cryptonight_hash_ctx(void* output, const void* input, int len) { hash_process(&ctx.state.hs, (const uint8_t*) input, len); ctx.aes_ctx = (oaes_ctx*) oaes_alloc(); + + __builtin_prefetch( ctx.text, 0, 3 ); + __builtin_prefetch( ctx.text + 64, 0, 3 ); + __builtin_prefetch( ctx.long_state, 1, 0 ); + __builtin_prefetch( ctx.long_state + 64, 1, 0 ); + __builtin_prefetch( ctx.long_state + 128, 1, 0 ); + __builtin_prefetch( ctx.long_state + 192, 1, 0 ); + __builtin_prefetch( ctx.long_state + 256, 1, 0 ); + __builtin_prefetch( ctx.long_state + 320, 1, 0 ); + __builtin_prefetch( ctx.long_state + 384, 1, 0 ); + __builtin_prefetch( ctx.long_state + 448, 1, 0 ); + size_t i, j; memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE); oaes_key_import_data(ctx.aes_ctx, ctx.state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { + + __builtin_prefetch( ctx.long_state + i + 512, 1, 0 ); + __builtin_prefetch( ctx.long_state + i + 576, 1, 0 ); + aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 0], ctx.aes_ctx->key->exp_data); aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 1], ctx.aes_ctx->key->exp_data); aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 2], ctx.aes_ctx->key->exp_data); @@ -213,9 +230,24 @@ void cryptonight_hash_ctx(void* output, const void* input, int len) mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)]); } + __builtin_prefetch( ctx.text, 0, 3 ); + __builtin_prefetch( ctx.text + 64, 0, 3 ); + __builtin_prefetch( ctx.long_state, 1, 0 ); + __builtin_prefetch( ctx.long_state + 64, 1, 0 ); + __builtin_prefetch( ctx.long_state + 128, 1, 0 ); + __builtin_prefetch( ctx.long_state + 192, 1, 0 ); + __builtin_prefetch( ctx.long_state + 256, 1, 0 ); + __builtin_prefetch( ctx.long_state + 320, 1, 0 ); + __builtin_prefetch( ctx.long_state + 384, 1, 0 ); + __builtin_prefetch( ctx.long_state + 448, 1, 0 ); + memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE); oaes_key_import_data(ctx.aes_ctx, &ctx.state.hs.b[32], AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { + + __builtin_prefetch( ctx.long_state + i + 512, 1, 0 ); + __builtin_prefetch( ctx.long_state + i + 576, 1, 0 ); + xor_blocks(&ctx.text[0 * AES_BLOCK_SIZE], &ctx.long_state[i + 0 * AES_BLOCK_SIZE]); aesb_pseudo_round_mut(&ctx.text[0 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data); xor_blocks(&ctx.text[1 * AES_BLOCK_SIZE], &ctx.long_state[i + 1 * AES_BLOCK_SIZE]); diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c index 8323e37..f987873 100644 --- a/algo/lyra2/lyra2.c +++ b/algo/lyra2/lyra2.c @@ -71,20 +71,6 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, // for Lyra2REv2, nCols = 4, v1 was using 8 const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; -/* - i = (int64_t)ROW_LEN_BYTES * nRows; - uint64_t *wholeMatrix = _mm_malloc( i, 64 ); - if (wholeMatrix == NULL) - return -1; - -#if defined (__AVX2__) - memset_zero_m256i( (__m256i*)wholeMatrix, i/32 ); -#elif defined(__AVX__) - memset_zero_m128i( (__m128i*)wholeMatrix, i/16 ); -#else - memset(wholeMatrix, 0, i); -#endif -*/ uint64_t *ptrWord = wholeMatrix; //=== Getting the password + salt + basil padded with 10*1 ==========// @@ -219,13 +205,12 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, //Squeezes the key squeeze(state, K, (unsigned int) kLen); - //================== Freeing the memory =============================// -// free(wholeMatrix); - return 0; } -int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ) +int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, + uint64_t pwdlen, const void *salt, uint64_t saltlen, + uint64_t timeCost, uint64_t nRows, uint64_t nCols ) { //========================== Basic variables ============================// uint64_t _ALIGN(256) state[16]; @@ -244,27 +229,14 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; -/* - i = (int64_t)ROW_LEN_BYTES * nRows; - uint64_t *wholeMatrix = _mm_malloc( i, 64 ); - if (wholeMatrix == NULL) - return -1; - -#if defined (__AVX2__) - memset_zero_m256i( (__m256i*)wholeMatrix, i/32 ); -#elif defined(__AVX__) - memset_zero_m128i( (__m128i*)wholeMatrix, i/16 ); -#else - memset(wholeMatrix, 0, i); -#endif -*/ //==== Getting the password + salt + basil padded with 10*1 ============// //OBS.:The memory matrix will temporarily hold the password: not for saving memory, //but this ensures that the password copied locally will be overwritten as soon as possible //First, we clean enough blocks for the password, salt, basil and padding - uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * + sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; byte *ptrByte = (byte*) wholeMatrix; memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES ); @@ -366,17 +338,15 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint //========================= Wrap-up Phase ===============================// //Absorbs the last block of the memory matrix - absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]); + absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]); //Squeezes the key squeeze( state, K, kLen ); - //====================== Freeing the memory =============================// -// _mm_free(state); -// _mm_free( wholeMatrix ); return 0; } +// Lyra2RE doesn't like the new wholeMatrix implementation int LYRA2RE( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, const uint64_t nRows, const uint64_t nCols ) @@ -548,7 +518,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, squeeze(state, K, (unsigned int) kLen); //================== Freeing the memory =============================// - free(wholeMatrix); + _mm_free(wholeMatrix); return 0; } diff --git a/algo/lyra2/lyra2re.c b/algo/lyra2/lyra2re.c index 31b44d4..7feffd5 100644 --- a/algo/lyra2/lyra2re.c +++ b/algo/lyra2/lyra2re.c @@ -69,7 +69,8 @@ void lyra2re_hash(void *state, const void *input) sph_keccak256(&ctx.keccak, hashA, 32); sph_keccak256_close(&ctx.keccak, hashB); - LYRA2RE( hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + LYRA2RE( hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); +// LYRA2RE( lyra2re_wholeMatrix, hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); sph_skein256(&ctx.skein, hashA, 32); sph_skein256_close(&ctx.skein, hashB); diff --git a/algo/lyra2/lyra2rev2.c b/algo/lyra2/lyra2rev2.c index 725612e..c4e7d90 100644 --- a/algo/lyra2/lyra2rev2.c +++ b/algo/lyra2/lyra2rev2.c @@ -12,6 +12,9 @@ #include "lyra2.h" #include "avxdefs.h" +// This gets allocated when miner_thread starts up and is never freed. +// It's not a leak because the only way to allocate it again is to exit +// the thread and that only occurs when the entire program exits. __thread uint64_t* l2v2_wholeMatrix; typedef struct { diff --git a/algo/veltor.c b/algo/veltor.c index 7393acd..47de33a 100644 --- a/algo/veltor.c +++ b/algo/veltor.c @@ -18,6 +18,7 @@ typedef struct { } veltor_ctx_holder; veltor_ctx_holder veltor_ctx; +static __thread sph_skein512_context veltor_skein_mid; void init_veltor_ctx() { @@ -27,6 +28,12 @@ void init_veltor_ctx() sph_shabal512_init( &veltor_ctx.shabal); } +void veltor_skein512_midstate( const void* input ) +{ + memcpy( &veltor_skein_mid, &veltor_ctx.skein, sizeof veltor_skein_mid ); + sph_skein512( &veltor_skein_mid, input, 64 ); +} + void veltorhash(void *output, const void *input) { uint32_t _ALIGN(64) hashA[16], hashB[16]; @@ -34,7 +41,13 @@ void veltorhash(void *output, const void *input) veltor_ctx_holder ctx; memcpy( &ctx, &veltor_ctx, sizeof(veltor_ctx) ); - sph_skein512(&ctx.skein, input, 80); + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 + + memcpy( &ctx.skein, &veltor_skein_mid, sizeof veltor_skein_mid ); + sph_skein512( &ctx.skein, input + midlen, tail ); + +// sph_skein512(&ctx.skein, input, 80); sph_skein512_close(&ctx.skein, hashA); sph_shavite512(&ctx.shavite, hashA, 64); @@ -68,6 +81,9 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t for (int i=0; i < 19; i++) { be32enc(&endiandata[i], pdata[i]); } + + veltor_skein512_midstate( endiandata ); + do { be32enc(&endiandata[19], nonce); veltorhash(hash, endiandata); diff --git a/algo/xevan.c b/algo/xevan.c index c67a2bb..d51211c 100644 --- a/algo/xevan.c +++ b/algo/xevan.c @@ -56,6 +56,7 @@ typedef struct { } xevan_ctx_holder; xevan_ctx_holder xevan_ctx; +static __thread sph_blake512_context xevan_blake_mid; void init_xevan_ctx() { @@ -83,15 +84,26 @@ void init_xevan_ctx() #endif }; +void xevan_blake512_midstate( const void* input ) +{ + memcpy( &xevan_blake_mid, &xevan_ctx.blake, sizeof xevan_blake_mid ); + sph_blake512( &xevan_blake_mid, input, 64 ); +} + void xevan_hash(void *output, const void *input) { uint32_t _ALIGN(64) hash[32]; // 128 bytes required const int dataLen = 128; - xevan_ctx_holder ctx; memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) ); - sph_blake512(&ctx.blake, input, 80); + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 + + memcpy( &ctx.blake, &xevan_blake_mid, sizeof xevan_blake_mid ); + sph_blake512( &ctx.blake, input + midlen, tail ); + +// sph_blake512(&ctx.blake, input, 80); sph_blake512_close(&ctx.blake, hash); memset(&hash[16], 0, 64); @@ -239,6 +251,8 @@ int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t * for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); + xevan_blake512_midstate( endiandata ); + do { be32enc(&endiandata[19], nonce); xevan_hash(hash, endiandata); diff --git a/configure.ac b/configure.ac index 623d97a..4b0df71 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.5.7]) +AC_INIT([cpuminer-opt], [3.5.8]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM