Jay D Dee
2017-02-14 13:15:21 -05:00
parent cad2cd6339
commit 9ee5965e3f
6 changed files with 136 additions and 66 deletions

README.txt  Normal file  (22 lines added)
View File

@@ -0,0 +1,22 @@
This file is included in the Windows binary package. Compile instructions
for Linux and Windows can be found in RELEASE_NOTES.

Choose the exe that best matches your CPU's features, or use trial and
error to find the fastest one that doesn't crash. Pay attention to the
features listed at cpuminer startup to ensure you are mining at optimum
speed using all the available features.

Architecture names and compile options are only provided for the Intel
Core series. Pentium and Celeron CPUs often have fewer features.

AMD CPUs are YMMV; see the previous paragraph.
Exe name                Compile opts       Arch name

cpuminer-sse2.exe       -march=core2       Core2
cpuminer-sse42.exe      -march=corei7      Nehalem
cpuminer-aes-sse42.exe  -maes -msse4.2     Westmere
cpuminer-aes-avx.exe    -march=corei7-avx  Sandybridge, Ivybridge
cpuminer-aes-avx2.exe   -march=core-avx2   Haswell, Broadwell, Skylake, Kabylake
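
As an illustration (not part of the package), the table's feature-to-exe
mapping can be checked programmatically. This sketch uses GCC's
__builtin_cpu_supports to suggest an exe for the host CPU; only the exe
names and features from the table above are assumed.

/* Illustrative sketch only: suggest an exe from the table above using
   GCC's runtime CPU feature detection. Compile: gcc -o pickexe pickexe.c */
#include <stdio.h>

int main(void)
{
    const char *exe = "cpuminer-sse2";            /* Core2 baseline */
    if ( __builtin_cpu_supports( "sse4.2" ) )
        exe = "cpuminer-sse42";                   /* Nehalem */
    if ( __builtin_cpu_supports( "aes" )
      && __builtin_cpu_supports( "sse4.2" ) )
        exe = "cpuminer-aes-sse42";               /* Westmere */
    if ( __builtin_cpu_supports( "aes" )
      && __builtin_cpu_supports( "avx" ) )
        exe = "cpuminer-aes-avx";                 /* Sandybridge, Ivybridge */
    if ( __builtin_cpu_supports( "aes" )
      && __builtin_cpu_supports( "avx2" ) )
        exe = "cpuminer-aes-avx2";                /* Haswell and later */
    printf( "%s.exe\n", exe );
    return 0;
}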

RELEASE_NOTES
View File

@@ -3,15 +3,19 @@ Compile instruction for Linux and Windows are at the bottom of this file.
 Change Log
 ----------

+v3.5.7
+
+Cryptonight 5% faster
+
 v3.5.6

-Updated Lyra2z for new zcoin algo post block 2050.
-Cleanup up Lyra2 code and increased performance
-- Lyra2Z (zcoin) +12%
+Updated Lyra2z algo for new zcoin algo post block 2050.
+Cleaned up Lyra2 code and increased performance
 - Lyra2REv2 +11%
 - Lyra2RE +6%
+- Lyra2Z (zcoin) +12%
-Fixed performance of x11evo on Windows to match Linux.
-Timetravel 3% to 5% faster
+Fixed x11evo algo performance on Windows.
+Timetravel algo 3% to 5% faster
 Whirlpool algo 15% faster.
 Removed aclocal.m4 from .gitignore.

View File

@@ -93,15 +93,15 @@ static inline void ExpandAESKey256(char *keybuf)
 #endif
 }

+// align to 64 byte cache line
 typedef struct
 {
-    uint8_t long_state[MEMORY] __attribute((aligned(16)));
+    uint8_t long_state[MEMORY] __attribute((aligned(64)));
     union cn_slow_hash_state state;
-    uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(16)));
-    uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16)));
-    uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16)));
-    uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(16)));
-//  oaes_ctx* aes_ctx;
+    uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(64)));
+    uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
+    uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
+    uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(64)));
 } cryptonight_ctx;

 static __thread cryptonight_ctx ctx;
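
For context on the 16 -> 64 change: 64 bytes is the cache line size on
these CPUs, so aligning the scratchpad and working buffers to 64 keeps
every 16-byte vector access inside a single line. A minimal, hypothetical
check of that property (demo_ctx is illustrative, not from the commit):

/* Hypothetical sketch: a 64-byte aligned buffer starts on a cache-line
   boundary, i.e. the low 6 bits of its address are clear. */
#include <assert.h>
#include <stdint.h>

typedef struct
{
    uint8_t long_state[1 << 21] __attribute__((aligned(64)));
} demo_ctx;

static __thread demo_ctx dctx;

void check_alignment(void)
{
    assert( ( (uintptr_t)dctx.long_state & 63 ) == 0 );
}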
@@ -110,7 +110,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 {
 #ifndef NO_AES_NI
     keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
-    uint8_t ExpandedKey[256];
+    uint8_t ExpandedKey[256] __attribute__((aligned(64)));
     size_t i, j;

     memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
@@ -118,34 +118,53 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
     ExpandAESKey256(ExpandedKey);

     __m128i *longoutput, *expkey, *xmminput;
-    longoutput = (__m128i *)ctx.long_state;
-    expkey = (__m128i *)ExpandedKey;
-    xmminput = (__m128i *)ctx.text;
+    longoutput = (__m128i *)ctx.long_state;
+    expkey = (__m128i *)ExpandedKey;
+    xmminput = (__m128i *)ctx.text;

 //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
 //    aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey);

-    for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
+    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
+    _mm_prefetch( expkey, _MM_HINT_T0 );
+    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
+    _mm_prefetch( expkey + 8, _MM_HINT_T0 );
+    _mm_prefetch( xmminput, _MM_HINT_T0 );
+    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
+    for ( i = 0; i < 64; i += 8 )
+    {
+        _mm_prefetch( longoutput + i, _MM_HINT_T0 );
+        _mm_prefetch( longoutput + i + 4, _MM_HINT_T0 );
+        _mm_prefetch( longoutput + i + 8, _MM_HINT_T0 );
+        _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
+    }
+    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
     {
-        for(j = 0; j < 10; j++)
+        // prefetch 4 loops ahead,
+        _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
+        _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
+        for (j = 0; j < 10; j++ )
         {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
+            xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
+            xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
+            xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
+            xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
+            xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
+            xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
+            xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
+            xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
         }
-        _mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]);
-        _mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
+        _mm_store_si128( &( longoutput[i  ] ), xmminput[0] );
+        _mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
+        _mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
+        _mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
+        _mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
+        _mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
+        _mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
+        _mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
     }
+// cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
@@ -171,13 +190,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
     for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
     {
-        __m128i c_x = _mm_load_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
-        __m128i a_x = _mm_load_si128((__m128i *)a);
-        uint64_t c[2];
-        c_x = _mm_aesenc_si128(c_x, a_x);
+        uint64_t c[2];
+        _mm_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], _MM_HINT_T0 );
+        __m128i c_x = _mm_load_si128(
+                        (__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
+        __m128i a_x = _mm_load_si128((__m128i *)a);
+        c_x = _mm_aesenc_si128(c_x, a_x);

         _mm_store_si128((__m128i *)c, c_x);
-        __builtin_prefetch(&ctx.long_state[c[0] & 0x1FFFF0], 0, 1);

         b_x = _mm_xor_si128(b_x, c_x);
         _mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x);
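
The hunk above swaps __builtin_prefetch for _mm_prefetch. For reference,
GCC lowers the builtin's locality argument to the same prefetch
instructions the intrinsic names; a rough side-by-side, illustrative only:

/* Illustrative only: approximate equivalence of the two prefetch forms.
   GCC maps locality 3 -> prefetcht0, 1 -> prefetcht2, 0 -> prefetchnta. */
#include <xmmintrin.h>

void prefetch_forms( const char *p )
{
    __builtin_prefetch( p, 0, 3 );     // read, high temporal locality
    _mm_prefetch( p, _MM_HINT_T0 );    // same instruction: prefetcht0

    __builtin_prefetch( p, 0, 1 );     // read, low temporal locality
    _mm_prefetch( p, _MM_HINT_T2 );    // same instruction: prefetcht2
}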
@@ -202,8 +222,9 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
         a[1] += lo;
     }

     uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
 //  __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0];
-//  cast_m128i( dst ) = cast_m128i( a );
+//  *dst = cast_m128i( a );
     dst[0] = a[0];
     dst[1] = a[1];
@@ -211,41 +232,59 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
         a[0] ^= b[0];
         a[1] ^= b[1];
         b_x = c_x;
-        __builtin_prefetch(&ctx.long_state[a[0] & 0x1FFFF0], 0, 3);
+        _mm_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], _MM_HINT_T0 );
     }

-    memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
+    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
+    memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
+    ExpandAESKey256( ExpandedKey );

 //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
 //    aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);

-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE)
-    {
-        xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]);
-        xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]);
-        xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]);
-        xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]);
-        xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]);
-        xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]);
-        xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]);
-        xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]);
-        for(j = 0; j < 10; j++)
-        {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
-            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
-            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
-            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
-            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
-            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
-            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
-            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
-        }
-    }
+    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
+    _mm_prefetch( xmminput, _MM_HINT_T0 );
+    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
+    for ( i = 0; i < 64; i += 16 )
+    {
+        _mm_prefetch( longoutput + i, _MM_HINT_T0 );
+        _mm_prefetch( longoutput + i + 4, _MM_HINT_T0 );
+        _mm_prefetch( longoutput + i + 8, _MM_HINT_T0 );
+        _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
+    }
+    _mm_prefetch( expkey, _MM_HINT_T0 );
+    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
+    _mm_prefetch( expkey + 8, _MM_HINT_T0 );
+    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    {
+        // stay 4 loops ahead,
+        _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
+        _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
+        xmminput[0] = _mm_xor_si128( longoutput[i  ], xmminput[0] );
+        xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
+        xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
+        xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
+        xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
+        xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
+        xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
+        xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
+        for( j = 0; j < 10; j++ )
+        {
+            xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
+            xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
+            xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
+            xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
+            xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
+            xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
+            xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
+            xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
+        }
+    }

-    memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
+    memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);

     keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
     extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
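
Both rewritten loops above follow the same pattern: issue prefetches for
data several iterations ahead while computing on the current block. A
stripped-down, hypothetical version of the pattern (names are
illustrative, not from the commit):

/* Hypothetical sketch of the prefetch-ahead pattern. Each _mm_prefetch
   pulls one 64-byte line (4 __m128i) toward L1; requesting lines 64
   vectors ahead hides memory latency behind the AES rounds. Prefetching
   past the end of the buffer is a harmless no-op hint. */
#include <immintrin.h>
#include <stddef.h>

void process_with_prefetch( __m128i *buf, size_t n_vec )
{
    const size_t stride = 8;    // vectors consumed per iteration
    const size_t ahead  = 64;   // vectors (8 iterations) ahead
    for ( size_t i = 0; i < n_vec; i += stride )
    {
        _mm_prefetch( (const char*)( buf + i + ahead ),     _MM_HINT_T0 );
        _mm_prefetch( (const char*)( buf + i + ahead + 4 ), _MM_HINT_T0 );
        /* ... compute on buf[i] .. buf[i+stride-1] here ... */
    }
}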

View File

@@ -6,11 +6,13 @@
 #include "miner.h"

 #define MEMORY          (1 << 21)   /* 2 MiB */
+#define MEMORY_M128I    (MEMORY >> 4)   // 2 MiB / 16 = 128 Ki __m128i
 #define ITER            (1 << 20)
 #define AES_BLOCK_SIZE  16
 #define AES_KEY_SIZE    32 /*16*/
 #define INIT_SIZE_BLK   8
 #define INIT_SIZE_BYTE  (INIT_SIZE_BLK * AES_BLOCK_SIZE)   // 128
+#define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4)   // 8

 #pragma pack(push, 1)
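
A quick check of the new constants under the definitions above
(illustrative, not part of the header): MEMORY_M128I counts the 2 MiB
scratchpad in 16-byte vectors, INIT_SIZE_M128I is the 128-byte init
block in vectors, so the rewritten loops make
MEMORY_M128I / INIT_SIZE_M128I = 16384 passes.

/* Illustrative only: the constant arithmetic, checked at compile time. */
#define MEMORY          (1 << 21)                         /* 2097152 bytes */
#define MEMORY_M128I    (MEMORY >> 4)                     /* 131072 vectors */
#define INIT_SIZE_BLK   8
#define AES_BLOCK_SIZE  16
#define INIT_SIZE_BYTE  (INIT_SIZE_BLK * AES_BLOCK_SIZE)  /* 128 bytes */
#define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4)             /* 8 vectors */

_Static_assert( MEMORY_M128I == 131072, "2 MiB / 16" );
_Static_assert( INIT_SIZE_M128I == 8, "128 B / 16" );
_Static_assert( MEMORY_M128I / INIT_SIZE_M128I == 16384, "loop passes" );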

View File

@@ -133,6 +133,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
     ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
     *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block

+// from here on it's all simd access to state and matrix
+// define vector pointers and adjust sizes and pointer offsets

     //================= Initializing the Sponge State ====================//
     //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
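
The new comment marks where the Lyra2 code switches to SIMD access. A
hypothetical illustration of what "define vector pointers" means in
practice; the function and names below are not from the source, and
16-byte aligned buffers are assumed:

/* Hypothetical sketch: recasting the sponge state to vector pointers.
   16 uint64_t words = 8 __m128i; XOR a block into the bitrate portion. */
#include <emmintrin.h>
#include <stdint.h>

void absorb_block_simd( uint64_t state[16], const uint64_t block[8] )
{
    __m128i *s = (__m128i*)state;             // assumes 16-byte alignment
    const __m128i *b = (const __m128i*)block;
    for ( int i = 0; i < 4; i++ )             // 8 words = 4 vectors
        s[i] = _mm_xor_si128( s[i], b[i] );
}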

configure.ac
View File

@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.5.6])
+AC_INIT([cpuminer-opt], [3.5.7])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM