mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.5.7
This commit is contained in:
22
README.txt
Normal file
22
README.txt
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
This file is included in the Windows binary package. Compile instructions
|
||||||
|
for Linux and Windows can be found in RELEASE_NOTES.
|
||||||
|
|
||||||
|
Choose the exe that best matches your CPU's features or use trial and
|
||||||
|
error to find the fastest one that doesn't crash. Pay attention to
|
||||||
|
the features listed at cpuminer startup to ensure you are mining at
|
||||||
|
optimum speed using all the available features.
|
||||||
|
|
||||||
|
Architecture names and compile options used are only provided for Intel
|
||||||
|
Core series. Pentium and Celeron often have fewer features.
|
||||||
|
For AMD CPUs, your mileage may vary; see the previous paragraph.
|
||||||
|
|
||||||
|
Exe name Compile opts Arch name
|
||||||
|
|
||||||
|
cpuminer-sse2.exe -march=core2, Core2
|
||||||
|
cpuminer-sse42.exe -march=corei7, Nehalem
|
||||||
|
cpuminer-aes-sse42.exe -maes -msse4.2 Westmere
|
||||||
|
cpuminer-aes-avx.exe -march=corei7-avx, Sandybridge, Ivybridge
|
||||||
|
cpuminer-aes-avx2.exe -march=core-avx2, Haswell, Broadwell, Skylake, Kabylake
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -3,15 +3,19 @@ Compile instruction for Linux and Windows are at the bottom of this file.
|
|||||||
Change Log
|
Change Log
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
v3.5.7
|
||||||
|
|
||||||
|
Cryptonight 5% faster
|
||||||
|
|
||||||
v3.5.6
|
v3.5.6
|
||||||
|
|
||||||
Updated Lyra2z for new zcoin algo post block 2050.
|
Updated Lyra2z algo for new zcoin algo post block 2050.
|
||||||
Cleanup up Lyra2 code and increased performance
|
Cleaned up Lyra2 code and increased performance
|
||||||
|
- Lyra2Z (zcoin) +12%
|
||||||
- Lyra2REv2 +11%
|
- Lyra2REv2 +11%
|
||||||
- Lyra2RE +6%
|
- Lyra2RE +6%
|
||||||
- Lyra2Z (zcoin) +12%
|
Fixed x11evo algo performance on Windows.
|
||||||
Fixed performance of x11evo on Windows to match Linux.
|
Timetravel algo 3% to 5% faster
|
||||||
Timetravel 3% to 5% faster
|
|
||||||
Whirlpool algo 15% faster.
|
Whirlpool algo 15% faster.
|
||||||
Removed aclocal.m4 from .gitignore.
|
Removed aclocal.m4 from .gitignore.
|
||||||
|
|
||||||
|
|||||||
@@ -93,15 +93,15 @@ static inline void ExpandAESKey256(char *keybuf)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// align to 64 byte cache line
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
uint8_t long_state[MEMORY] __attribute((aligned(16)));
|
uint8_t long_state[MEMORY] __attribute((aligned(64)));
|
||||||
union cn_slow_hash_state state;
|
union cn_slow_hash_state state;
|
||||||
uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(16)));
|
uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(64)));
|
||||||
uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16)));
|
uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
|
||||||
uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16)));
|
uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
|
||||||
uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(16)));
|
uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(64)));
|
||||||
// oaes_ctx* aes_ctx;
|
|
||||||
} cryptonight_ctx;
|
} cryptonight_ctx;
|
||||||
|
|
||||||
static __thread cryptonight_ctx ctx;
|
static __thread cryptonight_ctx ctx;
|
||||||
@@ -110,7 +110,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
|||||||
{
|
{
|
||||||
#ifndef NO_AES_NI
|
#ifndef NO_AES_NI
|
||||||
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
|
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
|
||||||
uint8_t ExpandedKey[256];
|
uint8_t ExpandedKey[256] __attribute__((aligned(64)));
|
||||||
size_t i, j;
|
size_t i, j;
|
||||||
|
|
||||||
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
|
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
|
||||||
@@ -125,27 +125,46 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
|||||||
//for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
|
//for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
|
||||||
// aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey);
|
// aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey);
|
||||||
|
|
||||||
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
|
// prefetch expkey, all of xmminput and enough longoutput for 4 loops
|
||||||
|
_mm_prefetch( expkey, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( xmminput, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
|
||||||
|
|
||||||
|
for ( i = 0; i < 64; i += 8 )
|
||||||
{
|
{
|
||||||
for(j = 0; j < 10; j++)
|
_mm_prefetch( longoutput + i, _MM_HINT_T0 );
|
||||||
{
|
_mm_prefetch( longoutput + i + 4, _MM_HINT_T0 );
|
||||||
xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
|
_mm_prefetch( longoutput + i + 8, _MM_HINT_T0 );
|
||||||
xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
|
_mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
|
||||||
xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
|
|
||||||
xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
|
|
||||||
xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
|
|
||||||
xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
|
|
||||||
xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
|
|
||||||
xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
|
|
||||||
}
|
}
|
||||||
_mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]);
|
|
||||||
_mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]);
|
for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
|
||||||
_mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]);
|
{
|
||||||
_mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]);
|
// prefetch 4 loops ahead,
|
||||||
_mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]);
|
_mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
|
||||||
_mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]);
|
_mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
|
||||||
_mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]);
|
|
||||||
_mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
|
for (j = 0; j < 10; j++ )
|
||||||
|
{
|
||||||
|
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
|
||||||
|
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
|
||||||
|
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
|
||||||
|
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
|
||||||
|
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
|
||||||
|
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
|
||||||
|
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
|
||||||
|
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
|
||||||
|
}
|
||||||
|
_mm_store_si128( &( longoutput[i ] ), xmminput[0] );
|
||||||
|
_mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
|
||||||
|
_mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
|
||||||
|
_mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
|
||||||
|
_mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
|
||||||
|
_mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
|
||||||
|
_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
|
||||||
|
_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
|
||||||
}
|
}
|
||||||
|
|
||||||
// cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
|
// cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
|
||||||
@@ -171,13 +190,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
|||||||
|
|
||||||
for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
|
for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
|
||||||
{
|
{
|
||||||
__m128i c_x = _mm_load_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
|
|
||||||
__m128i a_x = _mm_load_si128((__m128i *)a);
|
|
||||||
uint64_t c[2];
|
uint64_t c[2];
|
||||||
c_x = _mm_aesenc_si128(c_x, a_x);
|
_mm_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], _MM_HINT_T0 );
|
||||||
|
|
||||||
|
__m128i c_x = _mm_load_si128(
|
||||||
|
(__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
|
||||||
|
__m128i a_x = _mm_load_si128((__m128i *)a);
|
||||||
|
c_x = _mm_aesenc_si128(c_x, a_x);
|
||||||
_mm_store_si128((__m128i *)c, c_x);
|
_mm_store_si128((__m128i *)c, c_x);
|
||||||
__builtin_prefetch(&ctx.long_state[c[0] & 0x1FFFF0], 0, 1);
|
|
||||||
|
|
||||||
b_x = _mm_xor_si128(b_x, c_x);
|
b_x = _mm_xor_si128(b_x, c_x);
|
||||||
_mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x);
|
_mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x);
|
||||||
@@ -202,8 +222,9 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
|||||||
a[1] += lo;
|
a[1] += lo;
|
||||||
}
|
}
|
||||||
uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
|
uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
|
||||||
|
// __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0];
|
||||||
|
|
||||||
// cast_m128i( dst ) = cast_m128i( a );
|
// *dst = cast_m128i( a );
|
||||||
dst[0] = a[0];
|
dst[0] = a[0];
|
||||||
dst[1] = a[1];
|
dst[1] = a[1];
|
||||||
|
|
||||||
@@ -211,41 +232,59 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
|||||||
a[0] ^= b[0];
|
a[0] ^= b[0];
|
||||||
a[1] ^= b[1];
|
a[1] ^= b[1];
|
||||||
b_x = c_x;
|
b_x = c_x;
|
||||||
__builtin_prefetch(&ctx.long_state[a[0] & 0x1FFFF0], 0, 3);
|
_mm_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], _MM_HINT_T0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
|
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
|
||||||
memcpy(ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE);
|
memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
|
||||||
ExpandAESKey256(ExpandedKey);
|
ExpandAESKey256( ExpandedKey );
|
||||||
|
|
||||||
//for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
|
//for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
|
||||||
// aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);
|
// aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);
|
||||||
|
|
||||||
for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE)
|
// prefetch expkey, all of xmminput and enough longoutput for 4 loops
|
||||||
|
_mm_prefetch( xmminput, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
|
||||||
|
for ( i = 0; i < 64; i += 16 )
|
||||||
{
|
{
|
||||||
xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]);
|
_mm_prefetch( longoutput + i, _MM_HINT_T0 );
|
||||||
xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]);
|
_mm_prefetch( longoutput + i + 4, _MM_HINT_T0 );
|
||||||
xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]);
|
_mm_prefetch( longoutput + i + 8, _MM_HINT_T0 );
|
||||||
xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]);
|
_mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
|
||||||
xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]);
|
}
|
||||||
xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]);
|
_mm_prefetch( expkey, _MM_HINT_T0 );
|
||||||
xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]);
|
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
|
||||||
xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]);
|
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
|
||||||
|
|
||||||
for(j = 0; j < 10; j++)
|
for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
|
||||||
{
|
{
|
||||||
xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
|
// stay 4 loops ahead,
|
||||||
xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
|
_mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
|
||||||
xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
|
_mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
|
||||||
xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
|
|
||||||
xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
|
xmminput[0] = _mm_xor_si128( longoutput[i ], xmminput[0] );
|
||||||
xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
|
xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
|
||||||
xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
|
xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
|
||||||
xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
|
xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
|
||||||
|
xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
|
||||||
|
xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
|
||||||
|
xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
|
||||||
|
xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
|
||||||
|
|
||||||
|
for( j = 0; j < 10; j++ )
|
||||||
|
{
|
||||||
|
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
|
||||||
|
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
|
||||||
|
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
|
||||||
|
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
|
||||||
|
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
|
||||||
|
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
|
||||||
|
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
|
||||||
|
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
|
memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);
|
||||||
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
|
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
|
||||||
|
|
||||||
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
|
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
|
||||||
|
|||||||
@@ -6,11 +6,13 @@
|
|||||||
#include "miner.h"
|
#include "miner.h"
|
||||||
|
|
||||||
#define MEMORY (1 << 21) /* 2 MiB */
|
#define MEMORY (1 << 21) /* 2 MiB */
|
||||||
|
#define MEMORY_M128I (MEMORY >> 4) // 2 MiB / 16 = 128 ki * __m128i
|
||||||
#define ITER (1 << 20)
|
#define ITER (1 << 20)
|
||||||
#define AES_BLOCK_SIZE 16
|
#define AES_BLOCK_SIZE 16
|
||||||
#define AES_KEY_SIZE 32 /*16*/
|
#define AES_KEY_SIZE 32 /*16*/
|
||||||
#define INIT_SIZE_BLK 8
|
#define INIT_SIZE_BLK 8
|
||||||
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128
|
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128
|
||||||
|
#define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4) // 8
|
||||||
|
|
||||||
|
|
||||||
#pragma pack(push, 1)
|
#pragma pack(push, 1)
|
||||||
|
|||||||
@@ -133,6 +133,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
|||||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||||
|
|
||||||
|
// from here on it's all SIMD access to state and matrix
|
||||||
|
// define vector pointers and adjust sizes and pointer offsets
|
||||||
|
|
||||||
//================= Initializing the Sponge State ====================//
|
//================= Initializing the Sponge State ====================//
|
||||||
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
AC_INIT([cpuminer-opt], [3.5.6])
|
AC_INIT([cpuminer-opt], [3.5.7])
|
||||||
|
|
||||||
AC_PREREQ([2.59c])
|
AC_PREREQ([2.59c])
|
||||||
AC_CANONICAL_SYSTEM
|
AC_CANONICAL_SYSTEM
|
||||||
|
|||||||
Reference in New Issue
Block a user