v3.5.7

2025-09-17 23:44:27 +00:00 · 2017-02-14 13:15:21 -05:00
parent cad2cd6339
commit 9ee5965e3f
6 changed files with 136 additions and 66 deletions
--- a/README.txt
+++ b/README.txt
@@ -0,0 +1,22 @@
 This file is included in the Windows binary package. Compile instructions
 for Linux and Windows can be found in RELEASE_NOTES.
 Choose the exe that best matches your CPU's features or use trial and
 error to find the fastest one that doesn't crash. Pay attention to
 the features listed at cpuminer startup to ensure you are mining at
 optimum speed using all the available features.
 Architecture names and compile options used are only provided for Intel
 Core series. Pentium and Celeron often have fewer features.
 AMD is YMMV, see previous paragraph.
 Exe name                  Compile opts       Arch name
 cpuminer-sse2.exe         -march=core2,      Core2   
 cpuminer-sse42.exe        -march=corei7,     Nehalem
 cpuminer-aes-sse42.exe    -maes -msse4.2     Westmere
 cpuminer-aes-avx.exe      -march=corei7-avx, Sandybridge, Ivybridge
 cpuminer-aes-avx2.exe     -march=core-avx2,  Haswell, Broadwell, Skylake, Kabylake
--- a/14
+++ b/14
@@ -3,15 +3,19 @@ Compile instruction for Linux and Windows are at the bottom of this file.
 Change Log
 ----------
 v3.5.7
 Cryptonight 5% faster
 v3.5.6
-Updated Lyra2z for new zcoin algo post block 2050.
+Updated Lyra2z algo for new zcoin algo post block 2050.
-Cleanup up Lyra2 code and increased performance
+Cleaned up Lyra2 code and increased performance
  - Lyra2Z (zcoin) +12%
  - Lyra2REv2 +11%
  - Lyra2RE +6%
-  - Lyra2Z (zcoin) +12%
+Fixed x11evo algo performance on Windows.
-Fixed performance of x11evo on Windows to match Linux.
+Timetravel algo 3% to 5% faster
 Timetravel 3% to 5% faster
 Whirlpool algo 15% faster.
 Removed aclocal.m4 from .gitignore.
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -93,15 +93,15 @@ static inline void ExpandAESKey256(char *keybuf)
 #endif
 }
 // align to 64 byte cache line
 typedef struct 
 {
-    uint8_t long_state[MEMORY] __attribute((aligned(16)));
+    uint8_t long_state[MEMORY] __attribute((aligned(64)));
    union cn_slow_hash_state state;
-    uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(16)));
+    uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(64)));
-    uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16)));
+    uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
-    uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16)));
+    uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
-    uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(16)));
+    uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(64)));
 //    oaes_ctx* aes_ctx;
 } cryptonight_ctx;
 static __thread cryptonight_ctx ctx;
@@ -110,7 +110,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 {
 #ifndef NO_AES_NI
    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
-    uint8_t ExpandedKey[256];
+    uint8_t ExpandedKey[256] __attribute__((aligned(64)));
    size_t i, j;
    memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
@@ -125,27 +125,46 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
    //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
    //    aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey);
-    for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
+    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
    _mm_prefetch( expkey,       _MM_HINT_T0 );
    _mm_prefetch( expkey   + 4, _MM_HINT_T0 );
    _mm_prefetch( expkey   + 8, _MM_HINT_T0 );
    _mm_prefetch( xmminput,     _MM_HINT_T0 );
    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
    for ( i = 0; i < 64; i += 8 )
    {
-	for(j = 0; j < 10; j++)
+       _mm_prefetch( longoutput + i,      _MM_HINT_T0 );
-	{
+       _mm_prefetch( longoutput + i +  4, _MM_HINT_T0 );
-		xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
+       _mm_prefetch( longoutput + i +  8, _MM_HINT_T0 );
-		xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
+       _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
 		xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
 		xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
 		xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
 		xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
 		xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
 		xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
    }
-	_mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]);
+
-	_mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]);
+    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
-	_mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]);
+    {
-	_mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]);
+        // prefetch 4 loops ahead,
-	_mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]);
+        _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
-	_mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]);
+        _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
-	_mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]);
+
-	_mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
+	for (j = 0; j < 10; j++ )
 	{
 		xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
 		xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
 		xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
 		xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
 		xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
 		xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
 		xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
 		xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
 	}
 	_mm_store_si128( &( longoutput[i  ] ), xmminput[0] );
 	_mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
 	_mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
 	_mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
 	_mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
 	_mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
 	_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
 	_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
    }
 //     cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
@@ -171,13 +190,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
    for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
    {	  
 	__m128i c_x = _mm_load_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
 	__m128i a_x = _mm_load_si128((__m128i *)a);
        uint64_t c[2];
-	c_x = _mm_aesenc_si128(c_x, a_x);
+        _mm_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], _MM_HINT_T0 );
 	__m128i c_x = _mm_load_si128( 
                              (__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
 	__m128i a_x = _mm_load_si128((__m128i *)a);
 	c_x = _mm_aesenc_si128(c_x, a_x);
 	_mm_store_si128((__m128i *)c, c_x);
 	__builtin_prefetch(&ctx.long_state[c[0] & 0x1FFFF0], 0, 1);
 	b_x = _mm_xor_si128(b_x, c_x);
 	_mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x);
@@ -202,8 +222,9 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	  a[1] += lo;
 	}
 	uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
 //        __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0];
-//        cast_m128i( dst ) = cast_m128i( a ); 
+//        *dst = cast_m128i( a ); 
 	dst[0] = a[0];
 	dst[1] = a[1];
@@ -211,41 +232,59 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	a[0] ^= b[0];
 	a[1] ^= b[1];
 	b_x = c_x;
-	__builtin_prefetch(&ctx.long_state[a[0] & 0x1FFFF0], 0, 3);
+	_mm_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], _MM_HINT_T0 );
    }
-    memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
+    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
-    memcpy(ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE);
+    memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
-    ExpandAESKey256(ExpandedKey);
+    ExpandAESKey256( ExpandedKey );
    //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
    //    aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);
-    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE) 
+    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
    _mm_prefetch( xmminput,     _MM_HINT_T0 );
    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
    for ( i = 0; i < 64; i += 16 )
    {
-         xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]);
+       _mm_prefetch( longoutput + i,      _MM_HINT_T0 );
-         xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]);
+       _mm_prefetch( longoutput + i +  4, _MM_HINT_T0 );
-         xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]);
+       _mm_prefetch( longoutput + i +  8, _MM_HINT_T0 );
-         xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]);
+       _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
-         xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]);
+    }
-         xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]);
+    _mm_prefetch( expkey,     _MM_HINT_T0 );
-         xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]);
+    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
-         xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]);
+    _mm_prefetch( expkey + 8, _MM_HINT_T0 );
-         for(j = 0; j < 10; j++)
+    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
    {
-            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
+        // stay 4 loops ahead,
-	    xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
+        _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
-	    xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
+        _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
-	    xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
+
-	    xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
+        xmminput[0] = _mm_xor_si128( longoutput[i  ], xmminput[0] );
-	    xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
+        xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
-	    xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
+        xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
-	    xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
+        xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
        xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
        xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
        xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
        xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
        for( j = 0; j < 10; j++ )
        {
            xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
 	    xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
 	    xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
 	    xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
 	    xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
 	    xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
 	    xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
 	    xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
        }
    }
-    memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
+    memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);
    keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
    extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
--- a/algo/cryptonight/cryptonight.h
+++ b/algo/cryptonight/cryptonight.h
@@ -6,11 +6,13 @@
 #include "miner.h"
 #define MEMORY         (1 << 21) /* 2 MiB */
 #define MEMORY_M128I   (MEMORY >> 4) // 2 MiB / 16 = 128 ki * __m128i
 #define ITER           (1 << 20)
 #define AES_BLOCK_SIZE  16
 #define AES_KEY_SIZE    32 /*16*/
 #define INIT_SIZE_BLK   8
 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)	// 128
 #define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4) // 8
 #pragma pack(push, 1)
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -133,6 +133,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
 // from here on it's all SIMD access to state and matrix
 // define vector pointers and adjust sizes and pointer offsets
   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.5.6])
+AC_INIT([cpuminer-opt], [3.5.7])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM