mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.5.8
This commit is contained in:
@@ -2,6 +2,12 @@ Compile instruction for Linux and Windows are at the bottom of this file.
|
||||
|
||||
Change Log
|
||||
----------
|
||||
v3.5.8
|
||||
|
||||
Lyra2RE fixed on Windows, broken in v3.5.6.
|
||||
Ported AES cryptonight optimizations from v3.5.7 to non-AES version
|
||||
with little improvement.
|
||||
Marginal improvements to xevan and veltor.
|
||||
|
||||
v3.5.7
|
||||
|
||||
|
@@ -126,25 +126,24 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
// aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey);
|
||||
|
||||
// prefetch expkey, all of xmminput and enough longoutput for 4 loops
|
||||
_mm_prefetch( expkey, _MM_HINT_T0 );
|
||||
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
|
||||
_mm_prefetch( xmminput, _MM_HINT_T0 );
|
||||
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
|
||||
|
||||
for ( i = 0; i < 64; i += 8 )
|
||||
for ( i = 0; i < 64; i += 16 )
|
||||
{
|
||||
_mm_prefetch( longoutput + i, _MM_HINT_T0 );
|
||||
_mm_prefetch( longoutput + i + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( longoutput + i + 8, _MM_HINT_T0 );
|
||||
_mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
|
||||
}
|
||||
_mm_prefetch( expkey, _MM_HINT_T0 );
|
||||
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
|
||||
|
||||
for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
|
||||
{
|
||||
// prefetch 4 loops ahead,
|
||||
_mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
|
||||
_mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
|
||||
__builtin_prefetch( longoutput + i + 64, 1, 0 );
|
||||
__builtin_prefetch( longoutput + i + 68, 1, 0 );
|
||||
|
||||
for (j = 0; j < 10; j++ )
|
||||
{
|
||||
@@ -191,7 +190,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
|
||||
{
|
||||
uint64_t c[2];
|
||||
_mm_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], _MM_HINT_T0 );
|
||||
__builtin_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], 0, 1 );
|
||||
|
||||
__m128i c_x = _mm_load_si128(
|
||||
(__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
|
||||
@@ -232,7 +231,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
a[0] ^= b[0];
|
||||
a[1] ^= b[1];
|
||||
b_x = c_x;
|
||||
_mm_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], _MM_HINT_T0 );
|
||||
__builtin_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], 0, 3 );
|
||||
}
|
||||
|
||||
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
|
||||
@@ -243,6 +242,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
// aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);
|
||||
|
||||
// prefetch expkey, all of xmminput and enough longoutput for 4 loops
|
||||
|
||||
_mm_prefetch( xmminput, _MM_HINT_T0 );
|
||||
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
|
||||
for ( i = 0; i < 64; i += 16 )
|
||||
|
@@ -5,6 +5,7 @@
|
||||
// Modified for CPUminer by Lucas Jones
|
||||
|
||||
#include "miner.h"
|
||||
#include <memory.h>
|
||||
|
||||
#if defined(__arm__) || defined(_MSC_VER)
|
||||
#ifndef NOASM
|
||||
@@ -175,11 +176,27 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
|
||||
{
|
||||
hash_process(&ctx.state.hs, (const uint8_t*) input, len);
|
||||
ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
|
||||
|
||||
__builtin_prefetch( ctx.text, 0, 3 );
|
||||
__builtin_prefetch( ctx.text + 64, 0, 3 );
|
||||
__builtin_prefetch( ctx.long_state, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 64, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 128, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 192, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 256, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 320, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 384, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 448, 1, 0 );
|
||||
|
||||
size_t i, j;
|
||||
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
|
||||
|
||||
oaes_key_import_data(ctx.aes_ctx, ctx.state.hs.b, AES_KEY_SIZE);
|
||||
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||
|
||||
__builtin_prefetch( ctx.long_state + i + 512, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + i + 576, 1, 0 );
|
||||
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 0], ctx.aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 1], ctx.aes_ctx->key->exp_data);
|
||||
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 2], ctx.aes_ctx->key->exp_data);
|
||||
@@ -213,9 +230,24 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
|
||||
mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)]);
|
||||
}
|
||||
|
||||
__builtin_prefetch( ctx.text, 0, 3 );
|
||||
__builtin_prefetch( ctx.text + 64, 0, 3 );
|
||||
__builtin_prefetch( ctx.long_state, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 64, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 128, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 192, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 256, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 320, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 384, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + 448, 1, 0 );
|
||||
|
||||
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
|
||||
oaes_key_import_data(ctx.aes_ctx, &ctx.state.hs.b[32], AES_KEY_SIZE);
|
||||
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||
|
||||
__builtin_prefetch( ctx.long_state + i + 512, 1, 0 );
|
||||
__builtin_prefetch( ctx.long_state + i + 576, 1, 0 );
|
||||
|
||||
xor_blocks(&ctx.text[0 * AES_BLOCK_SIZE], &ctx.long_state[i + 0 * AES_BLOCK_SIZE]);
|
||||
aesb_pseudo_round_mut(&ctx.text[0 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
xor_blocks(&ctx.text[1 * AES_BLOCK_SIZE], &ctx.long_state[i + 1 * AES_BLOCK_SIZE]);
|
||||
|
@@ -71,20 +71,6 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||
// for Lyra2REv2, nCols = 4, v1 was using 8
|
||||
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
|
||||
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
|
||||
/*
|
||||
i = (int64_t)ROW_LEN_BYTES * nRows;
|
||||
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
|
||||
if (wholeMatrix == NULL)
|
||||
return -1;
|
||||
|
||||
#if defined (__AVX2__)
|
||||
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
|
||||
#elif defined(__AVX__)
|
||||
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
|
||||
#else
|
||||
memset(wholeMatrix, 0, i);
|
||||
#endif
|
||||
*/
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
//=== Getting the password + salt + basil padded with 10*1 ==========//
|
||||
@@ -219,13 +205,12 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||
//Squeezes the key
|
||||
squeeze(state, K, (unsigned int) kLen);
|
||||
|
||||
//================== Freeing the memory =============================//
|
||||
// free(wholeMatrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols )
|
||||
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||
uint64_t pwdlen, const void *salt, uint64_t saltlen,
|
||||
uint64_t timeCost, uint64_t nRows, uint64_t nCols )
|
||||
{
|
||||
//========================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[16];
|
||||
@@ -244,27 +229,14 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
/*
|
||||
i = (int64_t)ROW_LEN_BYTES * nRows;
|
||||
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
|
||||
|
||||
if (wholeMatrix == NULL)
|
||||
return -1;
|
||||
|
||||
#if defined (__AVX2__)
|
||||
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
|
||||
#elif defined(__AVX__)
|
||||
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
|
||||
#else
|
||||
memset(wholeMatrix, 0, i);
|
||||
#endif
|
||||
*/
|
||||
//==== Getting the password + salt + basil padded with 10*1 ============//
|
||||
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
|
||||
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
|
||||
sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
byte *ptrByte = (byte*) wholeMatrix;
|
||||
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
|
||||
|
||||
@@ -366,17 +338,15 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, uint
|
||||
|
||||
//========================= Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||
|
||||
//Squeezes the key
|
||||
squeeze( state, K, kLen );
|
||||
|
||||
//====================== Freeing the memory =============================//
|
||||
// _mm_free(state);
|
||||
// _mm_free( wholeMatrix );
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Lyra2RE doesn't like the new wholeMatrix implementation
|
||||
int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
|
||||
uint64_t pwdlen, const void *salt, uint64_t saltlen,
|
||||
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
|
||||
@@ -548,7 +518,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
|
||||
squeeze(state, K, (unsigned int) kLen);
|
||||
|
||||
//================== Freeing the memory =============================//
|
||||
free(wholeMatrix);
|
||||
_mm_free(wholeMatrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@@ -69,7 +69,8 @@ void lyra2re_hash(void *state, const void *input)
|
||||
sph_keccak256(&ctx.keccak, hashA, 32);
|
||||
sph_keccak256_close(&ctx.keccak, hashB);
|
||||
|
||||
LYRA2RE( hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
|
||||
LYRA2RE( hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
|
||||
// LYRA2RE( lyra2re_wholeMatrix, hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
|
||||
|
||||
sph_skein256(&ctx.skein, hashA, 32);
|
||||
sph_skein256_close(&ctx.skein, hashB);
|
||||
|
@@ -12,6 +12,9 @@
|
||||
#include "lyra2.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
// This gets allocated when miner_thread starts up and is never freed.
|
||||
// It's not a leak because the only way to allocate it again is to exit
|
||||
// the thread and that only occurs when the entire program exits.
|
||||
__thread uint64_t* l2v2_wholeMatrix;
|
||||
|
||||
typedef struct {
|
||||
|
@@ -18,6 +18,7 @@ typedef struct {
|
||||
} veltor_ctx_holder;
|
||||
|
||||
veltor_ctx_holder veltor_ctx;
|
||||
static __thread sph_skein512_context veltor_skein_mid;
|
||||
|
||||
void init_veltor_ctx()
|
||||
{
|
||||
@@ -27,6 +28,12 @@ void init_veltor_ctx()
|
||||
sph_shabal512_init( &veltor_ctx.shabal);
|
||||
}
|
||||
|
||||
void veltor_skein512_midstate( const void* input )
|
||||
{
|
||||
memcpy( &veltor_skein_mid, &veltor_ctx.skein, sizeof veltor_skein_mid );
|
||||
sph_skein512( &veltor_skein_mid, input, 64 );
|
||||
}
|
||||
|
||||
void veltorhash(void *output, const void *input)
|
||||
{
|
||||
uint32_t _ALIGN(64) hashA[16], hashB[16];
|
||||
@@ -34,7 +41,13 @@ void veltorhash(void *output, const void *input)
|
||||
veltor_ctx_holder ctx;
|
||||
memcpy( &ctx, &veltor_ctx, sizeof(veltor_ctx) );
|
||||
|
||||
sph_skein512(&ctx.skein, input, 80);
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
|
||||
memcpy( &ctx.skein, &veltor_skein_mid, sizeof veltor_skein_mid );
|
||||
sph_skein512( &ctx.skein, input + midlen, tail );
|
||||
|
||||
// sph_skein512(&ctx.skein, input, 80);
|
||||
sph_skein512_close(&ctx.skein, hashA);
|
||||
|
||||
sph_shavite512(&ctx.shavite, hashA, 64);
|
||||
@@ -68,6 +81,9 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
|
||||
for (int i=0; i < 19; i++) {
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
|
||||
veltor_skein512_midstate( endiandata );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
veltorhash(hash, endiandata);
|
||||
|
18
algo/xevan.c
18
algo/xevan.c
@@ -56,6 +56,7 @@ typedef struct {
|
||||
} xevan_ctx_holder;
|
||||
|
||||
xevan_ctx_holder xevan_ctx;
|
||||
static __thread sph_blake512_context xevan_blake_mid;
|
||||
|
||||
void init_xevan_ctx()
|
||||
{
|
||||
@@ -83,15 +84,26 @@ void init_xevan_ctx()
|
||||
#endif
|
||||
};
|
||||
|
||||
void xevan_blake512_midstate( const void* input )
|
||||
{
|
||||
memcpy( &xevan_blake_mid, &xevan_ctx.blake, sizeof xevan_blake_mid );
|
||||
sph_blake512( &xevan_blake_mid, input, 64 );
|
||||
}
|
||||
|
||||
void xevan_hash(void *output, const void *input)
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[32]; // 128 bytes required
|
||||
const int dataLen = 128;
|
||||
|
||||
xevan_ctx_holder ctx;
|
||||
memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) );
|
||||
|
||||
sph_blake512(&ctx.blake, input, 80);
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
|
||||
memcpy( &ctx.blake, &xevan_blake_mid, sizeof xevan_blake_mid );
|
||||
sph_blake512( &ctx.blake, input + midlen, tail );
|
||||
|
||||
// sph_blake512(&ctx.blake, input, 80);
|
||||
sph_blake512_close(&ctx.blake, hash);
|
||||
|
||||
memset(&hash[16], 0, 64);
|
||||
@@ -239,6 +251,8 @@ int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *
|
||||
for (int k=0; k < 19; k++)
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
|
||||
xevan_blake512_midstate( endiandata );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
xevan_hash(hash, endiandata);
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.5.7])
|
||||
AC_INIT([cpuminer-opt], [3.5.8])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
Reference in New Issue
Block a user