v23.6
https://github.com/JayDDee/cpuminer-opt.git
@@ -2303,9 +2303,8 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
    XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
    XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
 
-#elif defined(__SSE2__) || defined(__ARM_NEON)
+#else // SSE2 or NEON
 
-/*
    const v128u64_t mask_cc = v128_set64( 0xffffffff00000000, 0xffffffff00000000 );
    const v128u64_t mask_f0 = v128_set64( 0xffffffffffffffff, 0 );
    const v128u64_t mask_3c = v128_set64( 0x00000000ffffffff, 0xffffffff00000000 );
@@ -2326,9 +2325,10 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
    XB[1] = v128_blendv( t1, t3, mask_3c );
    XB[2] = v128_blendv( t2, t0, mask_f0 );
    XB[3] = v128_blendv( t3, t1, mask_3c );
-*/
 
 #endif
 
+/*
    v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
 
    YA0 = v128_set32( xa[15], xa[10], xa[ 5], xa[ 0] );
@@ -2348,8 +2348,7 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
    XB[2] = YB2;
    XA[3] = YA3;
    XB[3] = YB3;
 
 #endif
+*/
 }
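
The commented v128_set32() reference row above reveals the permutation the blend sequences implement. A scalar model for one 16-word state follows; row 0 (x[0], x[5], x[10], x[15]) matches the YA0 line, assuming v128_set32 lists lanes high to low the way _mm_set_epi32 does, while rows 1-3 are extrapolated from the usual simd128 salsa interleave and should be treated as an assumption, not the source's exact code:

   #include <stdint.h>
   #include <string.h>

   // Scalar model of the shuffle: y[4*i + j] = x[(4*i + 5*j) % 16].
   // Row 0 gathers x[0], x[5], x[10], x[15], matching the v128_set32
   // reference line above; the other rows are extrapolated.
   static void salsa_shuffle_scalar( uint32_t x[16] )
   {
      uint32_t y[16];
      for ( int i = 0; i < 4; i++ )
         for ( int j = 0; j < 4; j++ )
            y[ 4*i + j ] = x[ ( 4*i + 5*j ) & 15 ];
      memcpy( x, y, sizeof y );
   }

   // The matching unshuffle inverts it: x[(4*i + 5*j) & 15] = y[4*i + j].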
 
 static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
@@ -2357,8 +2356,8 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
 
    v128_t *XA = (v128_t*)xa;
    v128_t *XB = (v128_t*)xb;
 
 #if defined(__SSE4_1__)
 
    v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
    v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
@@ -2377,9 +2376,8 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
    XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
    XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
 
-#elif defined(__SSE2__) || defined(__ARM_NEON)
+#else // SSE2 or NEON
 
-/*
    const v128u64_t mask_cc = v128_set64( 0xffffffff00000000, 0xffffffff00000000 );
    const v128u64_t mask_f0 = v128_set64( 0xffffffffffffffff, 0 );
    const v128u64_t mask_3c = v128_set64( 0x00000000ffffffff, 0xffffffff00000000 );
@@ -2389,19 +2387,21 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
    v128_t t2 = v128_blendv( XA[1], XA[3], mask_3c );
    v128_t t3 = v128_blendv( XA[3], XA[1], mask_3c );
    XA[0] = v128_blendv( t0, t2, mask_cc );
-   XA[1] = v128_blendv( t1, t3, mask_cc );
-   XA[2] = v128_blendv( t2, t0, mask_cc );
+   XA[1] = v128_blendv( t2, t0, mask_cc );
+   XA[2] = v128_blendv( t1, t3, mask_cc );
    XA[3] = v128_blendv( t3, t1, mask_cc );
    t0 = v128_blendv( XB[0], XB[2], mask_f0 );
-   t1 = v128_blendv( XB[1], XB[3], mask_3c );
-   t2 = v128_blendv( XB[2], XB[0], mask_f0 );
+   t1 = v128_blendv( XB[2], XB[0], mask_f0 );
+   t2 = v128_blendv( XB[1], XB[3], mask_3c );
    t3 = v128_blendv( XB[3], XB[1], mask_3c );
    XB[0] = v128_blendv( t0, t2, mask_cc );
-   XB[1] = v128_blendv( t1, t3, mask_cc );
-   XB[2] = v128_blendv( t2, t0, mask_cc );
+   XB[1] = v128_blendv( t2, t0, mask_cc );
+   XB[2] = v128_blendv( t1, t3, mask_cc );
    XB[3] = v128_blendv( t3, t1, mask_cc );
-*/
 
 #endif
 
+/*
    v128_ovly ya[4], za[4], yb[4], zb[4];
 
    ya[0].m128 = XA[0];
@@ -2457,9 +2457,7 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
    XB[2] = zb[2].m128;
    XA[3] = za[3].m128;
    XB[3] = zb[3].m128;
 
 #endif
+*/
 }
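
The v128_blendv calls above are the portable stand-in for the SSE4.1 blends. A scalar model of the assumed semantics, with the argument order of _mm_blendv_epi8 and NEON's vbslq (result takes b where the mask bit is set); this is an inference from the call sites, not a quote of simd-utils:

   #include <stdint.h>

   // Per-bit select: returns b where mask is 1, a where mask is 0.
   // Assumed order v128_blendv( a, b, mask ).
   static inline uint64_t blendv64( uint64_t a, uint64_t b, uint64_t mask )
   {
      return ( a & ~mask ) | ( b & mask );
   }

With the mask constants above (e.g. mask_cc = 0xffffffff00000000 in each 64-bit half) the select operates on whole 32-bit words, mirroring _mm_blend_epi16 with immediate 0xcc.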
 
 static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
@@ -2611,7 +2609,7 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
    v128_t *XB = (v128_t*)xb;
    v128_t *XC = (v128_t*)xc;
 
 #if defined(__SSE4_1__)
 
    v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
    v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
@@ -2638,9 +2636,8 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
    XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
    XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );
 
-#elif defined(__SSE2__) || defined(__ARM_NEON)
+#else // SSE2 or NEON
 
-/*
    const v128u64_t mask_cc = v128_set64( 0xffffffff00000000, 0xffffffff00000000 );
    const v128u64_t mask_f0 = v128_set64( 0xffffffffffffffff, 0 );
    const v128u64_t mask_3c = v128_set64( 0x00000000ffffffff, 0xffffffff00000000 );
@@ -2650,28 +2647,29 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
    v128_t t2 = v128_blendv( XA[2], XA[3], mask_cc );
    v128_t t3 = v128_blendv( XA[3], XA[2], mask_cc );
    XA[0] = v128_blendv( t0, t2, mask_f0 );
-   XA[1] = v128_blendv( t1, t3, mask_3c );
-   XA[2] = v128_blendv( t2, t0, mask_f0 );
+   XA[1] = v128_blendv( t2, t0, mask_f0 );
+   XA[2] = v128_blendv( t1, t3, mask_3c );
    XA[3] = v128_blendv( t3, t1, mask_3c );
    t0 = v128_blendv( XB[0], XB[1], mask_cc );
    t1 = v128_blendv( XB[1], XB[0], mask_cc );
    t2 = v128_blendv( XB[2], XB[3], mask_cc );
    t3 = v128_blendv( XB[3], XB[2], mask_cc );
    XB[0] = v128_blendv( t0, t2, mask_f0 );
-   XB[1] = v128_blendv( t1, t3, mask_3c );
-   XB[2] = v128_blendv( t2, t0, mask_f0 );
+   XB[1] = v128_blendv( t2, t0, mask_f0 );
+   XB[2] = v128_blendv( t1, t3, mask_3c );
    XB[3] = v128_blendv( t3, t1, mask_3c );
    t0 = v128_blendv( XC[0], XC[1], mask_cc );
    t1 = v128_blendv( XC[1], XC[0], mask_cc );
    t2 = v128_blendv( XC[2], XC[3], mask_cc );
    t3 = v128_blendv( XC[3], XC[2], mask_cc );
    XC[0] = v128_blendv( t0, t2, mask_f0 );
-   XC[1] = v128_blendv( t1, t3, mask_3c );
-   XC[2] = v128_blendv( t2, t0, mask_f0 );
+   XC[1] = v128_blendv( t2, t0, mask_f0 );
+   XC[2] = v128_blendv( t1, t3, mask_3c );
    XC[3] = v128_blendv( t3, t1, mask_3c );
-*/
 
 #endif
 
+/*
    v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
 
    YA0 = v128_set32( xa[15], xa[10], xa[ 5], xa[ 0] );
@@ -2699,9 +2697,7 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
    XA[3] = YA3;
    XB[3] = YB3;
    XC[3] = YC3;
 
 #endif
+*/
 }
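
The SSE4.1 path leans on _mm_blend_epi16, whose 8-bit immediate picks 16-bit lanes from the second operand. A standalone check of the 0xcc pattern used throughout these hunks (this demo is added for illustration, it is not part of the commit):

   #include <immintrin.h>
   #include <stdint.h>
   #include <stdio.h>

   // Compile with -msse4.1. Bit i of the immediate selects 16-bit lane i
   // from b; 0xcc = 11001100b selects lanes 2,3,6,7, i.e. 32-bit words
   // 1 and 3.
   int main(void)
   {
      __m128i a = _mm_setr_epi32( 0x11111111, 0x22222222,
                                  0x33333333, 0x44444444 );
      __m128i b = _mm_setr_epi32( (int)0xaaaaaaaa, (int)0xbbbbbbbb,
                                  (int)0xcccccccc, (int)0xdddddddd );
      uint32_t r[4];
      _mm_storeu_si128( (__m128i*)r, _mm_blend_epi16( a, b, 0xcc ) );
      printf( "%08x %08x %08x %08x\n", r[0], r[1], r[2], r[3] );
      // prints: 11111111 bbbbbbbb 33333333 dddddddd
      return 0;
   }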
 
 static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
@@ -2738,9 +2734,8 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
    XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
    XC[3] = _mm_blend_epi16( t1, t3, 0x33 );
 
-#elif defined(__SSE2__) || defined(__ARM_NEON)
+#else // SSE2 or NEON
 
-/*
    const v128u64_t mask_cc = v128_set64( 0xffffffff00000000, 0xffffffff00000000 );
    const v128u64_t mask_f0 = v128_set64( 0xffffffffffffffff, 0 );
    const v128u64_t mask_3c = v128_set64( 0x00000000ffffffff, 0xffffffff00000000 );
@@ -2750,27 +2745,29 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
    v128_t t2 = v128_blendv( XA[1], XA[3], mask_3c );
    v128_t t3 = v128_blendv( XA[3], XA[1], mask_3c );
    XA[0] = v128_blendv( t0, t2, mask_cc );
-   XA[1] = v128_blendv( t1, t3, mask_cc );
-   XA[2] = v128_blendv( t2, t0, mask_cc );
+   XA[1] = v128_blendv( t2, t0, mask_cc );
+   XA[2] = v128_blendv( t1, t3, mask_cc );
    XA[3] = v128_blendv( t3, t1, mask_cc );
    t0 = v128_blendv( XB[0], XB[2], mask_f0 );
-   t1 = v128_blendv( XB[1], XB[3], mask_3c );
-   t2 = v128_blendv( XB[2], XB[0], mask_f0 );
+   t1 = v128_blendv( XB[2], XB[0], mask_f0 );
+   t2 = v128_blendv( XB[1], XB[3], mask_3c );
    t3 = v128_blendv( XB[3], XB[1], mask_3c );
    XB[0] = v128_blendv( t0, t2, mask_cc );
-   XB[1] = v128_blendv( t1, t3, mask_cc );
-   XB[2] = v128_blendv( t2, t0, mask_cc );
+   XB[1] = v128_blendv( t2, t0, mask_cc );
+   XB[2] = v128_blendv( t1, t3, mask_cc );
    XB[3] = v128_blendv( t3, t1, mask_cc );
    t0 = v128_blendv( XC[0], XC[2], mask_f0 );
-   t1 = v128_blendv( XC[1], XC[3], mask_3c );
-   t2 = v128_blendv( XC[2], XC[0], mask_f0 );
+   t1 = v128_blendv( XC[2], XC[0], mask_f0 );
+   t2 = v128_blendv( XC[1], XC[3], mask_3c );
    t3 = v128_blendv( XC[3], XC[1], mask_3c );
    XC[0] = v128_blendv( t0, t2, mask_cc );
-   XC[1] = v128_blendv( t1, t3, mask_cc );
-   XC[2] = v128_blendv( t2, t0, mask_cc );
+   XC[1] = v128_blendv( t2, t0, mask_cc );
+   XC[2] = v128_blendv( t1, t3, mask_cc );
    XC[3] = v128_blendv( t3, t1, mask_cc );
-*/
 
 #endif
 
+/*
    v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
 
    ya[0].m128 = XA[0];
@@ -2850,9 +2847,7 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
    XA[3] = za[3].m128;
    XB[3] = zb[3].m128;
    XC[3] = zc[3].m128;
 
 #endif
+*/
 }
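
The mask constants in the SSE2/NEON path correspond lane-for-lane to the SSE4.1 blend immediates: each immediate bit expands to 16 mask bits. A quick generator showing that 0xcc, 0xf0 and 0x3c reproduce mask_cc, mask_f0 and mask_3c, assuming v128_set64 takes the high half first as _mm_set_epi64x does:

   #include <stdint.h>
   #include <stdio.h>

   // Expand an _mm_blend_epi16-style immediate into two 64-bit mask halves.
   static void imm8_to_mask128( uint8_t imm, uint64_t *hi, uint64_t *lo )
   {
      uint64_t h[2] = { 0, 0 };
      for ( int i = 0; i < 8; i++ )
         if ( imm & ( 1u << i ) )
            h[ i / 4 ] |= 0xffffULL << ( 16 * ( i % 4 ) );
      *lo = h[0];  *hi = h[1];
   }

   int main(void)
   {
      uint64_t hi, lo;
      imm8_to_mask128( 0xcc, &hi, &lo );
      printf( "cc: %016llx %016llx\n",
              (unsigned long long)hi, (unsigned long long)lo );
      // cc: ffffffff00000000 ffffffff00000000  == mask_cc
      // f0: ffffffffffffffff 0000000000000000  == mask_f0
      // 3c: 00000000ffffffff ffffffff00000000  == mask_3c
      return 0;
   }
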
// Triple buffered, 3x memory usage

@@ -56,10 +56,10 @@ static const uint32_t sha256_initial_state[8] =
 
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 #define SCRYPT_THROUGHPUT 16
+#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
+#define SCRYPT_THROUGHPUT 2
 #elif defined(__AVX2__)
 #define SCRYPT_THROUGHPUT 8
-#elif defined(__SHA__) // NEON?
-#define SCRYPT_THROUGHPUT 2
 #else
 #define SCRYPT_THROUGHPUT 4
 #endif
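
SCRYPT_THROUGHPUT is the number of nonces hashed per call, so moving the SHA branch ahead of AVX2 makes hardware SHA take priority when both are present. A hedged sketch of how a scanhash loop strides by it; the helper name and layout here are illustrative, not the miner's actual code:

   #include <stdint.h>

   #ifndef SCRYPT_THROUGHPUT
   #define SCRYPT_THROUGHPUT 4   // the generic default from the ladder above
   #endif

   // Each pass hashes SCRYPT_THROUGHPUT lanes, one 20-word (80-byte)
   // block header per lane; word 19 holds the nonce.
   static void seed_lane_nonces( uint32_t *data, uint32_t nonce )
   {
      for ( int k = 0; k < SCRYPT_THROUGHPUT; k++ )
         data[ k*20 + 19 ] = nonce + k;
   }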

@@ -155,7 +155,7 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
       output[i] = bswap_32( ostate[i] );
 }
 
-#if defined(__SHA__)
+#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
 
 static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
    const uint32_t *key1, uint32_t *tstate0, uint32_t *tstate1,
@@ -266,6 +266,9 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
 
 #endif // SHA
 
 
 static const uint32_t keypad_4way[4 * 12] = {
    0x80000000, 0x80000000, 0x80000000, 0x80000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
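
keypad_4way interleaves four copies of the SHA-256 padding block for an 80-byte message (the hunk shows only its first two rows). For reference, the single-lane pattern: a leading 1 bit, zero fill, and the 64-bit message length in bits, 80*8 = 0x280, in the last word:

   #include <stdint.h>

   // Single-lane SHA-256 padding for an 80-byte message; the 4-way table
   // above repeats each of these words once per lane.
   static const uint32_t keypad[12] = {
      0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
   };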

@@ -1221,10 +1224,10 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
 
 #endif // AVX512
 
-#if ( SCRYPT_THROUGHPUT == 2 ) && defined(__SHA__)
+#if ( SCRYPT_THROUGHPUT == 2 ) && ( defined(__SHA__) || defined(__ARM_FEATURE_SHA2) )
 
-static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
-   uint32_t *midstate, int N, int thrid )
+static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input,
+   uint32_t *output, uint32_t *midstate, int N, int thrid )
 {
    uint32_t _ALIGN(128) tstate[ 2*8 ];
    uint32_t _ALIGN(128) ostate[ 2*8 ];
@@ -1241,13 +1244,13 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
    scrypt_core_simd128_2buf( W, scratchbuf, N );
    if ( work_restart[thrid].restart ) return 0;
 
-   PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
-      output, output+8 );
+   PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate,
+      ostate+8, W, W+32, output, output+8 );
 
    return 1;
 }
 
-#endif
+#endif // THROUGHPUT = 2 && SHA
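
Why the SHA path works on two buffers at once: a single SHA-256 dependency chain cannot keep the hardware hash unit busy, so the *_2BUF helpers advance two independent streams in lockstep. A toy model of that interleaving, with a placeholder round function rather than real SHA-256 intrinsics:

   #include <stdint.h>

   typedef struct { uint32_t h[8]; } state_t;

   // Stand-in for one hardware-assisted round; the real code uses
   // SHA-NI / ARM SHA2 intrinsics.
   static inline void toy_round( state_t *s, uint32_t w )
   {
      s->h[0] += s->h[4] ^ w;
   }

   // Two independent chains interleaved so the CPU can overlap their
   // latencies, which is the point of the 2BUF variants above.
   static void rounds_2buf( state_t *s0, state_t *s1,
                            const uint32_t *w0, const uint32_t *w1, int n )
   {
      for ( int i = 0; i < n; i++ )
      {
         toy_round( s0, w0[i] );
         toy_round( s1, w1[i] );
      }
   }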

 #if ( SCRYPT_THROUGHPUT == 4 )
 
@@ -1267,13 +1270,10 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
 
    HMAC_SHA256_80_init( input, tstate, ostate );
    PBKDF2_SHA256_80_128( tstate, ostate, input, W );
-
    HMAC_SHA256_80_init( input +20, tstate+ 8, ostate+ 8 );
    PBKDF2_SHA256_80_128( tstate+ 8, ostate+ 8, input +20, W+32 );
-
    HMAC_SHA256_80_init( input +40, tstate+16, ostate+16 );
    PBKDF2_SHA256_80_128( tstate+16, ostate+16, input +40, W+64 );
-
    HMAC_SHA256_80_init( input +60, tstate+24, ostate+24 );
    PBKDF2_SHA256_80_128( tstate+24, ostate+24, input +60, W+96 );
 
@@ -1303,11 +1303,8 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
    if ( work_restart[thrid].restart ) return 0;
 
    PBKDF2_SHA256_128_32( tstate, ostate, W, output );
-
    PBKDF2_SHA256_128_32( tstate+ 8, ostate+ 8, W+32, output+ 8 );
-
    PBKDF2_SHA256_128_32( tstate+16, ostate+16, W+64, output+16 );
-
    PBKDF2_SHA256_128_32( tstate+24, ostate+24, W+96, output+24 );
 
    return 1;
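
The offsets in these calls encode the flat 4-lane layout: 8 words of tstate/ostate/output per lane, 20 words of block header, 32 words of W. A hypothetical helper, not from the source, making the strides explicit:

   #include <stdint.h>

   // Strides inferred from the +8 / +20 / +32 offsets in the calls above.
   enum { STATE_WORDS = 8, HEADER_WORDS = 20, W_WORDS = 32 };

   static inline uint32_t *lane( uint32_t *base, int k, int stride )
   {
      return base + k * stride;
   }
   // e.g. lane( tstate, 3, STATE_WORDS ) == tstate+24,
   //      lane( W, 2, W_WORDS )          == W+64, matching the code above.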
@@ -1418,14 +1415,14 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
       rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
                                   thr_id );
 #elif ( SCRYPT_THROUGHPUT == 4 )
-#if defined(__SHA__)
+#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
       rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
                                       thr_id );
 #else
       rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
                                   thr_id );
 #endif
-#elif ( SCRYPT_THROUGHPUT == 2 ) && defined(__SHA__)
+#elif ( SCRYPT_THROUGHPUT == 2 ) && ( defined(__SHA__) || defined(__ARM_FEATURE_SHA2) )
       rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
                                       thr_id );
 #else
@@ -1472,10 +1469,10 @@ bool scrypt_miner_thread_init( int thr_id )
 
 bool register_scrypt_algo( algo_gate_t* gate )
 {
-#if defined(__SHA__)
-   gate->optimizations = SSE2_OPT | SHA_OPT;
+#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
+   gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
 #else
-   gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT;
+   gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
 #endif
    gate->miner_thread_init = (void*)&scrypt_miner_thread_init;
    gate->scanhash = (void*)&scanhash_scrypt;
@@ -1492,15 +1489,15 @@ bool register_scrypt_algo( algo_gate_t* gate )
       scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
    else
       scratchbuf_size = opt_param_n * 4 * 128; // 4 way
+#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
+// scrypt_throughput = 2;
+   scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
 #elif defined(__AVX2__)
 // scrypt_throughput = 8;
    if ( opt_param_n > 0x4000 )
       scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
    else
       scratchbuf_size = opt_param_n * 2 * 128; // 2 way
-#elif defined(__SHA__)
-// scrypt_throughput = 4;
-   scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
 #else
 // scrypt_throughput = 4;
    if ( opt_param_n > 0x4000 )
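
Worked numbers for the sizing expressions above: scrypt with r = 1 needs 128 bytes per V-array entry, N entries, times the number of interleaved buffers or lanes. The N value below is an example, not from the commit:

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      uint64_t N = 0x400;   // e.g. scrypt-1024
      // 128 bytes per block  x  N blocks  x  buffers processed in parallel
      printf( "2 buf: %llu KiB\n", (unsigned long long)( N * 2 * 128 / 1024 ) ); // 256
      printf( "3 buf: %llu KiB\n", (unsigned long long)( N * 3 * 128 / 1024 ) ); // 384
      printf( "4 way: %llu KiB\n", (unsigned long long)( N * 4 * 128 / 1024 ) ); // 512
      return 0;
   }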