Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

v3.19.6
@@ -65,6 +65,12 @@ If not what makes it happen or not happen?

Change Log
----------

v3.19.6

#363 Fixed a stratum bug where the first job may be ignored, delaying the start of hashing.
Fixed handling of nonce exhaustion when hashing a fast algo with extranonce disabled.
Small optimization to Shavite.

v3.19.5

Enhanced stratum-keepalive preemptively resets the stratum connection
@@ -69,7 +69,6 @@ void allium_16way_hash( void *state, const void *input )
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
             hash15, 256 );

// rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
@@ -284,7 +283,7 @@ void allium_8way_hash( void *hash, const void *input )
blake256_8way_close( &ctx.blake, vhashA );

dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
              vhashA, 256 );
              vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
@@ -49,7 +49,7 @@ void lyra2z_16way_hash( void *state, const void *input )

dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
               hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
               vhash, 256 );
               vhash, 256 );

intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
@@ -18,10 +18,13 @@ static const uint32_t IV512[] =
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};

/*
#define mm256_ror2x256hi_1x32( a, b ) \
   _mm256_blend_epi32( mm256_shuflr128_32( a ), \
                       mm256_shuflr128_32( b ), 0x88 )
*/

//#define mm256_ror2x256hi_1x32( a, b ) _mm256_alignr_epi8( b, a, 4 )

#if defined(__VAES__)
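As an illustrative aside, not part of the commit: the hunk above retires the blend-of-shuffles macro in favor of a single byte-alignment instruction. A minimal standalone sketch, assuming SSSE3 and a C99 compiler (the variable names are made up for illustration), showing that _mm_alignr_epi8( b, a, 4 ) returns { b[0], a[3], a[2], a[1] }, which is the partial 32-bit rotate the retired macro computed:

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   // SSSE3

int main(void)
{
   __m128i a = _mm_set_epi32( 3, 2, 1, 0 );   // a[3..0] = 3,2,1,0
   __m128i b = _mm_set_epi32( 7, 6, 5, 4 );   // b[3..0] = 7,6,5,4
   __m128i r = _mm_alignr_epi8( b, a, 4 );    // concatenate b:a, shift right by 4 bytes
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   // Prints "1 2 3 4", i.e. r[3..0] = { b[0], a[3], a[2], a[1] }.
   printf( "%u %u %u %u\n", out[0], out[1], out[2], out[3] );
   return 0;
}

The same per-128-bit-lane pattern carries over to the 256-bit and 512-bit forms (_mm256_alignr_epi8, _mm512_alignr_epi8) used in the round code below.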
@@ -127,24 +130,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )

// round 2, 6, 10

k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

p2 = _mm256_xor_si256( p2, x );

k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

p0 = _mm256_xor_si256( p0, x );

@@ -183,24 +186,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )

// round 4, 8, 12

k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

p0 = _mm256_xor_si256( p0, x );

k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

p2 = _mm256_xor_si256( p2, x );
@@ -11,10 +11,6 @@ static const uint32_t IV512[] =
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};

#define mm512_ror2x512hi_1x32( a, b ) \
   _mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \
                                    mm512_shuflr128_32( b ) )

static void
c512_4way( shavite512_4way_context *ctx, const void *msg )
{

@@ -106,24 +102,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )

// round 2, 6, 10

K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

P2 = _mm512_xor_si512( P2, X );

K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

P0 = _mm512_xor_si512( P0, X );

@@ -162,24 +158,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )

// round 4, 8, 12

K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

P0 = _mm512_xor_si512( P0, X );

K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

P2 = _mm512_xor_si512( P2, X );
@@ -59,30 +59,6 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};

// Partially rotate elements in two 128 bit vectors a & b as one 256 bit vector
// and return the rotated 128 bit vector a.
// a[3:0] = { b[0], a[3], a[2], a[1] }
#if defined(__SSSE3__)

#define mm128_ror256hi_1x32( a, b ) _mm_alignr_epi8( b, a, 4 )

#else // SSE2

#define mm128_ror256hi_1x32( a, b ) \
   _mm_or_si128( _mm_srli_si128( a, 4 ), \
                 _mm_slli_si128( b, 12 ) )

#endif

/*
#if defined(__AVX2__)
// 2 way version of above
// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
#define mm256_ror2x256hi_1x32( a, b ) \
   _mm256_blend_epi32( mm256_ror256_1x32( a ), \
                       mm256_rol256_3x32( b ), 0x88 )
#endif
*/

static void
c512( sph_shavite_big_context *sc, const void *msg )

@@ -190,31 +166,31 @@ c512( sph_shavite_big_context *sc, const void *msg )

// round 2, 6, 10

k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );

p2 = _mm_xor_si128( p2, x );

k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );

@@ -262,31 +238,31 @@ c512( sph_shavite_big_context *sc, const void *msg )

// round 4, 8, 12

k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );

p0 = _mm_xor_si128( p0, x );

k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
@@ -35,7 +35,7 @@

#include "sph_shavite.h"

#if !defined(__AES__)
#if !(defined(__AES__) && defined(__SSSE3__))

#ifdef __cplusplus
extern "C"{

@@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);

//Don't call these directly from application code, use the macros below.
#ifdef __AES__
#if defined(__AES__) && defined(__SSSE3__)

void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);
@@ -36,8 +36,8 @@ mv cpuminer cpuminer-avx2-sha-vaes
# AVX2 SHA AES: AMD Zen1
make clean || echo done
rm -f config.status
CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl
#CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl
#CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl
CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-avx2-sha
configure (vendored)

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.5.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.6.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=

# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.19.5'
PACKAGE_STRING='cpuminer-opt 3.19.5'
PACKAGE_VERSION='3.19.6'
PACKAGE_STRING='cpuminer-opt 3.19.6'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.19.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.19.6 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.19.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.19.6:";;
esac
cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.19.5
cpuminer-opt configure 3.19.6
generated by GNU Autoconf 2.69

Copyright (C) 2012 Free Software Foundation, Inc.

@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by cpuminer-opt $as_me 3.19.5, which was
It was created by cpuminer-opt $as_me 3.19.6, which was
generated by GNU Autoconf 2.69. Invocation command line was

$ $0 $@

@@ -2993,7 +2993,7 @@ fi

# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.19.5'
VERSION='3.19.6'

cat >>confdefs.h <<_ACEOF

@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.19.5, which was
This file was extended by cpuminer-opt $as_me 3.19.6, which was
generated by GNU Autoconf 2.69. Invocation command line was

CONFIG_FILES = $CONFIG_FILES

@@ -6756,7 +6756,7 @@ _ACEOF

cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.19.5
cpuminer-opt config.status 3.19.6
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.19.5])
AC_INIT([cpuminer-opt], [3.19.6])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
cpu-miner.c

@@ -2246,7 +2246,7 @@ static void *miner_thread( void *userdata )

if ( !algo_gate.miner_thread_init( thr_id ) )
{
applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id );
applog( LOG_ERR, "FAIL: thread %d failed to initialize", thr_id );
exit (1);
}

@@ -2274,10 +2274,24 @@ static void *miner_thread( void *userdata )
{
while ( unlikely( stratum_down ) )
   sleep( 1 );
if ( *nonceptr >= end_nonce )
   stratum_gen_work( &stratum, &g_work );
if ( unlikely( ( *nonceptr >= end_nonce )
               && !work_restart[thr_id].restart ) )
{
   if ( opt_extranonce )
      stratum_gen_work( &stratum, &g_work );
   else
   {
      if ( !thr_id )
      {
         applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" );
         applog( LOG_WARNING, "waiting for new work...");
      }
      while ( !work_restart[thr_id].restart )
         sleep ( 1 );
   }
}
}
else
else // GBT or getwork
{
pthread_rwlock_wrlock( &g_work_lock );

@@ -2288,8 +2302,7 @@ static void *miner_thread( void *userdata )

if ( unlikely( !get_work( mythr, &g_work ) ) )
{
pthread_rwlock_unlock( &g_work_lock );
applog( LOG_ERR, "work retrieval failed, exiting "
        "mining thread %d", thr_id );
applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
goto out;
}
g_work_time = time(NULL);

@@ -2805,15 +2818,11 @@ static void *stratum_thread(void *userdata )

{
stratum_down = false;
applog(LOG_BLUE,"Stratum connection established" );
if ( stratum.new_job ) // prime first job
   stratum_gen_work( &stratum, &g_work );
}
}

// report_summary_log( ( stratum_diff != stratum.job.diff )
//    && ( stratum_diff != 0. ) );

// if ( stratum.new_job )
//    stratum_gen_work( &stratum, &g_work );

// Wait for new message from server
if ( likely( stratum_socket_full( &stratum, opt_timeout ) ) )
{

@@ -3903,6 +3912,8 @@ int main(int argc, char *argv[])

if ( opt_debug )
   applog(LOG_INFO,"Creating stratum thread");

stratum.new_job = false; // just to make sure

/* init stratum thread info */
stratum_thr_id = opt_n_threads + 2;
thr = &thr_info[stratum_thr_id];
simd-utils/intrlv.h: file diff suppressed because it is too large.
@@ -272,9 +272,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

#endif

// Mask making

// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.

#define mm_movmask_64( v ) \
   _mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) )

#define mm_movmask_32( v ) \
   _mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) )
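As an illustrative aside, not part of the commit: the mask-making macros added above are meant to collect the sign bits of the 64-bit or 32-bit lanes into a small integer mask. A minimal SSE2 sketch of that intent (the helper name movmask_64 is hypothetical; the standard intrinsic is _mm_movemask_pd and it returns the mask as a plain int):

#include <stdio.h>
#include <emmintrin.h>   // SSE2

// Build a 2-bit mask from the sign bits of the two 64-bit lanes of v.
static inline int movmask_64( __m128i v )
{
   return _mm_movemask_pd( _mm_castsi128_pd( v ) );
}

int main(void)
{
   __m128i v = _mm_set_epi64x( -1, 1 );   // high lane negative, low lane positive
   printf( "%d\n", movmask_64( v ) );     // prints 2 (binary 10)
   return 0;
}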
// Diagonal blend: d = s3[3], s2[2], s1[1], s0[0]
// Diagonal blend

// Blend 4 32 bit elements from 4 vectors

@@ -284,7 +294,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
                _mm_blend_epi32( s1, s0, 0x1 ), 0x3 )

#elif defined(__SSE4_1)
#elif defined(__SSE4_1__)

#define mm128_diagonal_32( v3, v2, v1, v0 ) \
   mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \

@@ -401,6 +411,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )

// Limited 2 input shuffle
#define mm128_shuffle2_64( a, b, c ) \
   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
                                     _mm_castsi128_pd( b ), c ) );

#define mm128_shuffle2_32( a, b, c ) \
   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
                                     _mm_castsi128_ps( b ), c ) );

//
// Rotate vector elements accross all lanes
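As an illustrative aside, not part of the commit: the mm128_shuffle2 macros added in the hunk above build a limited two-input shuffle out of the floating-point shuffle instructions. A minimal standalone sketch, assuming SSE2 and calling the underlying intrinsic directly rather than the macro, of the 64-bit selection pattern, where bit 0 of the control picks the element taken from a and bit 1 picks the element taken from b:

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>   // SSE2

int main(void)
{
   __m128i a = _mm_set_epi64x( 11, 10 );   // a = { a1=11, a0=10 }
   __m128i b = _mm_set_epi64x( 21, 20 );   // b = { b1=21, b0=20 }
   // Same operation as mm128_shuffle2_64( a, b, 1 ):
   // result = { b[(c>>1)&1], a[c&1] } = { b0, a1 }
   __m128i r = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ),
                                                 _mm_castsi128_pd( b ), 1 ) );
   int64_t out[2];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%lld %lld\n", (long long)out[0], (long long)out[1] );   // prints 11 20
   return 0;
}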
@@ -532,9 +552,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#if defined(__SSSE3__)

// Function macro with two inputs and one output, inputs are preserved.
// Returns modified first arg.
// Two input functions are not available without SSSE3. Use procedure
// belowe instead.
// macros below instead.

#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )

@@ -548,12 +567,11 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )

// Procedure macroswith 2 inputs and 2 outputs, inputs are destroyed.
// Returns both modified args in place.
// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten.

// These macros retain the vrol/vror name for now to avoid
// confusion with the shufl2r/shuffle2l function macros above.
// These may be renamed to something like shufl2r2 for 2 1nputs and
// These may be renamed to something like shufl2r2 for 2 nputs and
// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.

#define mm128_vror256_64( v1, v2 ) \
@@ -233,6 +233,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )

#endif

// Mask making

// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.

#define mm256_movmask_64( v ) \
   _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )

#define mm256_movmask_32( v ) \
   _mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) )

// Diagonal blending

// Blend 4 64 bit elements from 4 vectors

@@ -405,6 +417,16 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Rotate elements within each 128 bit lane of 256 bit vector.

// Limited 2 input shuffle
#define mm256_shuffle2_64( a, b, c ) \
   _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( a ), \
                                           _mm256_castsi256_pd( b ), c ) );

#define mm256_shuffle2_32( a, b, c ) \
   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
                                           _mm256_castsi256_ps( b ), c ) );

#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
#define mm256_shufll128_64 mm256_swap128_64

@@ -485,20 +507,6 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
v2 = _mm256_xor_si256( v1, v2 ); \
v1 = _mm256_xor_si256( v1, v2 );

#define mm256_vror512_128( v1, v2 ) \
do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
   v2 = t; \
} while(0)

#define mm256_vrol512_128( v1, v2 ) \
do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v2 = _mm256_permute2x128( v2, v1, 0x21 ); \
   v1 = t; \
} while(0)

#endif // __AVX2__
#endif // SIMD_256_H__
@@ -493,7 +493,7 @@ static inline __m512i mm512_shufll_32( const __m512i v )
static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
{ return _mm512_alignr_epi64( v, v, n ); }

static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
{ return _mm512_alignr_epi32( v, v, n ); }

#define mm512_shuflr_16( v ) \
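As an illustrative aside, not part of the commit: the hunk above renames the 32-bit element helper from shufll to shuflr, and the direction matters because _mm512_alignr_epi32( v, v, n ) moves elements toward lower indices, i.e. a right rotation. A minimal sketch, assuming AVX-512F hardware and compiler support, that makes the direction visible:

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>   // AVX-512F

int main(void)
{
   uint32_t in[16], out[16];
   for ( int i = 0; i < 16; i++ ) in[i] = i;
   __m512i v = _mm512_loadu_si512( in );
   // Same pattern as the renamed mm512_shuflr_x32( v, 1 ):
   // out[i] = in[ (i + 1) % 16 ], a rotate toward lower element indices.
   __m512i r = _mm512_alignr_epi32( v, v, 1 );
   _mm512_storeu_si512( out, r );
   for ( int i = 0; i < 16; i++ ) printf( "%u ", out[i] );
   printf( "\n" );   // prints 1 2 3 ... 15 0
   return 0;
}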
@@ -581,8 +581,17 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
0x0e0d0c0b0a090807, 0x060504030201001f ) )

//
// Shuffle-roate elements within 128 bit lanes of 512 bit vector.
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.

// Limited 2 input, 1 output shuffle within 128 bit lanes.
#define mm512_shuffle2_64( a, b, c ) \
   _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
                                           _mm512_castsi512_pd( b ), c ) );

#define mm512_shuffle2_32( a, b, c ) \
   _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( a ), \
                                           _mm512_castsi512_ps( b ), c ) );

// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
@@ -610,6 +619,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
// shufl2r is 2 input ...
// Drop macros? They can easilly be rebuilt using shufl2 functions

// 2 input, 1 output
// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
// rotated v1
// visually confusing for shif2r because of arg order. First arg is always

@@ -627,76 +637,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )

// Rotate elements from 2 512 bit vectors in place, source arguments
// are overwritten.

#define mm512_swap1024_512( v1, v2 ) \
   v1 = _mm512_xor_si512( v1, v2 ); \
   v2 = _mm512_xor_si512( v1, v2 ); \
   v1 = _mm512_xor_si512( v1, v2 );
#define mm512_shufl2l_512 mm512_swap1024_512 \
#define mm512_shufl2r_512 mm512_swap1024_512 \

// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is
// for now.
// Rotate elements from 2 512 bit vectors in place, both source arguments
// are updated.

#define mm512_vror1024_256( v1, v2 ) \
do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v2 = t; \
} while(0)

#define mm512_vrol1024_256( v1, v2 ) \
do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v1 = t; \
} while(0)

#define mm512_vror1024_128( v1, v2 ) \
do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
   v2 = t; \
} while(0)

#define mm512_vrol1024_128( v1, v2 ) \
do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
   v1 = t; \
} while(0)

#define mm512_vror1024_64( v1, v2 ) \
do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
   v2 = t; \
} while(0)

#define mm512_vrol1024_64( v1, v2 ) \
do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
   v1 = t; \
} while(0)

#define mm512_vror1024_32( v1, v2 ) \
do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
   v2 = t; \
} while(0)

#define mm512_vrol1024_32( v1, v2 ) \
do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
   v2 = _mm512_alignr_epi32( v2, v1, 15 ); \
   v1 = t; \
} while(0)

#endif // AVX512
#endif // SIMD_512_H__