mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.23.0
This commit is contained in:
@@ -311,7 +311,7 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
__m128i A, B, C, D, E, F, G, H;
|
||||
__m128i W[16]; memcpy_128( W, data, 16 );
|
||||
// Value required by H after round 60 to produce valid final hash
|
||||
const __m128i H_ = m128_const1_32( 0x136032ED );
|
||||
const __m128i H_ = _mm_set1_epi32( 0x136032ED );
|
||||
|
||||
A = _mm_load_si128( state_in );
|
||||
B = _mm_load_si128( state_in+1 );
|
||||
@@ -408,14 +408,14 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
void sha256_4way_init( sha256_4way_context *sc )
|
||||
{
|
||||
sc->count_high = sc->count_low = 0;
|
||||
sc->val[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = m128_const1_64( 0x510E527F510E527F );
|
||||
sc->val[5] = m128_const1_64( 0x9B05688C9B05688C );
|
||||
sc->val[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
sc->val[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
sc->val[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
sc->val[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
}
|
||||
|
||||
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
|
||||
@@ -458,7 +458,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -474,8 +474,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) );
|
||||
sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
|
||||
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
mm128_block_bswap_32( dst, sc->val );
|
||||
@@ -589,7 +589,6 @@ do { \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
|
||||
|
||||
#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
|
||||
@@ -863,7 +862,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
{
|
||||
__m256i A, B, C, D, E, F, G, H;
|
||||
__m256i W[16]; memcpy_256( W, data, 16 );
|
||||
const __m256i H_ = m256_const1_32( 0x136032ED );
|
||||
const __m256i H_ = _mm256_set1_epi32( 0x136032ED );
|
||||
|
||||
A = _mm256_load_si256( state_in );
|
||||
B = _mm256_load_si256( state_in+1 );
|
||||
@@ -979,14 +978,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
void sha256_8way_init( sha256_8way_context *sc )
|
||||
{
|
||||
sc->count_high = sc->count_low = 0;
|
||||
sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = m256_const1_64( 0x510E527F510E527F );
|
||||
sc->val[5] = m256_const1_64( 0x9B05688C9B05688C );
|
||||
sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
sc->val[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
sc->val[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
sc->val[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
}
|
||||
|
||||
// need to handle odd byte length for yespower.
|
||||
@@ -1032,7 +1031,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -1048,8 +1047,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );
|
||||
|
||||
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
@@ -1360,7 +1359,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
// Value for H at round 60, before adding K, needed to produce valid final
|
||||
// hash where H == 0.
|
||||
// H_ = -( H256[7] + K256[60] );
|
||||
const __m512i H_ = m512_const1_32( 0x136032ED );
|
||||
const __m512i H_ = _mm512_set1_epi32( 0x136032ED );
|
||||
|
||||
A = _mm512_load_si512( state_in );
|
||||
B = _mm512_load_si512( state_in+1 );
|
||||
@@ -1453,14 +1452,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
void sha256_16way_init( sha256_16way_context *sc )
|
||||
{
|
||||
sc->count_high = sc->count_low = 0;
|
||||
sc->val[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = m512_const1_64( 0x510E527F510E527F );
|
||||
sc->val[5] = m512_const1_64( 0x9B05688C9B05688C );
|
||||
sc->val[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
sc->val[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = _mm512_set1_epi64( 0x510E527F510E527F );
|
||||
sc->val[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
|
||||
sc->val[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
|
||||
}
|
||||
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data,
|
||||
@@ -1504,7 +1503,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -1520,8 +1519,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );
|
||||
|
||||
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
|
||||
@@ -28,32 +28,32 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = m512_const1_32( 0x80000000 );
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m512_const1_32( pdata[i] );
|
||||
vdata[i] = _mm512_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
|
||||
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m512_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
|
||||
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
@@ -116,31 +116,31 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = m256_const1_32( 0x80000000 );
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m256_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
@@ -204,31 +204,31 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = m128_const1_32( 0x80000000 );
|
||||
const __m128i four = m128_const1_32( 4 );
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m128_const1_32( pdata[i] );
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m128_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
@@ -68,7 +68,7 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
|
||||
*noncev = _mm512_add_epi32( *noncev, _mm512_set1_epi32( 16 ) );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
@@ -140,7 +140,7 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
|
||||
*noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
|
||||
@@ -28,31 +28,31 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = m512_const1_32( 0x80000000 );
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m512_const1_32( pdata[i] );
|
||||
vdata[i] = _mm512_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
|
||||
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m512_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
|
||||
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
@@ -120,31 +120,31 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = m256_const1_32( 0x80000000 );
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m256_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
@@ -215,31 +215,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = m128_const1_32( 0x80000000 );
|
||||
const __m128i four = m128_const1_32( 4 );
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m128_const1_32( pdata[i] );
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m128_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
@@ -302,31 +302,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = m128_const1_32( 0x80000000 );
|
||||
const __m128i four = m128_const1_32( 4 );
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m128_const1_32( pdata[i] );
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m128_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
@@ -243,7 +243,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||
sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
|
||||
sc->buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
|
||||
ptr += 8;
|
||||
if ( ptr > pad )
|
||||
{
|
||||
@@ -268,51 +268,56 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
|
||||
// SHA-512 4 way 64 bit
|
||||
|
||||
#define BSG5_0( x ) mm256_xor3( mm256_ror_64( x, 28 ), \
|
||||
mm256_ror_64( x, 34 ), \
|
||||
mm256_ror_64( x, 39 ) )
|
||||
|
||||
#define BSG5_1( x ) mm256_xor3( mm256_ror_64( x, 14 ), \
|
||||
mm256_ror_64( x, 18 ), \
|
||||
mm256_ror_64( x, 41 ) )
|
||||
|
||||
#define SSG5_0( x ) mm256_xor3( mm256_ror_64( x, 1 ), \
|
||||
mm256_ror_64( x, 8 ), \
|
||||
_mm256_srli_epi64( x, 7 ) )
|
||||
|
||||
#define SSG5_1( x ) mm256_xor3( mm256_ror_64( x, 19 ), \
|
||||
mm256_ror_64( x, 61 ), \
|
||||
_mm256_srli_epi64( x, 6 ) )
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO Enable for AVX10_256
|
||||
// 4 way is not used with AVX512 but will be with AVX10_256 when it
|
||||
// becomes available.
|
||||
|
||||
#define CH( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xca )
|
||||
|
||||
#define MAJ( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xe8 )
|
||||
|
||||
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
|
||||
__m256i T1 = BSG5_1( E ); \
|
||||
__m256i T2 = BSG5_0( A ); \
|
||||
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
|
||||
T1 = _mm256_add_epi64( T1, H ); \
|
||||
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
|
||||
T1 = _mm256_add_epi64( T1, T0 ); \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
#else // AVX2 only
|
||||
|
||||
#define CH(X, Y, Z) \
|
||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||
|
||||
#define MAJ(X, Y, Z) \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
|
||||
#define BSG5_0(x) \
|
||||
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
||||
_mm256_xor_si256( mm256_ror_64( x, 5 ), x ), 6 ), x ), 28 )
|
||||
|
||||
#define BSG5_1(x) \
|
||||
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
||||
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
|
||||
|
||||
/*
|
||||
#define SSG5_0(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_64(x, 1), mm256_ror_64(x, 8) ), _mm256_srli_epi64(x, 7) )
|
||||
|
||||
#define SSG5_1(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) )
|
||||
*/
|
||||
// Interleave SSG0 & SSG1 for better throughput.
|
||||
// return ssg0(w0) + ssg1(w1)
|
||||
static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
|
||||
{
|
||||
__m256i w0a, w1a, w0b, w1b;
|
||||
w0a = mm256_ror_64( w0, 1 );
|
||||
w1a = mm256_ror_64( w1,19 );
|
||||
w0b = mm256_ror_64( w0, 8 );
|
||||
w1b = mm256_ror_64( w1,61 );
|
||||
w0a = _mm256_xor_si256( w0a, w0b );
|
||||
w1a = _mm256_xor_si256( w1a, w1b );
|
||||
w0b = _mm256_srli_epi64( w0, 7 );
|
||||
w1b = _mm256_srli_epi64( w1, 6 );
|
||||
w0a = _mm256_xor_si256( w0a, w0b );
|
||||
w1a = _mm256_xor_si256( w1a, w1b );
|
||||
return _mm256_add_epi64( w0a, w1a );
|
||||
}
|
||||
|
||||
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \
|
||||
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
|
||||
__m256i T1 = BSG5_1( E ); \
|
||||
__m256i T2 = BSG5_0( A ); \
|
||||
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
|
||||
@@ -324,19 +329,27 @@ do { \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
#endif // AVX512VL AVX10_256
|
||||
|
||||
static void
|
||||
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
{
|
||||
int i;
|
||||
register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
|
||||
register __m256i A, B, C, D, E, F, G, H;
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
// Disable for AVX10_256
|
||||
__m256i X_xor_Y, Y_xor_Z;
|
||||
#endif
|
||||
|
||||
__m256i W[80];
|
||||
|
||||
mm256_block_bswap_64( W , in );
|
||||
mm256_block_bswap_64( W+8, in+8 );
|
||||
|
||||
for ( i = 16; i < 80; i++ )
|
||||
W[i] = _mm256_add_epi64( ssg512_add( W[i-15], W[i-2] ),
|
||||
_mm256_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
|
||||
W[i] = mm256_add4_64( SSG5_0( W[i-15] ), SSG5_1( W[i-2] ),
|
||||
W[ i- 7 ], W[ i-16 ] );
|
||||
|
||||
if ( ctx->initialized )
|
||||
{
|
||||
@@ -351,17 +364,20 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
}
|
||||
else
|
||||
{
|
||||
A = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
B = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
C = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
D = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
E = m256_const1_64( 0x510E527FADE682D1 );
|
||||
F = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
G = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
H = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
A = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
B = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
C = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
D = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
E = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
F = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
G = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
H = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
}
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
// Disable for AVX10_256
|
||||
Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
#endif
|
||||
|
||||
for ( i = 0; i < 80; i += 8 )
|
||||
{
|
||||
@@ -389,14 +405,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
else
|
||||
{
|
||||
ctx->initialized = true;
|
||||
r[0] = _mm256_add_epi64( A, m256_const1_64( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm256_add_epi64( B, m256_const1_64( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm256_add_epi64( C, m256_const1_64( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm256_add_epi64( D, m256_const1_64( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm256_add_epi64( E, m256_const1_64( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm256_add_epi64( F, m256_const1_64( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm256_add_epi64( G, m256_const1_64( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm256_add_epi64( H, m256_const1_64( 0x5BE0CD19137E2179 ) );
|
||||
r[0] = _mm256_add_epi64( A, _mm256_set1_epi64x( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm256_add_epi64( B, _mm256_set1_epi64x( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm256_add_epi64( C, _mm256_set1_epi64x( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm256_add_epi64( D, _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm256_add_epi64( E, _mm256_set1_epi64x( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm256_add_epi64( F, _mm256_set1_epi64x( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm256_add_epi64( G, _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm256_add_epi64( H, _mm256_set1_epi64x( 0x5BE0CD19137E2179 ) );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -441,7 +457,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
|
||||
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
|
||||
ptr += 8;
|
||||
if ( ptr > pad )
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user