mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.19.0
This commit is contained in:
@@ -62,6 +62,12 @@ void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
|
||||
const __m128i *W, const __m128i *state_in );
|
||||
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
|
||||
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
@@ -84,10 +90,12 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
const __m256i *W, const __m256i *state_in );
|
||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const __m256i *state_mid );
|
||||
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
@@ -109,10 +117,13 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
const __m512i *W, const __m512i *state_in );
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid );
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
|
||||
|
||||
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
@@ -611,11 +611,11 @@ static inline int scanhash_sha256d_8way_pooler( struct work *work,
|
||||
|
||||
#endif /* HAVE_SHA256_8WAY */
|
||||
|
||||
int scanhash_sha256d_pooler( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(128) data[64];
|
||||
uint32_t _ALIGN(32) hash[8];
|
||||
uint32_t _ALIGN(32) midstate[8];
|
||||
@@ -626,12 +626,12 @@ int scanhash_sha256d_pooler( struct work *work,
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
#ifdef HAVE_SHA256_8WAY
|
||||
if (sha256_use_8way())
|
||||
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
if ( sha256_use_8way() )
|
||||
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
#endif
|
||||
#ifdef HAVE_SHA256_4WAY
|
||||
if (sha256_use_4way())
|
||||
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
if ( sha256_use_4way() )
|
||||
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
|
||||
#endif
|
||||
|
||||
memcpy(data, pdata + 16, 64);
|
||||
@@ -695,8 +695,11 @@ bool register_sha256d_algo( algo_gate_t* gate )
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256D_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||
//#elif defined(SHA256D_8WAY)
|
||||
// gate->scanhash = (void*)&scanhash_sha256d_8way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_sha256d_pooler;
|
||||
// gate->scanhash = (void*)&scanhash_sha256d_4way;
|
||||
#endif
|
||||
// gate->hash = (void*)&sha256d;
|
||||
return true;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -10,13 +10,14 @@
|
||||
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i vdata[32] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -36,6 +37,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -49,39 +58,33 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_512( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( sha256_16way_transform_le_short( hash32, block, initstate ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -95,13 +98,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m256i block[16] __attribute__ ((aligned (64)));
|
||||
__m256i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m256i block[16] __attribute__ ((aligned (32)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -120,6 +124,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -133,35 +145,30 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_256( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( unlikely(
|
||||
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
@@ -179,12 +186,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i block[16] __attribute__ ((aligned (64)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[20] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -203,6 +212,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -214,39 +231,36 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_128( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_128( block + 5, 10 );
|
||||
block[15] = m128_const1_32( 80*8 ); // bit count
|
||||
sha256_4way_transform_le( hash32, block, midstate );
|
||||
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( unlikely(
|
||||
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
|
||||
@@ -6,12 +6,10 @@
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256D_16WAY 1
|
||||
/*
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA256D_8WAY 1
|
||||
#else
|
||||
#define SHA256D_4WAY 1
|
||||
*/
|
||||
#endif
|
||||
|
||||
bool register_sha256d_algo( algo_gate_t* gate );
|
||||
@@ -21,7 +19,7 @@ bool register_sha256d_algo( algo_gate_t* gate );
|
||||
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
/*
|
||||
|
||||
#if defined(SHA256D_8WAY)
|
||||
|
||||
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
|
||||
@@ -33,7 +31,7 @@ int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
|
||||
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
#if defined(__SHA__)
|
||||
|
||||
@@ -10,13 +10,14 @@
|
||||
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i vdata[32] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -36,7 +37,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
// initialize state
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
|
||||
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||
@@ -49,43 +57,37 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_512( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||
// 1. final 16 bytes of data, pre-padded
|
||||
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
sha256_16way_transform_le( block, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( unlikely(
|
||||
sha256_16way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -100,13 +102,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m256i block[16] __attribute__ ((aligned (64)));
|
||||
__m256i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m256i block[16] __attribute__ ((aligned (32)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -125,6 +128,14 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -138,43 +149,37 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
|
||||
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_256( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
|
||||
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
sha256_8way_transform_le( block, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
if ( unlikely(
|
||||
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -183,17 +188,110 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SHA256T_4WAY)
|
||||
|
||||
// Optimizations are slower with AVX/SSE2
|
||||
// https://github.com/JayDDee/cpuminer-opt/issues/344
|
||||
/*
|
||||
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = m128_const1_32( 0x80000000 );
|
||||
const __m128i four = m128_const1_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m128_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m128_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_4way_transform_le( block, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
if ( unlikely(
|
||||
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i block[16] __attribute__ ((aligned (64)));
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -212,6 +310,14 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
@@ -227,25 +333,9 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_128( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_128( block + 5, 10 );
|
||||
block[15] = m128_const1_32( 80*8 ); // bit count
|
||||
sha256_4way_transform_le( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
sha256_4way_transform_le( block, vdata+16, midstate );
|
||||
sha256_4way_transform_le( block, block, initstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
@@ -266,5 +356,6 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
Reference in New Issue
Block a user