Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

Commit: v3.9.5.1
@@ -38,9 +38,31 @@ supported.

Change Log
----------

v3.9.5.1

Fixed skein2 crash on Windows.

Fixed CPU temperature reading on Ubuntu 19.04.

Realigned log message colours: blue is used to report normal activity and
yellow is only used to report abnormal activity.

Changed stats colours, yellow now means below average, white is average
range. Tweaked colour thresholds.

Changed colour of stratum difficulty change messages to blue to match other
normal protocol messages. Blue messages (block, stratum, submit) will no
longer be displayed when using the -q option.

Added job id to new block, share submit, and share result messages, and added
a new message when a new job is received for an existing block. This allows
better troubleshooting of the invalid job id rejects seen at zergpool.

Some more restructuring.

v3.9.5

New share reporting information includes calculation of equivalent hashrate
based on share difficulty, network latency, and a 5 minute summary.
Per-thread hash rate reports are disabled by default.
New command line option --hash-meter added to enable per-thread hash rates.
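The equivalent hashrate mentioned above is derived by converting a share's difficulty into an implied number of hashes and dividing by the time between submits. A minimal sketch, using the same diff-to-hash constant (0x40000000) that this commit defines in cpu-miner.c; the function and variable names below are illustrative only, not the miner's exact code path:

#include <stdint.h>

/* Illustrative sketch of the share-difficulty based hashrate estimate. */
static const uint64_t diff2hash = 0x40000000ULL;   /* hashes implied by difficulty 1.0 */

static double equivalent_hashrate( double share_diff, double share_time_sec )
{
   double share_hash = share_diff * (double)diff2hash;   /* work represented by the share */
   return share_time_sec > 0. ? share_hash / share_time_sec : 0.;   /* hashes per second */
}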
@@ -368,9 +368,9 @@ bool submit_solution( struct work *work, void *hash,
   if ( submit_work( thr, work ) )
   {
      if ( !opt_quiet )
         applog( LOG_NOTICE, "Share %d submitted by thread %d.",
         applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
                 accepted_share_count + rejected_share_count + 1,
                 thr->id );
                 thr->id, work->job_id );
      return true;
   }
   else

@@ -385,9 +385,12 @@ bool submit_lane_solution( struct work *work, void *hash,
   if ( submit_work( thr, work ) )
   {
      if ( !opt_quiet )
         applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
//       applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d.",
//               accepted_share_count + rejected_share_count + 1,
//               thr->id, lane );
         applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d, job %s.",
                 accepted_share_count + rejected_share_count + 1, thr->id,
                 lane );
                 lane, work->job_id );
      return true;
   }
   else
@@ -15,7 +15,7 @@ void blakehash_4way(void *state, const void *input)
   memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
   blake256r14_4way( &ctx, input + (64<<2), 16 );
   blake256r14_4way_close( &ctx, vhash );
   mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
   dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}

int scanhash_blake_4way( struct work *work, uint32_t max_nonce,

@@ -83,7 +83,7 @@ void blake2s_4way_hash( void *output, const void *input )
   blake2s_4way_update( &ctx, input + (64<<2), 16 );
   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );

   mm128_dintrlv_4x32( output, output+32, output+64, output+96,
   dintrlv_4x32( output, output+32, output+64, output+96,
                 vhash, 256 );
}

@@ -17,7 +17,7 @@ void blakecoin_4way_hash(void *state, const void *input)
   blake256r8_4way( &ctx, input + (64<<2), 16 );
   blake256r8_4way_close( &ctx, vhash );

   mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
   dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}

int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
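The hunks in this group (and most of the per-algorithm hunks that follow) are the same mechanical rename: the mm128_/mm256_-prefixed interleave helpers become the unprefixed intrlv_4x32 / dintrlv_4x32 / extr_lane_4x32 forms. For readers unfamiliar with the layout, here is a scalar model of what a 4-lane 32-bit deinterleave does; this is an illustration of the data layout only, not the library's SIMD implementation, and the signature simply mirrors how dintrlv_4x32 is called in these hunks.

#include <stdint.h>

// Illustrative scalar model: 4-way hashing stores the lanes word-interleaved
// ( w0[0], w1[0], w2[0], w3[0], w0[1], ... ); this pulls them back out into
// four separate hashes. bit_len is the hash size in bits.
static void dintrlv_4x32_model( void *d0, void *d1, void *d2, void *d3,
                                const void *src, int bit_len )
{
   const uint32_t *s = (const uint32_t*)src;
   uint32_t *d[4] = { (uint32_t*)d0, (uint32_t*)d1, (uint32_t*)d2, (uint32_t*)d3 };
   for ( int i = 0; i < bit_len/32; i++ )
      for ( int lane = 0; lane < 4; lane++ )
         d[lane][i] = s[ 4*i + lane ];
}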
@@ -23,7 +23,7 @@ void decred_hash_4way( void *state, const void *input )
|
||||
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
|
||||
blake256_4way( &ctx, tail, tail_len );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
|
@@ -33,7 +33,7 @@ void myriad_4way_hash( void *output, const void *input )
|
||||
myrgr_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
@@ -43,7 +43,7 @@ void myriad_4way_hash( void *output, const void *input )
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
sha256_4way( &ctx.sha, vhash, 64 );
|
||||
sha256_4way_close( &ctx.sha, output );
|
||||
@@ -89,7 +89,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane ] <= Htarg )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -89,7 +89,7 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -143,7 +143,7 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
|
||||
// && fulltest( hash+(i<<3), ptarget ) )
|
||||
for ( int i = 0; i < 4; i++ ) if ( !( (hash7[i] & mask ) == 0 ) )
|
||||
{
|
||||
mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
|
||||
mm256_extr_lane_4x64( lane_hash, hash, i, 256 );
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
|
@@ -21,8 +21,8 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
@@ -41,7 +41,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||
{
|
||||
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -36,7 +36,7 @@ void lyra2h_4way_hash( void *state, const void *input )
|
||||
blake256_4way( &ctx_blake, input + (64*4), 16 );
|
||||
blake256_4way_close( &ctx_blake, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
|
||||
16, 16, 16 );
|
||||
|
@@ -78,7 +78,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
|
||||
bmw256_4way( &ctx.bmw, vhash, 32 );
|
||||
bmw256_4way_close( &ctx.bmw, state );
|
||||
@@ -90,7 +90,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -116,7 +116,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -92,7 +92,7 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -115,7 +115,7 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
@@ -161,7 +161,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
|
||||
|
||||
blake256_4way( &ctx.blake, input, 80 );
|
||||
blake256_4way_close( &ctx.blake, vhash );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
|
||||
@@ -181,7 +181,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
|
||||
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
bmw256_4way( &ctx.bmw, vhash, 32 );
|
||||
bmw256_4way_close( &ctx.bmw, state );
|
||||
}
|
||||
@@ -192,7 +192,7 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -214,7 +214,7 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -36,7 +36,7 @@ void lyra2z_4way_hash( void *state, const void *input )
|
||||
blake256_4way( &ctx_blake, input + (64*4), 16 );
|
||||
blake256_4way_close( &ctx_blake, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
|
||||
|
@@ -168,7 +168,7 @@ int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t _ALIGN(128) edata[36];
|
||||
uint32_t vdata[4][36] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
@@ -217,7 +217,7 @@ int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -207,6 +207,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
|
||||
SHA512_Update( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
|
||||
|
||||
sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
|
||||
sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );
|
||||
|
||||
@@ -299,15 +300,15 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
hash_str,
|
||||
target_str);
|
||||
}
|
||||
work_set_target_ratio( work, hash );
|
||||
pdata[19] = data[19];
|
||||
goto out;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = n;
|
||||
|
||||
out:
|
||||
// can this be skipped after finding a share? Seems to work ok.
|
||||
//out:
|
||||
mpf_set_prec_raw(magifpi, prec0);
|
||||
mpf_set_prec_raw(magifpi0, prec0);
|
||||
mpf_set_prec_raw(mptmp, prec0);
|
||||
|
@@ -70,7 +70,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -122,7 +122,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( hash7[ lane ] & mask ) == 0 )
|
||||
{
|
||||
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -575,7 +575,7 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
// uint32_t *hash7 = &(hash[25]);
|
||||
// uint32_t lane_hash[8];
|
||||
// uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
@@ -191,7 +191,7 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
|
||||
{
|
||||
mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
|
||||
mm256_extr_lane_4x64( lane_hash, hash, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
|
@@ -118,7 +118,7 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
|
||||
for ( int i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
|
||||
{
|
||||
// deinterleave hash for lane
|
||||
mm256_extract_lane_8x32( lane_hash, hash, i, 256 );
|
||||
mm256_extr_lane_8x32( lane_hash, hash, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[27] = n + i;
|
||||
|
@@ -36,6 +36,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
@@ -79,8 +80,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
|
||||
if ( !( hash7[ lane ] & mask ) )
|
||||
{
|
||||
// deinterleave hash for lane
|
||||
uint32_t lane_hash[8];
|
||||
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
@@ -130,7 +130,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
@@ -168,7 +168,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( !( hash7[ lane ] & mask ) )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
|
@@ -99,7 +99,7 @@ int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
|
||||
for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
|
||||
{
|
||||
// deinterleave hash for lane
|
||||
mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
|
||||
mm256_extr_lane_8x32( lane_hash, hashx, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + i;
|
||||
@@ -111,7 +111,7 @@ int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
|
||||
for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
|
||||
|
||||
{
|
||||
mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
|
||||
mm64_extr_lane_2x32( lane_hash, hashy, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + 8 + i;
|
||||
@@ -204,7 +204,7 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
|
||||
if ( !( hash7[ lane ] & mask ) )
|
||||
{
|
||||
// deinterleave hash for lane
|
||||
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
@@ -287,7 +287,7 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( !( hash7[ lane ] & mask ) )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -48,7 +48,7 @@ void skeinhash_4way( void *state, const void *input )
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
|
||||
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
|
||||
|
||||
mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
|
||||
#else
|
||||
mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
|
||||
|
||||
@@ -63,7 +63,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -84,7 +84,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane ] <= Htarg )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -23,29 +23,41 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);
   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
// __m256i *noncev = (__m256i*)vdata + 9;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated
   uint32_t *noncep = vdata + 73;   // 9*8 + 1

   mm256_bswap_intrlv80_4x64( vdata, pdata );

   swab32_array( edata, pdata, 20 );

   mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );

// mm256_bswap_intrlv80_4x64( vdata, pdata );
   do
   {
      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      be32enc( noncep, n );
      be32enc( noncep+2, n+1 );
      be32enc( noncep+4, n+2 );
      be32enc( noncep+6, n+3 );

//    *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
//              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

      skein2hash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
      if ( hash7[ lane<<1 ] <= Htarg )
      {
         uint32_t lane_hash[8];
         mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
         mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;
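This is the skein2 crash fix from the change log: the AVX2 nonce injection through *noncev is commented out and the four lane nonces are instead written with plain big-endian stores at 32-bit offsets 73, 75, 77 and 79 of the interleaved data. be32enc is the miner's usual byte-swapping store; a minimal sketch of its effect follows (the real helper lives in the miner's utility headers and may be implemented differently).

#include <stdint.h>

// Sketch of a big-endian 32-bit store as used above.
static inline void be32enc_sketch( void *p, uint32_t x )
{
   uint8_t *b = (uint8_t*)p;
   b[0] = (uint8_t)( x >> 24 );   // most significant byte first
   b[1] = (uint8_t)( x >> 16 );
   b[2] = (uint8_t)( x >>  8 );
   b[3] = (uint8_t)( x );
}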
@@ -171,18 +171,14 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
@@ -190,10 +186,8 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
c11_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
@@ -64,13 +64,12 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(128) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t n = pdata[19];
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
uint64_t htmax[] = { 0,
|
||||
@@ -87,14 +86,7 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
|
||||
// we need bigendian data...
|
||||
for ( int i = 0; i < 20; i++ )
|
||||
{
|
||||
be32enc( &endiandata[i], pdata[i] );
|
||||
}
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
// precalc midstate
|
||||
// doing it one way then then interleaving would be faster but too
|
||||
@@ -108,10 +100,8 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do {
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
tribus_hash_4way( hash, vdata );
|
||||
|
||||
|
@@ -170,18 +170,14 @@ int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
@@ -189,10 +185,8 @@ int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
x11_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
@@ -171,24 +171,19 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
@@ -196,10 +191,8 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
x11gost_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
@@ -78,29 +78,23 @@ int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( opt_benchmark )
|
||||
((uint32_t*)ptarget)[7] = 0x0cff;
|
||||
for ( int k = 0; k < 19; k++ )
|
||||
be32enc( &endiandata[k], pdata[k] );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
skunk_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
@@ -189,12 +189,11 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
@@ -202,11 +201,7 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
for ( int m=0; m < 6; m++ )
|
||||
if ( Htarg <= htmax[m] )
|
||||
@@ -214,10 +209,8 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
x13_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
@@ -166,7 +166,7 @@ void x13sm3_4way_hash( void *state, const void *input )
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// SM3 parallel 32 bit
|
||||
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
|
||||
@@ -182,7 +182,7 @@ void x13sm3_4way_hash( void *state, const void *input )
|
||||
|
||||
sm3_4way( &ctx.sm3, vhash, 64 );
|
||||
sm3_4way_close( &ctx.sm3, sm3_vhash );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
|
||||
|
||||
// Hamsi parallel 4x32x2
|
||||
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
@@ -214,12 +214,11 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
@@ -227,11 +226,7 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
blake512_4way_init( &x13sm3_ctx_mid );
|
||||
blake512_4way( &x13sm3_ctx_mid, vdata, 64 );
|
||||
@@ -242,10 +237,8 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
x13sm3_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
@@ -43,7 +43,7 @@ void polytimos_4way_hash( void *output, const void *input )
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash32, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash32 );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
|
||||
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
|
||||
|
@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
@@ -82,31 +82,24 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x0cff;
|
||||
for ( int i=0; i < 19; i++ )
|
||||
{
|
||||
be32enc( &endiandata[i], pdata[i] );
|
||||
}
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
veltor_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
@@ -183,10 +183,9 @@ void x14_4way_hash( void *state, const void *input )
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal, parallel 32 bit
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, state );
|
||||
|
||||
}
|
||||
|
||||
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
|
||||
@@ -194,12 +193,11 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
@@ -207,11 +205,7 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
for ( int m=0; m < 6; m++ )
|
||||
if ( Htarg <= htmax[m] )
|
||||
@@ -219,10 +213,8 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
x14_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
@@ -234,7 +226,7 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
// deinterleave hash for lane
|
||||
uint32_t lane_hash[8];
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
|
@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal, parallel 32 bit
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 15 Whirlpool
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
@@ -218,12 +218,11 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *noncep = vdata + 73; // 9*8 + 1
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
@@ -231,11 +230,7 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_bswap_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
for ( int m=0; m < 6; m++ )
|
||||
if ( Htarg <= htmax[m] )
|
||||
@@ -243,10 +238,8 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+2, n+1 );
|
||||
be32enc( noncep+4, n+2 );
|
||||
be32enc( noncep+6, n+3 );
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
x15_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
@@ -248,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input )
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
break;
|
||||
case SHABAL:
|
||||
mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
|
||||
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, size );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
break;
|
||||
case WHIRLPOOL:
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
|
@@ -390,7 +390,7 @@ void sonoa_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
@@ -438,7 +438,7 @@ void sonoa_4way_hash( void *state, const void *input )
|
||||
shabal512_4way( &ctx.shabal, vhashB, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -522,13 +522,13 @@ void sonoa_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
@@ -635,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
@@ -769,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
@@ -807,9 +807,9 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
@@ -837,7 +837,7 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -124,8 +124,8 @@ void x17_4way_hash( void *state, const void *input )
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
|
||||
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
|
||||
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
mm256_dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
mm256_dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
// 11 Echo serial
|
||||
init_echo( &ctx.echo, 512 );
|
||||
@@ -165,13 +165,13 @@ void x17_4way_hash( void *state, const void *input )
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal, parallel 4 way 32 bit
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 15 Whirlpool serial
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
@@ -206,9 +206,9 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
@@ -235,7 +235,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( hash7[ lane ] & mask ) == 0 )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -161,13 +161,13 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// Parallel 4way 32 bit
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, dataLen );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
// Serial
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
@@ -295,13 +295,13 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
shabal512_4way_init( &ctx.shabal );
|
||||
shabal512_4way( &ctx.shabal, vhash, dataLen );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
|
||||
@@ -333,9 +333,9 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t lane_hash[8];
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
@@ -357,7 +357,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane ] <= Htarg )
|
||||
{
|
||||
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
|
@@ -18,7 +18,7 @@ rm -f config.status
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"

CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
configure (vendored) — 20 changed lines:
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.5.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.5.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.

@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.5'
PACKAGE_STRING='cpuminer-opt 3.9.5'
PACKAGE_VERSION='3.9.5.1'
PACKAGE_STRING='cpuminer-opt 3.9.5.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.9.5.1 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.9.5.1:";;
esac
cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.9.5
cpuminer-opt configure 3.9.5.1
generated by GNU Autoconf 2.69

Copyright (C) 2012 Free Software Foundation, Inc.

@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by cpuminer-opt $as_me 3.9.5, which was
It was created by cpuminer-opt $as_me 3.9.5.1, which was
generated by GNU Autoconf 2.69. Invocation command line was

$ $0 $@

@@ -2993,7 +2993,7 @@ fi

# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.9.5'
VERSION='3.9.5.1'


cat >>confdefs.h <<_ACEOF

@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.9.5, which was
This file was extended by cpuminer-opt $as_me 3.9.5.1, which was
generated by GNU Autoconf 2.69. Invocation command line was

CONFIG_FILES = $CONFIG_FILES

@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.9.5
cpuminer-opt config.status 3.9.5.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"


configure.ac:

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.5])
AC_INIT([cpuminer-opt], [3.9.5.1])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
cpu-miner.c — 342 changed lines:
@@ -843,8 +843,6 @@ void scale_hash_for_display ( double* hashrate, char* units )

const uint64_t diff2hash = 0x40000000ULL;

static struct timeval submit_time, prev_submit_time;
static struct timeval submit_interval;
static struct timeval five_min_start;
static double shash_sum = 0.;
static double bhash_sum = 0.;
@@ -852,40 +850,92 @@ static double time_sum = 0.;
static double latency_sum = 0.;
static uint64_t submits_sum = 0;

static int share_result( int result, struct work *work, const char *reason )
struct share_stats_t
{
   char hr[16];
   const char *sres;
   struct timeval submit_time;
   double net_diff;
   double share_diff;
   char job_id[32];
};

// with more and more parallelism the chances of submitting multiple
// shares in a very short time grows.
#define s_stats_size 4
static struct share_stats_t share_stats[ s_stats_size ];
static int s_get_ptr = 0, s_put_ptr = 0;
static struct timeval last_submit_time = {0};
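The hunk above introduces a small ring buffer of pending share statistics; the continuation below shows the consumer side, which pops the oldest entry at s_get_ptr inside share_result. The producer side is not part of this excerpt. A minimal sketch of how the submit path could fill a slot, assuming the same share_stats_t fields — the function name and exact fields copied are hypothetical, only the types and globals come from this commit:

// Hypothetical producer-side sketch; cpu-miner.c's real submit path may differ.
static void record_share_stats( double share_diff, double block_diff,
                                const char *job_id )
{
   pthread_mutex_lock( &stats_lock );
   struct share_stats_t *s = &share_stats[ s_put_ptr ];
   gettimeofday( &s->submit_time, NULL );        // when the share was submitted
   s->share_diff = share_diff;                   // difficulty of this share
   s->net_diff   = block_diff;                   // network difficulty at submit time
   strncpy( s->job_id, job_id ? job_id : "", sizeof s->job_id - 1 );
   s_put_ptr = ( s_put_ptr + 1 ) % s_stats_size; // wrap within the 4-entry ring
   pthread_mutex_unlock( &stats_lock );
}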
static int share_result( int result, struct work *null_work,
|
||||
const char *reason )
|
||||
{
|
||||
double share_time, share_hash, block_hash, share_size;
|
||||
double hashcount = 0.;
|
||||
double hashrate = 0.;
|
||||
uint64_t latency;
|
||||
struct share_stats_t my_stats = {0};
|
||||
struct timeval ack_time, latency_tv, et;
|
||||
char hr[32];
|
||||
char hr_units[4] = {0};
|
||||
bool solved;
|
||||
char shr[16];
|
||||
char shr[32];
|
||||
char shr_units[4] = {0};
|
||||
char diffstr[32];
|
||||
struct timeval ack_time, latency_tv;
|
||||
uint64_t latency;
|
||||
double share_time, share_hash, block_hash;
|
||||
double sharediff = work ? work->sharediff : stratum.sharediff;
|
||||
double share_size;
|
||||
const char *sres;
|
||||
bool solved;
|
||||
|
||||
// Mutex while accessing global counters.
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
|
||||
// There is a window where a second share could be submitted
|
||||
// before receiving the response for this one. When this happens
|
||||
// the second share will be processed from [1] on the next pass.
|
||||
memcpy( &my_stats, &share_stats[ s_get_ptr], sizeof my_stats );
|
||||
memset( &share_stats[ s_get_ptr ], 0, sizeof my_stats );
|
||||
s_get_ptr++;
|
||||
if ( s_get_ptr >= s_stats_size )
|
||||
s_get_ptr = 0;
|
||||
/*
|
||||
if ( share_stats[0].submit_time.tv_sec )
|
||||
{
|
||||
memcpy( &my_stats, &share_stats[0], sizeof my_stats );
|
||||
memset( &share_stats[0], 0, sizeof my_stats );
|
||||
}
|
||||
else if ( share_stats[1].submit_time.tv_sec )
|
||||
{
|
||||
memcpy( &my_stats, &share_stats[1], sizeof my_stats );
|
||||
memset( &share_stats[1], 0, sizeof my_stats );
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy( &my_stats, &share_stats[2], sizeof my_stats );
|
||||
memset( &share_stats[2], 0, sizeof my_stats );
|
||||
}
|
||||
*/
|
||||
for ( int i = 0; i < opt_n_threads; i++ )
|
||||
{
|
||||
hashcount += thr_hashcount[i];
|
||||
hashrate += thr_hashrates[i];
|
||||
}
|
||||
global_hashcount = hashcount;
|
||||
global_hashrate = hashrate;
|
||||
|
||||
// calculate latency
|
||||
// calculate latency and share time.
|
||||
gettimeofday( &ack_time, NULL );
|
||||
timeval_subtract( &latency_tv, &ack_time, &submit_time );
|
||||
timeval_subtract( &latency_tv, &ack_time, &my_stats.submit_time );
|
||||
latency = ( latency_tv.tv_sec * 1000 + latency_tv.tv_usec / 1000 );
|
||||
timeval_subtract( &et, &my_stats.submit_time, &last_submit_time );
|
||||
share_time = (double)et.tv_sec + ( (double)et.tv_usec / 1000000. );
|
||||
memcpy( &last_submit_time, &my_stats.submit_time, sizeof last_submit_time );
|
||||
|
||||
// calculate share hashrate and size
|
||||
share_time = submit_interval.tv_sec + ( submit_interval.tv_usec / 1000000. );
|
||||
share_hash = sharediff * diff2hash;
|
||||
block_hash = net_diff * diff2hash;
|
||||
share_size = block_hash == 0. ? 0. : share_hash / block_hash;
|
||||
share_hash = my_stats.share_diff * diff2hash;
|
||||
block_hash = my_stats.net_diff * diff2hash;
|
||||
share_size = block_hash == 0. ? 0. : share_hash / block_hash * 100.;
|
||||
|
||||
// check result
|
||||
result ? accepted_share_count++ : rejected_share_count++;
|
||||
solved = result && (my_stats.net_diff > 0.0 )
|
||||
&& ( my_stats.share_diff >= net_diff );
|
||||
solved_block_count += solved ? 1 : 0 ;
|
||||
|
||||
// update counters for 5 minute summary report
|
||||
shash_sum += share_hash;
|
||||
@@ -897,14 +947,10 @@ static int share_result( int result, struct work *work, const char *reason )
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
double share_hash_rate = share_time == 0. ? 0. : share_hash / share_time;
|
||||
double scaled_shr;
|
||||
|
||||
result ? accepted_share_count++ : rejected_share_count++;
|
||||
global_hashcount = hashcount;
|
||||
global_hashrate = hashrate;
|
||||
|
||||
// check for solved block
|
||||
solved = result && (net_diff > 0.0 ) && ( sharediff >= net_diff );
|
||||
solved_block_count += solved ? 1 : 0 ;
|
||||
scaled_shr = share_hash_rate;
|
||||
scale_hash_for_display ( &scaled_shr, shr_units );
|
||||
|
||||
if ( use_colors )
|
||||
{
|
||||
@@ -914,21 +960,33 @@ static int share_result( int result, struct work *work, const char *reason )
|
||||
|
||||
// colour code the share diff to highlight high value.
|
||||
if ( solved )
|
||||
sprintf( diffstr, "%s%.3g%s", CL_MAG, sharediff, CL_WHT );
|
||||
else if ( share_size > 0.01 )
|
||||
sprintf( diffstr, "%s%.3g%s", CL_GRN, sharediff, CL_WHT );
|
||||
else if ( share_size > 0.001 )
|
||||
sprintf( diffstr, "%s%.3g%s", CL_CYN, sharediff, CL_WHT );
|
||||
else if ( share_hash_rate > hashrate )
|
||||
sprintf( diffstr, "%s%.3g%s", CL_YLW, sharediff, CL_WHT );
|
||||
sprintf( diffstr, "%s%.3g%s", CL_MAG, my_stats.share_diff, CL_WHT );
|
||||
else if ( my_stats.share_diff > (my_stats.net_diff*0.1) )
|
||||
sprintf( diffstr, "%s%.3g%s", CL_GRN, my_stats.share_diff, CL_WHT );
|
||||
else if ( my_stats.share_diff > (my_stats.net_diff*0.01) )
|
||||
sprintf( diffstr, "%s%.3g%s", CL_CYN, my_stats.share_diff, CL_WHT );
|
||||
else
|
||||
sprintf( diffstr, "%.3g", sharediff );
|
||||
sprintf( diffstr, "%.3g", my_stats.share_diff );
|
||||
|
||||
if ( hashrate && share_hash_rate > (768.*hashrate) )
|
||||
sprintf( shr, "%s%.2f %sH/s%s", CL_MAG, scaled_shr, shr_units,
|
||||
CL_WHT );
|
||||
else if ( share_hash_rate > (32.*hashrate) )
|
||||
sprintf( shr, "%s%.2f %sH/s%s", CL_GRN, scaled_shr, shr_units,
|
||||
CL_WHT );
|
||||
else if ( share_hash_rate > 2.0*hashrate )
|
||||
sprintf( shr, "%s%.2f %sH/s%s", CL_CYN, scaled_shr, shr_units,
|
||||
CL_WHT );
|
||||
else if ( share_hash_rate > 0.5*hashrate )
|
||||
sprintf( shr, "%.2f %sH/s", scaled_shr, shr_units );
|
||||
else
|
||||
sprintf( shr, "%s%.2f %sH/s%s", CL_YLW, scaled_shr, shr_units,
|
||||
CL_WHT );
|
||||
}
|
||||
else
|
||||
{
|
||||
sres = ( solved ? "BLOCK SOLVED"
|
||||
: result ? "Accepted" : "Rejected" );
|
||||
sprintf( diffstr, "%3g", sharediff );
|
||||
sres = ( solved ? "BLOCK SOLVED" : result ? "Accepted" : "Rejected" );
|
||||
sprintf( diffstr, "%.3g", my_stats.share_diff );
|
||||
}
|
||||
|
||||
scale_hash_for_display ( &hashrate, hr_units );
|
||||
@@ -941,35 +999,19 @@ static int share_result( int result, struct work *work, const char *reason )
|
||||
sres, diffstr, share_time, accepted_share_count,
|
||||
rejected_share_count, solved_block_count );
|
||||
|
||||
if ( have_stratum && result && sharediff && net_diff && !opt_quiet )
|
||||
if ( have_stratum && result && my_stats.share_diff && my_stats.net_diff
|
||||
&& !opt_quiet )
|
||||
{
|
||||
// double share_hash_rate = share_time == 0. ? 0. : share_hash / share_time;
|
||||
|
||||
scale_hash_for_display ( &share_hash_rate, shr_units );
|
||||
if ( share_hash_rate < 10 )
|
||||
// very low hashrate, add digits
|
||||
sprintf( shr, "%.4f", share_hash_rate );
|
||||
else
|
||||
sprintf( shr, "%.2f", share_hash_rate );
|
||||
|
||||
applog( LOG_NOTICE, "Miner %s %sH/s, Share %s %sH/s, Latency %d ms.",
|
||||
hr, hr_units, shr, shr_units, latency );
|
||||
applog( LOG_NOTICE, "Height %d, Block share %.5f%%.",
|
||||
stratum.bloc_height, share_size*100. );
|
||||
applog( LOG_NOTICE, "Miner %s %sH/s, Share %s, Latency %d ms.",
|
||||
hr, hr_units, shr, latency );
|
||||
applog( LOG_NOTICE, "Height %d, job %s, %.5f%% block share.",
|
||||
stratum.bloc_height, my_stats.job_id, share_size );
|
||||
applog(LOG_INFO,"- - - - - - - - - - - - - - - - - - - - - - - - - - -");
|
||||
}
|
||||
|
||||
if ( reason )
|
||||
{
|
||||
applog( LOG_WARNING, "reject reason: %s", reason );
|
||||
/*
|
||||
if (strncmp(reason, "low difficulty share", 20) == 0)
|
||||
{
|
||||
opt_diff_factor = (opt_diff_factor * 2.0) / 3.0;
|
||||
applog(LOG_WARNING, "factor reduced to : %0.2f", opt_diff_factor);
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
}
|
||||
applog( LOG_WARNING, "reject reason: %s.", reason );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
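For reference, the per-share figures computed in this function follow from the diff2hash constant declared above (0x40000000, i.e. 2^30 hashes per unit of difficulty). A minimal sketch of the arithmetic, with illustrative helper names that are not taken from the patch:

/* A share of difficulty D represents roughly D * 2^30 hashes, spread over
   the time since the previous submit, so the implied hashrate is: */
static double share_hashrate( double share_diff, double share_time_sec )
{
    const double diff2hash = (double)0x40000000ULL;   /* 2^30 */
    return share_time_sec > 0. ? share_diff * diff2hash / share_time_sec : 0.;
}

/* The "block share" percentage is share_diff relative to net_diff;
   the 2^30 factor cancels out. */
static double block_share_pct( double share_diff, double net_diff )
{
    return net_diff > 0. ? share_diff / net_diff * 100. : 0.;
}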
@@ -1557,9 +1599,42 @@ bool submit_work(struct thr_info *thr, const struct work *work_in)
|
||||
{
|
||||
struct workio_cmd *wc;
|
||||
|
||||
memcpy( &prev_submit_time, &submit_time, sizeof submit_time );
|
||||
gettimeofday( &submit_time, NULL );
|
||||
timeval_subtract( &submit_interval, &submit_time, &prev_submit_time );
|
||||
// collect some share stats
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
|
||||
gettimeofday( &share_stats[ s_put_ptr ].submit_time, NULL );
|
||||
share_stats[ s_put_ptr ].share_diff = work_in->sharediff;
|
||||
share_stats[ s_put_ptr ].net_diff = net_diff;
|
||||
strcpy( share_stats[ s_put_ptr ].job_id, work_in->job_id );
|
||||
|
||||
s_put_ptr++;
|
||||
if ( s_put_ptr >= s_stats_size )
|
||||
s_put_ptr = 0;
|
||||
/*
|
||||
if ( share_stats[0].submit_time.tv_sec == 0 )
|
||||
{
|
||||
gettimeofday( &share_stats[0].submit_time, NULL );
|
||||
share_stats[0].share_diff = work_in->sharediff;
|
||||
share_stats[0].net_diff = net_diff;
|
||||
strcpy( share_stats[0].job_id, work_in->job_id );
|
||||
}
|
||||
else if ( share_stats[1].submit_time.tv_sec == 0 )
|
||||
{ // previous share hasn't been confirmed yet.
|
||||
gettimeofday( &share_stats[1].submit_time, NULL );
|
||||
share_stats[1].share_diff = work_in->sharediff;
|
||||
share_stats[1].net_diff = net_diff;
|
||||
strcpy( share_stats[1].job_id, work_in->job_id );
|
||||
}
|
||||
else
|
||||
{ // previous share hasn't been confirmed yet.
|
||||
gettimeofday( &share_stats[2].submit_time, NULL );
|
||||
share_stats[2].share_diff = work_in->sharediff;
|
||||
share_stats[2].net_diff = net_diff;
|
||||
strcpy( share_stats[2].job_id, work_in->job_id );
|
||||
}
|
||||
*/
|
||||
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
/* fill out work request message */
|
||||
wc = (struct workio_cmd *) calloc(1, sizeof(*wc));
|
||||
@@ -1723,6 +1798,7 @@ uint32_t* jr2_get_nonceptr( uint32_t *work_data )
|
||||
return (uint32_t*) ( ((uint8_t*) work_data) + algo_gate.nonce_index );
|
||||
}
|
||||
|
||||
|
||||
void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
|
||||
uint32_t *end_nonce_ptr, bool clean_job )
|
||||
{
|
||||
@@ -2023,7 +2099,12 @@ static void *miner_thread( void *userdata )
|
||||
break;
|
||||
}
|
||||
if ( !opt_quiet )
|
||||
applog( LOG_NOTICE, "Share submitted." );
|
||||
// applog( LOG_BLUE, "Share %d submitted by thread %d.",
|
||||
// accepted_share_count + rejected_share_count + 1,
|
||||
// mythr->id );
|
||||
applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
|
||||
accepted_share_count + rejected_share_count + 1,
|
||||
mythr->id, work.job_id );
|
||||
|
||||
// prevent stale work in solo
|
||||
// we can't submit twice a block!
|
||||
@@ -2035,6 +2116,7 @@ static void *miner_thread( void *userdata )
|
||||
pthread_mutex_unlock( &g_work_lock );
|
||||
}
|
||||
}
|
||||
|
||||
// Check for 5 minute summary report, mutex until global counters
|
||||
// are read and reset. It's bad form to unlock inside a conditional
|
||||
// block but more efficient. The logic is reversed to make the mutex
|
||||
@@ -2057,43 +2139,56 @@ static void *miner_thread( void *userdata )
|
||||
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
char hr[16];
|
||||
char hr_units[4] = {0};
|
||||
char bshstr[32];
|
||||
double hrate = time == 0. ? 0. : hash / time;
|
||||
double ghrate = global_hashrate;
|
||||
double shrate = time == 0. ? 0. : hash / time;
|
||||
double scaled_shrate = shrate;
|
||||
double avg_share = bhash == 0. ? 0. : hash / bhash * 100.;
|
||||
char shr[32];
|
||||
char shr_units[4] = {0};
|
||||
int temp = cpu_temp(0);
|
||||
char timestr[32];
|
||||
|
||||
latency = submits ? latency / submits : 0;
|
||||
scale_hash_for_display( &scaled_shrate, shr_units );
|
||||
|
||||
// colour code the block share to highlight high value.
|
||||
if ( avg_share > 90.0 )
|
||||
sprintf( bshstr, "%s%.5f%s", CL_MAG, avg_share, CL_WHT );
|
||||
else if ( avg_share > 1.0 )
|
||||
sprintf( bshstr, "%s%.5f%s", CL_GRN, avg_share, CL_WHT );
|
||||
else if ( avg_share > 0.1 )
|
||||
sprintf( bshstr, "%s%.5f%s", CL_CYN, avg_share, CL_WHT );
|
||||
else if ( hrate > global_hashrate )
|
||||
sprintf( bshstr, "%s%.5f%s", CL_YLW, avg_share, CL_WHT );
|
||||
if ( use_colors )
|
||||
{
|
||||
if ( shrate > (32.*ghrate) )
|
||||
sprintf( shr, "%s%.2f %sH/s%s", CL_MAG, scaled_shrate,
|
||||
shr_units, CL_WHT );
|
||||
else if ( shrate > (8.*ghrate) )
|
||||
sprintf( shr, "%s%.2f %sH/s%s", CL_GRN, scaled_shrate,
|
||||
shr_units, CL_WHT );
|
||||
else if ( shrate > 2.0*ghrate )
|
||||
sprintf( shr, "%s%.2f %sH/s%s", CL_CYN, scaled_shrate,
|
||||
shr_units, CL_WHT );
|
||||
else if ( shrate > 0.5*ghrate )
|
||||
sprintf( shr, "%.2f %sH/s", scaled_shrate, shr_units );
|
||||
else
|
||||
sprintf( bshstr, "%.5f", avg_share );
|
||||
sprintf( shr, "%s%.2f %sH/s%s", CL_YLW, scaled_shrate,
|
||||
shr_units, CL_WHT );
|
||||
|
||||
scale_hash_for_display ( &hrate, hr_units );
|
||||
if ( hrate < 10. )
|
||||
// very low hashrate, add digits
|
||||
sprintf( hr, "%.4f", hrate );
|
||||
if ( temp >= 80 ) sprintf( timestr, "%s%d C%s",
|
||||
CL_RED, temp, CL_WHT );
|
||||
else if (temp >=70 ) sprintf( timestr, "%s%d C%s",
|
||||
CL_YLW, temp, CL_WHT );
|
||||
else sprintf( timestr, "%d C", temp );
|
||||
}
|
||||
else
|
||||
sprintf( hr, "%.2f", hrate );
|
||||
sprintf( shr, "%.2f %sH/s", scaled_shrate, shr_units );
|
||||
|
||||
applog(LOG_NOTICE,"Summary: %d submits in %dm%02ds, block share %s%%.",
|
||||
(uint64_t)submits, et.tv_sec / 60,
|
||||
et.tv_sec % 60, bshstr );
|
||||
applog(LOG_NOTICE,"Submitted %d shares in %dm%02ds, %.5f%% block share.",
|
||||
(uint64_t)submits, et.tv_sec / 60, et.tv_sec % 60, avg_share );
|
||||
|
||||
#if ((defined(_WIN64) || defined(__WINDOWS__)))
|
||||
applog(LOG_NOTICE,"Share hashrate %s %sH/s, latency %d ms.",
|
||||
hr, hr_units, latency );
|
||||
applog(LOG_NOTICE,"Share hashrate %s, latency %d ms.",
|
||||
shr, latency );
|
||||
#else
|
||||
applog(LOG_NOTICE,"Share hashrate %s %sH/s, latency %d ms, temp %dC.",
|
||||
hr, hr_units, latency, (uint32_t)cpu_temp(0) );
|
||||
applog(LOG_NOTICE,"Share hashrate %s, latency %d ms, temp %s.",
|
||||
shr, latency, timestr );
|
||||
#endif
|
||||
// applog(LOG_NOTICE,"Performance index: %s.", hixstr );
|
||||
applog(LOG_INFO,"- - - - - - - - - - - - - - - - - - - - - - - - - - -");
|
||||
|
||||
}
|
||||
|
||||
@@ -2457,6 +2552,9 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
algo_gate.set_work_data_endian( g_work );
|
||||
pthread_mutex_unlock( &sctx->work_lock );
|
||||
|
||||
// if ( !opt_quiet )
|
||||
// applog( LOG_BLUE,"New job %s.", g_work->job_id );
|
||||
|
||||
if ( opt_debug )
|
||||
{
|
||||
unsigned char *xnonce2str = abin2hex( g_work->xnonce2,
|
||||
@@ -2470,14 +2568,14 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
|
||||
if ( stratum_diff != sctx->job.diff )
|
||||
{
|
||||
char sdiff[32] = { 0 };
|
||||
// char sdiff[32] = { 0 };
|
||||
// store for api stats
|
||||
stratum_diff = sctx->job.diff;
|
||||
if ( opt_showdiff && g_work->targetdiff != stratum_diff )
|
||||
if ( !opt_quiet && opt_showdiff && g_work->targetdiff != stratum_diff )
|
||||
{
|
||||
snprintf( sdiff, 32, " (%.5f)", g_work->targetdiff );
|
||||
applog( LOG_WARNING, "Stratum difficulty set to %g%s", stratum_diff,
|
||||
sdiff );
|
||||
// snprintf( sdiff, 32, " (%.5f)", g_work->targetdiff );
|
||||
applog( LOG_BLUE, "Stratum difficulty set to %g", stratum_diff );
|
||||
// sdiff );
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2547,16 +2645,16 @@ static void *stratum_thread(void *userdata )
|
||||
}
|
||||
}
|
||||
|
||||
if ( stratum.job.job_id &&
|
||||
( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) )
|
||||
if ( stratum.job.job_id
|
||||
&& ( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) )
|
||||
{
|
||||
pthread_mutex_lock(&g_work_lock);
|
||||
algo_gate.stratum_gen_work( &stratum, &g_work );
|
||||
time(&g_work_time);
|
||||
pthread_mutex_unlock(&g_work_lock);
|
||||
// restart_threads();
|
||||
restart_threads();
|
||||
|
||||
if (stratum.job.clean || jsonrpc_2)
|
||||
if ( stratum.job.clean || jsonrpc_2 )
|
||||
{
|
||||
static uint32_t last_bloc_height;
|
||||
if ( last_bloc_height != stratum.bloc_height )
|
||||
@@ -2564,20 +2662,24 @@ static void *stratum_thread(void *userdata )
|
||||
last_bloc_height = stratum.bloc_height;
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
if (net_diff > 0.)
|
||||
applog(LOG_BLUE, "%s block %d, network diff %.3f",
|
||||
algo_names[opt_algo], stratum.bloc_height, net_diff);
|
||||
if ( net_diff > 0. )
|
||||
applog( LOG_BLUE,
|
||||
"%s block %d, job %s, network diff %.4f",
|
||||
algo_names[opt_algo], stratum.bloc_height,
|
||||
g_work.job_id, net_diff);
|
||||
else
|
||||
applog(LOG_BLUE, "%s %s block %d", short_url,
|
||||
algo_names[opt_algo], stratum.bloc_height);
|
||||
applog( LOG_BLUE, "%s %s block %d, job %s",
|
||||
short_url, algo_names[opt_algo],
|
||||
stratum.bloc_height, g_work.job_id );
|
||||
}
|
||||
}
|
||||
restart_threads();
|
||||
else if ( !opt_quiet )
|
||||
applog( LOG_BLUE,"New job %s.", g_work.job_id );
|
||||
}
|
||||
else if (opt_debug && !opt_quiet)
|
||||
{
|
||||
applog(LOG_BLUE, "%s asks job %d for block %d", short_url,
|
||||
strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
|
||||
applog( LOG_BLUE, "%s asks job %d for block %d", short_url,
|
||||
strtoul( stratum.job.job_id, NULL, 16 ), stratum.bloc_height );
|
||||
}
|
||||
} // stratum.job.job_id
|
||||
|
||||
@@ -3402,23 +3504,23 @@ int main(int argc, char *argv[])
|
||||
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
|
||||
|
||||
// Initialize stats times and counters
|
||||
gettimeofday( &prev_submit_time, NULL );
|
||||
memcpy( &submit_time, &prev_submit_time, sizeof submit_time );
|
||||
memcpy( &five_min_start, &prev_submit_time, sizeof prev_submit_time );
|
||||
memset( share_stats, 0, 2 * sizeof (struct share_stats_t) );
|
||||
gettimeofday( &last_submit_time, NULL );
|
||||
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
|
||||
|
||||
if ( !check_cpu_capability() ) exit(1);
|
||||
|
||||
pthread_mutex_init(&stats_lock, NULL);
|
||||
pthread_mutex_init(&g_work_lock, NULL);
|
||||
pthread_mutex_init(&rpc2_job_lock, NULL);
|
||||
pthread_mutex_init(&rpc2_login_lock, NULL);
|
||||
pthread_mutex_init(&stratum.sock_lock, NULL);
|
||||
pthread_mutex_init(&stratum.work_lock, NULL);
|
||||
pthread_mutex_init( &stats_lock, NULL );
|
||||
pthread_mutex_init( &g_work_lock, NULL );
|
||||
pthread_mutex_init( &rpc2_job_lock, NULL );
|
||||
pthread_mutex_init( &rpc2_login_lock, NULL );
|
||||
pthread_mutex_init( &stratum.sock_lock, NULL );
|
||||
pthread_mutex_init( &stratum.work_lock, NULL );
|
||||
|
||||
flags = !opt_benchmark && strncmp(rpc_url, "https:", 6)
|
||||
? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL)
|
||||
flags = !opt_benchmark && strncmp( rpc_url, "https:", 6 )
|
||||
? ( CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL )
|
||||
: CURL_GLOBAL_ALL;
|
||||
if (curl_global_init(flags))
|
||||
if ( curl_global_init( flags ) )
|
||||
{
|
||||
applog(LOG_ERR, "CURL initialization failed");
|
||||
return 1;
|
||||
|
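The cpu-miner.c changes above replace the single submit/prev-submit timestamp pair with a small fixed-size ring of share_stats_t entries, so a pool response can still be matched to the right submit when several shares are in flight at once. A stripped-down sketch of that put/get pattern, assuming the four-entry ring and mutex used in the patch (the helper names stats_put and stats_get are illustrative only):

#include <pthread.h>
#include <string.h>
#include <sys/time.h>

#define S_STATS_SIZE 4

struct share_stats_t
{
    struct timeval submit_time;
    double         net_diff;
    double         share_diff;
    char           job_id[32];
};

static struct share_stats_t share_stats[ S_STATS_SIZE ];
static int s_put_ptr = 0, s_get_ptr = 0;
static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;

/* On submit: record the share's stats in the next slot. */
static void stats_put( double share_diff, double net_diff, const char *job_id )
{
    pthread_mutex_lock( &stats_lock );
    gettimeofday( &share_stats[ s_put_ptr ].submit_time, NULL );
    share_stats[ s_put_ptr ].share_diff = share_diff;
    share_stats[ s_put_ptr ].net_diff   = net_diff;
    strncpy( share_stats[ s_put_ptr ].job_id, job_id,
             sizeof share_stats[ s_put_ptr ].job_id - 1 );
    if ( ++s_put_ptr >= S_STATS_SIZE ) s_put_ptr = 0;
    pthread_mutex_unlock( &stats_lock );
}

/* On response: consume the oldest outstanding entry and clear its slot. */
static void stats_get( struct share_stats_t *out )
{
    pthread_mutex_lock( &stats_lock );
    memcpy( out, &share_stats[ s_get_ptr ], sizeof *out );
    memset( &share_stats[ s_get_ptr ], 0, sizeof share_stats[ s_get_ptr ] );
    if ( ++s_get_ptr >= S_STATS_SIZE ) s_get_ptr = 0;
    pthread_mutex_unlock( &stats_lock );
}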
13
simd-utils.h
@@ -183,9 +183,15 @@
|
||||
#include "simd-utils/simd-sse2.h"
|
||||
#include "simd-utils/intrlv-sse2.h"
|
||||
|
||||
#if defined(__AVX__)
|
||||
|
||||
// 256 bit vector basics
|
||||
#include "simd-utils/simd-avx.h"
|
||||
#include "simd-utils/intrlv-avx.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// 256 bit vectors
|
||||
// 256 bit everything else
|
||||
#include "simd-utils/simd-avx2.h"
|
||||
#include "simd-utils/intrlv-avx2.h"
|
||||
|
||||
@@ -198,6 +204,11 @@
|
||||
|
||||
#endif // MMX
|
||||
#endif // SSE2
|
||||
#endif // AVX
|
||||
#endif // AVX2
|
||||
#endif // AVX512
|
||||
|
||||
// Picks implementation based on available CPU features.
|
||||
#include "simd-utils/intrlv-selector.h"
|
||||
|
||||
#endif // SIMD_UTILS_H__
|
||||
|
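The simd-utils.h hunk above slots the new AVX interleave header between the SSE2 and AVX2 layers and then pulls in intrlv-selector.h, which maps generic interleave names onto the widest implementation the build's instruction-set flags allow. A rough sketch of that compile-time dispatch idea; the macro names and mapping here are assumptions, not copied from the selector header:

/* Illustration of selector-style dispatch: a generic name resolves to
   whichever implementation the compiler flags enable. */
#if defined(__AVX__)
  #define intrlv_4x64_sel   mm256_intrlv_4x64   /* from intrlv-avx.h, shown below */
#else
  #define intrlv_4x64_sel   mm128_intrlv_4x64   /* hypothetical narrower fallback */
#endif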
867
simd-utils/intrlv-avx.h
Normal file
@@ -0,0 +1,867 @@
|
||||
#if !defined(INTRLV_AVX_H__)
|
||||
#define INTRLV_AVX_H__ 1
|
||||
|
||||
#if defined(__AVX__)
|
||||
|
||||
// Convenient short cuts for local use only
|
||||
|
||||
// Extract 64 bits from the low 128 bits of 256 bit vector.
|
||||
#define extr64_cast128_256( a, n ) \
|
||||
_mm_extract_epi64( _mm256_castsi256_si128( a ), n )
|
||||
|
||||
// Extract 32 bits from the low 128 bits of 256 bit vector.
|
||||
#define extr32_cast128_256( a, n ) \
|
||||
_mm_extract_epi32( _mm256_castsi256_si128( a ), n )
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
//
|
||||
// AVX 256 Bit Vectors
|
||||
//
|
||||
// 256 bit interleaving can be done with AVX.
|
||||
|
||||
#define mm256_put_64( s0, s1, s2, s3) \
|
||||
_mm256_set_epi64x( *((const uint64_t*)(s3)), *((const uint64_t*)(s2)), \
|
||||
*((const uint64_t*)(s1)), *((const uint64_t*)(s0)) )
|
||||
|
||||
#define mm256_put_32( s00, s01, s02, s03, s04, s05, s06, s07 ) \
|
||||
_mm256_set_epi32( *((const uint32_t*)(s07)), *((const uint32_t*)(s06)), \
|
||||
*((const uint32_t*)(s05)), *((const uint32_t*)(s04)), \
|
||||
*((const uint32_t*)(s03)), *((const uint32_t*)(s02)), \
|
||||
*((const uint32_t*)(s01)), *((const uint32_t*)(s00)) )
|
||||
|
||||
#define mm256_get_64( s, i0, i1, i2, i3 ) \
|
||||
_mm256_set_epi64x( ((const uint64_t*)(s))[i3], ((const uint64_t*)(s))[i2], \
|
||||
((const uint64_t*)(s))[i1], ((const uint64_t*)(s))[i0] )
|
||||
|
||||
#define mm256_get_32( s, i0, i1, i2, i3, i4, i5, i6, i7 ) \
|
||||
_mm256_set_epi32( ((const uint32_t*)(s))[i7], ((const uint32_t*)(s))[i6], \
|
||||
((const uint32_t*)(s))[i5], ((const uint32_t*)(s))[i4], \
|
||||
((const uint32_t*)(s))[i3], ((const uint32_t*)(s))[i2], \
|
||||
((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
|
||||
|
||||
/*
|
||||
// Blend 2 vectors alternating hi & lo: { hi[n], lo[n-1], ... hi[1], lo[0] }
|
||||
#define mm256_intrlv_blend_128( hi, lo ) \
|
||||
_mm256_blend_epi32( hi, lo, 0x0f )
|
||||
|
||||
#define mm256_intrlv_blend_64( hi, lo ) \
|
||||
_mm256_blend_epi32( hi, lo, 0x33 )
|
||||
|
||||
#define mm256_intrlv_blend_32( hi, lo ) \
|
||||
_mm256_blend_epi32( hi, lo, 0x55 )
|
||||
*/
|
||||
|
||||
// Interleave 8x32_256
|
||||
#define mm256_intrlv_8x32_256( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
|
||||
do { \
|
||||
__m128i s0hi = mm128_extr_hi128_256( s0 ); \
|
||||
__m128i s1hi = mm128_extr_hi128_256( s1 ); \
|
||||
__m128i s2hi = mm128_extr_hi128_256( s2 ); \
|
||||
__m128i s3hi = mm128_extr_hi128_256( s3 ); \
|
||||
__m128i s4hi = mm128_extr_hi128_256( s4 ); \
|
||||
__m128i s5hi = mm128_extr_hi128_256( s5 ); \
|
||||
__m128i s6hi = mm128_extr_hi128_256( s6 ); \
|
||||
__m128i s7hi = mm128_extr_hi128_256( s7 ); \
|
||||
casti_m256i( d,0 ) = _mm256_set_epi32( \
|
||||
extr32_cast128_256(s7,0), extr32_cast128_256(s6,0), \
|
||||
extr32_cast128_256(s5,0), extr32_cast128_256(s4,0), \
|
||||
extr32_cast128_256(s3,0), extr32_cast128_256(s2,0), \
|
||||
extr32_cast128_256(s1,0), extr32_cast128_256(s0,0) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set_epi32( \
|
||||
extr32_cast128_256(s7,1), extr32_cast128_256(s6,1), \
|
||||
extr32_cast128_256(s5,1), extr32_cast128_256(s4,1), \
|
||||
extr32_cast128_256(s3,1), extr32_cast128_256(s2,1), \
|
||||
extr32_cast128_256(s1,1), extr32_cast128_256(s0,1) ); \
|
||||
casti_m256i( d,2 ) = _mm256_set_epi32( \
|
||||
extr32_cast128_256(s7,2), extr32_cast128_256(s6,2), \
|
||||
extr32_cast128_256(s5,2), extr32_cast128_256(s4,2), \
|
||||
extr32_cast128_256(s3,2), extr32_cast128_256(s2,2), \
|
||||
extr32_cast128_256(s1,2), extr32_cast128_256(s0,2) ); \
|
||||
casti_m256i( d,3 ) = _mm256_set_epi32( \
|
||||
extr32_cast128_256(s7,3), extr32_cast128_256(s6,3), \
|
||||
extr32_cast128_256(s5,3), extr32_cast128_256(s4,3), \
|
||||
extr32_cast128_256(s3,3), extr32_cast128_256(s2,3), \
|
||||
extr32_cast128_256(s1,3), extr32_cast128_256(s0,3) ); \
|
||||
casti_m256i( d,4 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7hi,0), mm128_extr_32(s6hi,0), \
|
||||
mm128_extr_32(s5hi,0), mm128_extr_32(s4hi,0), \
|
||||
mm128_extr_32(s3hi,0), mm128_extr_32(s2hi,0), \
|
||||
mm128_extr_32(s1hi,0), mm128_extr_32(s0hi,0) ); \
|
||||
casti_m256i( d,5 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7hi,1), mm128_extr_32(s6hi,1), \
|
||||
mm128_extr_32(s5hi,1), mm128_extr_32(s4hi,1), \
|
||||
mm128_extr_32(s3hi,1), mm128_extr_32(s2hi,1), \
|
||||
mm128_extr_32(s1hi,1), mm128_extr_32(s0hi,1) ); \
|
||||
casti_m256i( d,6 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7hi,2), mm128_extr_32(s6hi,2), \
|
||||
mm128_extr_32(s5hi,2), mm128_extr_32(s4hi,2), \
|
||||
mm128_extr_32(s3hi,2), mm128_extr_32(s2hi,2), \
|
||||
mm128_extr_32(s1hi,2), mm128_extr_32(s0hi,2) ); \
|
||||
casti_m256i( d,7 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7hi,3), mm128_extr_32(s6hi,3), \
|
||||
mm128_extr_32(s5hi,3), mm128_extr_32(s4hi,3), \
|
||||
mm128_extr_32(s3hi,3), mm128_extr_32(s2hi,3), \
|
||||
mm128_extr_32(s1hi,3), mm128_extr_32(s0hi,3) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_intrlv_8x32_128( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
|
||||
do { \
|
||||
casti_m256i( d,0 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7,0), mm128_extr_32(s6,0), \
|
||||
mm128_extr_32(s5,0), mm128_extr_32(s4,0), \
|
||||
mm128_extr_32(s3,0), mm128_extr_32(s2,0), \
|
||||
mm128_extr_32(s1,0), mm128_extr_32(s0,0) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7,1), mm128_extr_32(s6,1), \
|
||||
mm128_extr_32(s5,1), mm128_extr_32(s4,1), \
|
||||
mm128_extr_32(s3,1), mm128_extr_32(s2,1), \
|
||||
mm128_extr_32(s1,1), mm128_extr_32(s0,1) ); \
|
||||
casti_m256i( d,2 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7,2), mm128_extr_32(s6,2), \
|
||||
mm128_extr_32(s5,2), mm128_extr_32(s4,2), \
|
||||
mm128_extr_32(s3,2), mm128_extr_32(s2,2), \
|
||||
mm128_extr_32(s1,2), mm128_extr_32(s0,2) ); \
|
||||
casti_m256i( d,3 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7,3), mm128_extr_32(s6,3), \
|
||||
mm128_extr_32(s5,3), mm128_extr_32(s4,3), \
|
||||
mm128_extr_32(s3,3), mm128_extr_32(s2,3), \
|
||||
mm128_extr_32(s1,3), mm128_extr_32(s0,3) ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
#define mm256_bswap_intrlv_8x32_256( d, src ) \
|
||||
do { \
|
||||
__m256i s0 = mm256_bswap_32( src ); \
|
||||
__m128i s1 = _mm256_extracti128_si256( s0, 1 ); \
|
||||
casti_m256i( d, 0 ) = _mm256_set1_epi32( _mm_extract_epi32( \
|
||||
_mm256_castsi256_si128( s0 ), 0 ) ); \
|
||||
casti_m256i( d, 1 ) = _mm256_set1_epi32( _mm_extract_epi32( \
|
||||
_mm256_castsi256_si128( s0 ), 1 ) ); \
|
||||
casti_m256i( d, 2 ) = _mm256_set1_epi32( _mm_extract_epi32( \
|
||||
_mm256_castsi256_si128( s0 ), 2 ) ); \
|
||||
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( \
|
||||
_mm256_castsi256_si128( s0 ), 3 ) ); \
|
||||
casti_m256i( d, 4 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 0 ) ); \
|
||||
casti_m256i( d, 5 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 1 ) ); \
|
||||
casti_m256i( d, 6 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 2 ) ); \
|
||||
casti_m256i( d, 7 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 3 ) ); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define mm256_bswap_intrlv_8x32_128( d, src ) \
|
||||
do { \
|
||||
__m128i ss = mm128_bswap_32( src ); \
|
||||
casti_m256i( d, 0 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 0 ) ); \
|
||||
casti_m256i( d, 1 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 1 ) ); \
|
||||
casti_m256i( d, 2 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 2 ) ); \
|
||||
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 3 ) ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
#define mm256_dintrlv_8x32_256( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
|
||||
do { \
|
||||
__m256i s0 = casti_m256i(s,0); \
|
||||
__m256i s1 = casti_m256i(s,1); \
|
||||
__m256i s2 = casti_m256i(s,2); \
|
||||
__m256i s3 = casti_m256i(s,3); \
|
||||
__m256i s4 = casti_m256i(s,4); \
|
||||
__m256i s5 = casti_m256i(s,5); \
|
||||
__m256i s6 = casti_m256i(s,6); \
|
||||
__m256i s7 = casti_m256i(s,7); \
|
||||
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
|
||||
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
|
||||
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
|
||||
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
|
||||
__m128i s4hi = _mm256_extracti128_si256( s4, 1 ); \
|
||||
__m128i s5hi = _mm256_extracti128_si256( s5, 1 ); \
|
||||
__m128i s6hi = _mm256_extracti128_si256( s6, 1 ); \
|
||||
__m128i s7hi = _mm256_extracti128_si256( s7, 1 ); \
|
||||
d0 = _mm256_set_epi32( \
|
||||
extr32_cast128_256( s7, 0 ), extr32_cast128_256( s6, 0 ), \
|
||||
extr32_cast128_256( s5, 0 ), extr32_cast128_256( s4, 0 ), \
|
||||
extr32_cast128_256( s3, 0 ), extr32_cast128_256( s2, 0 ), \
|
||||
extr32_cast128_256( s1, 0 ), extr32_cast128_256( s0, 0 ) );\
|
||||
d1 = _mm256_set_epi32( \
|
||||
extr32_cast128_256( s7, 1 ), extr32_cast128_256( s6, 1 ), \
|
||||
extr32_cast128_256( s5, 1 ), extr32_cast128_256( s4, 1 ), \
|
||||
extr32_cast128_256( s3, 1 ), extr32_cast128_256( s2, 1 ), \
|
||||
extr32_cast128_256( s1, 1 ), extr32_cast128_256( s0, 1 ) );\
|
||||
d2 = _mm256_set_epi32( \
|
||||
extr32_cast128_256( s7, 2 ), extr32_cast128_256( s6, 2 ), \
|
||||
extr32_cast128_256( s5, 2 ), extr32_cast128_256( s4, 2 ), \
|
||||
extr32_cast128_256( s3, 2 ), extr32_cast128_256( s2, 2 ), \
|
||||
extr32_cast128_256( s1, 2 ), extr32_cast128_256( s0, 2 ) );\
|
||||
d3 = _mm256_set_epi32( \
|
||||
extr32_cast128_256( s7, 3 ), extr32_cast128_256( s6, 3 ), \
|
||||
extr32_cast128_256( s5, 3 ), extr32_cast128_256( s4, 3 ), \
|
||||
extr32_cast128_256( s3, 3 ), extr32_cast128_256( s2, 3 ), \
|
||||
extr32_cast128_256( s1, 3 ), extr32_cast128_256( s0, 3 ) );\
|
||||
d4 = _mm256_set_epi32( \
|
||||
_mm_extract_epi32( s7hi, 0 ), _mm_extract_epi32( s6hi, 0 ), \
|
||||
_mm_extract_epi32( s5hi, 0 ), _mm_extract_epi32( s4hi, 0 ), \
|
||||
_mm_extract_epi32( s3hi, 0 ), _mm_extract_epi32( s2hi, 0 ), \
|
||||
_mm_extract_epi32( s1hi, 0 ), _mm_extract_epi32( s0hi, 0 ) ); \
|
||||
d5 = _mm256_set_epi32( \
|
||||
_mm_extract_epi32( s7hi, 1 ), _mm_extract_epi32( s6hi, 1 ), \
|
||||
_mm_extract_epi32( s5hi, 1 ), _mm_extract_epi32( s4hi, 1 ), \
|
||||
_mm_extract_epi32( s3hi, 1 ), _mm_extract_epi32( s2hi, 1 ), \
|
||||
_mm_extract_epi32( s1hi, 1 ), _mm_extract_epi32( s0hi, 1 ) ); \
|
||||
d6 = _mm256_set_epi32( \
|
||||
_mm_extract_epi32( s7hi, 2 ), _mm_extract_epi32( s6hi, 2 ), \
|
||||
_mm_extract_epi32( s5hi, 2 ), _mm_extract_epi32( s4hi, 2 ), \
|
||||
_mm_extract_epi32( s3hi, 2 ), _mm_extract_epi32( s2hi, 2 ), \
|
||||
_mm_extract_epi32( s1hi, 2 ), _mm_extract_epi32( s0hi, 2 ) ); \
|
||||
d7 = _mm256_set_epi32( \
|
||||
_mm_extract_epi32( s7hi, 3 ), _mm_extract_epi32( s6hi, 3 ), \
|
||||
_mm_extract_epi32( s5hi, 3 ), _mm_extract_epi32( s4hi, 3 ), \
|
||||
_mm_extract_epi32( s3hi, 3 ), _mm_extract_epi32( s2hi, 3 ), \
|
||||
_mm_extract_epi32( s1hi, 3 ), _mm_extract_epi32( s0hi, 3 ) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm128_dintrlv_8x32_128( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
|
||||
do { \
|
||||
__m128i s0 = casti_m128i(s,0); \
|
||||
__m128i s1 = casti_m128i(s,1); \
|
||||
__m128i s2 = casti_m128i(s,2); \
|
||||
__m128i s3 = casti_m128i(s,3); \
|
||||
d0 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d1 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 1 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 1 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d2 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d3 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d4 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d5 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d6 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d7 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_intrlv_4x64_256( d, s0, s1, s2, s3 ) \
|
||||
do { \
|
||||
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
|
||||
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
|
||||
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
|
||||
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
|
||||
casti_m256i( d,0 ) = _mm256_set_epi64x( \
|
||||
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ), \
|
||||
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set_epi64x( \
|
||||
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ), \
|
||||
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) ); \
|
||||
casti_m256i( d,2 ) = _mm256_set_epi64x( \
|
||||
_mm_extract_epi64( s3hi,0 ), _mm_extract_epi64( s2hi,0 ), \
|
||||
_mm_extract_epi64( s1hi,0 ), _mm_extract_epi64( s0hi,0 ) ); \
|
||||
casti_m256i( d,3 ) = _mm256_set_epi64x( \
|
||||
_mm_extract_epi64( s3hi,1 ), _mm_extract_epi64( s2hi,1 ), \
|
||||
_mm_extract_epi64( s1hi,1 ), _mm_extract_epi64( s0hi,1 ) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_intrlv_4x64_128( d, s0, s1, s2, s3 ) \
|
||||
do { \
|
||||
casti_m256i( d,0 ) = _mm256_set_epi64x( \
|
||||
_mm_extract_epi64( s3, 0 ), _mm_extract_epi64( s2, 0 ), \
|
||||
_mm_extract_epi64( s1, 0 ), _mm_extract_epi64( s0, 0 ) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set_epi64x( \
|
||||
_mm_extract_epi64( s3, 1 ), _mm_extract_epi64( s2, 1 ), \
|
||||
_mm_extract_epi64( s1, 1 ), _mm_extract_epi64( s0, 1 ) ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
#define mm256_bswap_intrlv_4x64_256( d, src ) \
|
||||
do { \
|
||||
__m256i s0 = mm256_bswap_32( src ); \
|
||||
__m128i s1 = _mm256_extracti128_si256( s0, 1 ); \
|
||||
casti_m256i( d,0 ) = _mm256_set1_epi64x( _mm_extract_epi64( \
|
||||
_mm256_castsi256_si128( s0 ), 0 ) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( \
|
||||
_mm256_castsi256_si128( s0 ), 1 ) ); \
|
||||
casti_m256i( d,2 ) = _mm256_set1_epi64x( _mm_extract_epi64( s1, 0 ) ); \
|
||||
casti_m256i( d,3 ) = _mm256_set1_epi64x( _mm_extract_epi64( s1, 1 ) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_bswap_intrlv_4x64_128( d, src ) \
|
||||
do { \
|
||||
__m128i ss = mm128_bswap_32( src ); \
|
||||
casti_m256i( d,0 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 0 ) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 1 ) ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
// 4 lanes of 256 bits using 64 bit interleaving (standard final hash size)
|
||||
static inline void mm256_dintrlv_4x64_256( void *d0, void *d1, void *d2,
|
||||
void *d3, const int n, const void *src )
|
||||
{
|
||||
__m256i s0 = *( (__m256i*) src ); // s[0][1:0]
|
||||
__m256i s1 = *( (__m256i*)(src+32) ); // s[1][1:0]
|
||||
__m256i s2 = *( (__m256i*)(src+64) ); // s[2][1:0]
|
||||
__m256i s3 = *( (__m256i*)(src+96) ); // s[3][1:0]
|
||||
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); // s[0][3:2]
|
||||
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); // s[1][3:2]
|
||||
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); // s[2][3:2]
|
||||
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); // s[3][3:2]
|
||||
|
||||
casti_m256i( d0,n ) = _mm256_set_epi64x(
|
||||
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ),
|
||||
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) );
|
||||
casti_m256i( d1,n ) = _mm256_set_epi64x(
|
||||
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ),
|
||||
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) );
|
||||
casti_m256i( d2,n ) = _mm256_set_epi64x(
|
||||
_mm_extract_epi64( s3hi, 0 ), _mm_extract_epi64( s2hi, 0 ),
|
||||
_mm_extract_epi64( s1hi, 0 ), _mm_extract_epi64( s0hi, 0 ) );
|
||||
casti_m256i( d3,n ) = _mm256_set_epi64x(
|
||||
_mm_extract_epi64( s3hi, 1 ), _mm_extract_epi64( s2hi, 1 ),
|
||||
_mm_extract_epi64( s1hi, 1 ), _mm_extract_epi64( s0hi, 1 ) );
|
||||
}
|
||||
|
||||
|
||||
// quarter avx2 block, 16 bytes * 4 lanes
|
||||
// 4 lanes of 128 bits using 64 bit interleaving
|
||||
// Used for last 16 bytes of 80 byte input, only used for testing.
|
||||
static inline void mm128_dintrlv_4x64_128( void *d0, void *d1, void *d2,
|
||||
void *d3, const int n, const void *src )
|
||||
{
|
||||
__m256i s0 = *( (__m256i*) src );
|
||||
__m256i s1 = *( (__m256i*)(src+32) );
|
||||
__m128i s0hi = _mm256_extracti128_si256( s0, 1 );
|
||||
__m128i s1hi = _mm256_extracti128_si256( s1, 1 );
|
||||
|
||||
casti_m128i( d0,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 0 ),
|
||||
extr64_cast128_256( s0 , 0 ) );
|
||||
casti_m128i( d1,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 1 ),
|
||||
extr64_cast128_256( s0 , 1 ) );
|
||||
casti_m128i( d2,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 0 ),
|
||||
_mm_extract_epi64( s0hi, 0 ) );
|
||||
casti_m128i( d3,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 1 ),
|
||||
_mm_extract_epi64( s0hi, 1 ) );
|
||||
}
|
||||
|
||||
/*
|
||||
static inline void mm256_dintrlv_2x128x256( void *d0, void *d1,
|
||||
const int n, const void *s )
|
||||
{
|
||||
casti_m256i( d0,n ) = mm256_get_64( s, 0, 1, 4, 5 );
|
||||
casti_m256i( d1,n ) = mm256_get_64( s, 2, 3, 6, 7 );
|
||||
}
|
||||
*/
|
||||
//
|
||||
|
||||
#define mm256_intrlv_4x32_256( d, s0, s1, s2, s3 ) \
|
||||
do { \
|
||||
casti_m256i( d,0 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32( s3, 1 ), mm128_extr_32( s2, 1 ), \
|
||||
mm128_extr_32( s1, 1 ), mm128_extr_32( s0, 1 ), \
|
||||
mm128_extr_32( s3, 0 ), mm128_extr_32( s2, 0 ), \
|
||||
mm128_extr_32( s1, 0 ), mm128_extr_32( s0, 0 ) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32( s3, 3 ), mm128_extr_32( s2, 3 ), \
|
||||
mm128_extr_32( s1, 3 ), mm128_extr_32( s0, 3 ), \
|
||||
mm128_extr_32( s3, 2 ), mm128_extr_32( s2, 2 ), \
|
||||
mm128_extr_32( s1, 2 ), mm128_extr_32( s0, 2 ) ); \
|
||||
casti_m256i( d,2 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32( s3, 5 ), mm128_extr_32( s2, 5 ), \
|
||||
mm128_extr_32( s1, 5 ), mm128_extr_32( s0, 5 ), \
|
||||
mm128_extr_32( s3, 4 ), mm128_extr_32( s2, 4 ), \
|
||||
mm128_extr_32( s1, 4 ), mm128_extr_32( s0, 4 ) ); \
|
||||
casti_m256i( d,3 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32( s3, 7 ), mm128_extr_32( s2, 7 ), \
|
||||
mm128_extr_32( s1, 7 ), mm128_extr_32( s0, 7 ), \
|
||||
mm128_extr_32( s3, 6 ), mm128_extr_32( s2, 6 ), \
|
||||
mm128_extr_32( s1, 6 ), mm128_extr_32( s0, 6 ) ); \
|
||||
} while(0)
|
||||
|
||||
// 256 bit versions of common 128 bit functions.
|
||||
static inline void mm256_intrlv_4x32( void *d, const void *s0,
|
||||
const void *s1, const void *s2, const void *s3, int bit_len )
|
||||
{
|
||||
mm256_intrlv_4x32_256( d ,casti_m256i(s0,0), casti_m256i(s1,0),
|
||||
casti_m256i(s2,0), casti_m256i(s3,0) );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_intrlv_4x32_256( d+128 ,casti_m256i(s0,1), casti_m256i(s1,1),
|
||||
casti_m256i(s2,1), casti_m256i(s3,1) );
|
||||
if ( bit_len <= 512 ) return;
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm128_intrlv_4x32_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
|
||||
casti_m128i(s2,4), casti_m128i(s3,4) );
|
||||
return;
|
||||
}
|
||||
mm256_intrlv_4x32_256( d+256 ,casti_m256i(s0,2), casti_m256i(s1,2),
|
||||
casti_m256i(s2,2), casti_m256i(s3,2) );
|
||||
mm256_intrlv_4x32_256( d+384 ,casti_m256i(s0,3), casti_m256i(s1,3),
|
||||
casti_m256i(s2,3), casti_m256i(s3,3) );
|
||||
}
|
||||
|
||||
static inline void mm256_dintrlv_4x32_256( void *d0, void *d1, void *d2,
|
||||
void *d3, const void *src )
|
||||
{
|
||||
__m256i s0 = *(__m256i*) src;
|
||||
__m256i s1 = *(__m256i*)(src+32);
|
||||
__m256i s2 = *(__m256i*)(src+64);
|
||||
__m256i s3 = *(__m256i*)(src+96);
|
||||
*(__m256i*)d0 = _mm256_set_epi32(
|
||||
_mm256_extract_epi32( s3,4 ), _mm256_extract_epi32( s3,0 ),
|
||||
_mm256_extract_epi32( s2,4 ), _mm256_extract_epi32( s2,0 ),
|
||||
_mm256_extract_epi32( s1,4 ), _mm256_extract_epi32( s1,0 ),
|
||||
_mm256_extract_epi32( s0,4 ), _mm256_extract_epi32( s0,0 ) );
|
||||
*(__m256i*)d1 = _mm256_set_epi32(
|
||||
_mm256_extract_epi32( s3,5 ), _mm256_extract_epi32( s3,1 ),
|
||||
_mm256_extract_epi32( s2,5 ), _mm256_extract_epi32( s2,1 ),
|
||||
_mm256_extract_epi32( s1,5 ), _mm256_extract_epi32( s1,1 ),
|
||||
_mm256_extract_epi32( s0,5 ), _mm256_extract_epi32( s0,1 ) );
|
||||
*(__m256i*)d2 = _mm256_set_epi32(
|
||||
_mm256_extract_epi32( s3,6 ), _mm256_extract_epi32( s3,2 ),
|
||||
_mm256_extract_epi32( s2,6 ), _mm256_extract_epi32( s2,2 ),
|
||||
_mm256_extract_epi32( s1,6 ), _mm256_extract_epi32( s1,2 ),
|
||||
_mm256_extract_epi32( s0,6 ), _mm256_extract_epi32( s0,2 ) );
|
||||
*(__m256i*)d3 = _mm256_set_epi32(
|
||||
_mm256_extract_epi32( s3,7 ), _mm256_extract_epi32( s3,3 ),
|
||||
_mm256_extract_epi32( s2,7 ), _mm256_extract_epi32( s2,3 ),
|
||||
_mm256_extract_epi32( s1,7 ), _mm256_extract_epi32( s1,3 ),
|
||||
_mm256_extract_epi32( s0,7 ), _mm256_extract_epi32( s0,3 ) );
|
||||
}
|
||||
|
||||
static inline void mm256_dintrlv_4x32( void *d0, void *d1, void *d2,
|
||||
void *d3, const void *s, int bit_len )
|
||||
{
|
||||
mm256_dintrlv_4x32_256( d0 , d1 , d2 , d3 , s );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_dintrlv_4x32_256( d0+ 32, d1+ 32, d2+ 32, d3+ 32, s+128 );
|
||||
if ( bit_len <= 512 ) return;
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm128_dintrlv_4x32_128( d0+ 64, d1+ 64, d2+ 64, d3+ 64, s+256 );
|
||||
return;
|
||||
}
|
||||
mm256_dintrlv_4x32_256( d0+ 64, d1+ 64, d2+ 64, d3+ 64, s+256 );
|
||||
mm256_dintrlv_4x32_256( d0+ 96, d1+ 96, d2+ 96, d3+ 96, s+384 );
|
||||
}
|
||||
|
||||
static inline void mm256_extr_lane_4x32( void *d, const void *s,
|
||||
const int lane, const int bit_len )
|
||||
{
|
||||
casti_m256i( d, 0 ) = mm256_get_32( s, lane , lane+ 4, lane+ 8, lane+12,
|
||||
lane+16, lane+20, lane+24, lane+28 );
|
||||
if ( bit_len <= 256 ) return;
|
||||
casti_m256i( d, 1 ) = mm256_get_32( s, lane+32, lane+36, lane+40, lane+44,
|
||||
lane+48, lane+52, lane+56, lane+60 );
|
||||
}
|
||||
|
||||
// Interleave 8 source buffers containing 32 bit data into the destination
|
||||
// vector
|
||||
static inline void mm256_intrlv_8x32( void *d, const void *s0,
|
||||
const void *s1, const void *s2, const void *s3, const void *s4,
|
||||
const void *s5, const void *s6, const void *s7, int bit_len )
|
||||
{
|
||||
mm256_intrlv_8x32_256( d , casti_m256i( s0,0 ), casti_m256i( s1,0 ),
|
||||
casti_m256i( s2,0 ), casti_m256i( s3,0 ), casti_m256i( s4,0 ),
|
||||
casti_m256i( s5,0 ), casti_m256i( s6,0 ), casti_m256i( s7,0 ) );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_intrlv_8x32_256( d+256, casti_m256i( s0,1 ), casti_m256i( s1,1 ),
|
||||
casti_m256i( s2,1 ), casti_m256i( s3,1 ), casti_m256i( s4,1 ),
|
||||
casti_m256i( s5,1 ), casti_m256i( s6,1 ), casti_m256i( s7,1 ) );
|
||||
if ( bit_len <= 512 ) return;
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm256_intrlv_8x32_128( d+512, casti_m128i( s0,4 ), casti_m128i( s1,4 ),
|
||||
casti_m128i( s2,4 ), casti_m128i( s3,4 ), casti_m128i( s4,4 ),
|
||||
casti_m128i( s5,4 ), casti_m128i( s6,4 ), casti_m128i( s7,4 ) );
|
||||
return;
|
||||
}
|
||||
mm256_intrlv_8x32_256( d+512, casti_m256i( s0,2 ), casti_m256i( s1,2 ),
|
||||
casti_m256i( s2,2 ), casti_m256i( s3,2 ), casti_m256i( s4,2 ),
|
||||
casti_m256i( s5,2 ), casti_m256i( s6,2 ), casti_m256i( s7,2 ) );
|
||||
mm256_intrlv_8x32_256( d+768, casti_m256i( s0,3 ), casti_m256i( s1,3 ),
|
||||
casti_m256i( s2,3 ), casti_m256i( s3,3 ), casti_m256i( s4,3 ),
|
||||
casti_m256i( s5,3 ), casti_m256i( s6,3 ), casti_m256i( s7,3 ) );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// A couple of mining-specific functions.
|
||||
/*
|
||||
// Interleave 80 bytes of 32 bit data for 8 lanes.
|
||||
static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s )
|
||||
{
|
||||
mm256_bswap_intrlv_8x32_256( d , casti_m256i( s, 0 ) );
|
||||
mm256_bswap_intrlv_8x32_256( d+256, casti_m256i( s, 1 ) );
|
||||
mm256_bswap_intrlv_8x32_128( d+512, casti_m128i( s, 4 ) );
|
||||
}
|
||||
*/
|
||||
|
||||
// Deinterleave 8 buffers of 32 bit data from the source buffer.
|
||||
// Sub-function can be called directly for 32 byte final hash.
|
||||
static inline void mm256_dintrlv_8x32( void *d0, void *d1, void *d2,
|
||||
void *d3, void *d4, void *d5, void *d6, void *d7,
|
||||
const void *s, int bit_len )
|
||||
{
|
||||
mm256_dintrlv_8x32_256( casti_m256i(d0,0), casti_m256i(d1,0),
|
||||
casti_m256i(d2,0), casti_m256i(d3,0), casti_m256i(d4,0),
|
||||
casti_m256i(d5,0), casti_m256i(d6,0), casti_m256i(d7,0), s );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_dintrlv_8x32_256( casti_m256i(d0,1), casti_m256i(d1,1),
|
||||
casti_m256i(d2,1), casti_m256i(d3,1), casti_m256i(d4,1),
|
||||
casti_m256i(d5,1), casti_m256i(d6,1), casti_m256i(d7,1), s+256 );
|
||||
if ( bit_len <= 512 ) return;
|
||||
// short block, final 16 bytes of input data
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm128_dintrlv_8x32_128( casti_m128i(d0,2), casti_m128i(d1,2),
|
||||
casti_m128i(d2,2), casti_m128i(d3,2), casti_m128i(d4,2),
|
||||
casti_m128i(d5,2), casti_m128i(d6,2), casti_m128i(d7,2), s+512 );
|
||||
return;
|
||||
}
|
||||
// bitlen == 1024
|
||||
mm256_dintrlv_8x32_256( casti_m256i(d0,2), casti_m256i(d1,2),
|
||||
casti_m256i(d2,2), casti_m256i(d3,2), casti_m256i(d4,2),
|
||||
casti_m256i(d5,2), casti_m256i(d6,2), casti_m256i(d7,2), s+512 );
|
||||
mm256_dintrlv_8x32_256( casti_m256i(d0,3), casti_m256i(d1,3),
|
||||
casti_m256i(d2,3), casti_m256i(d3,3), casti_m256i(d4,3),
|
||||
casti_m256i(d5,3), casti_m256i(d6,3), casti_m256i(d7,3), s+768 );
|
||||
}
|
||||
|
||||
static inline void mm256_extr_lane_8x32( void *d, const void *s,
|
||||
const int lane, const int bit_len )
|
||||
{
|
||||
casti_m256i( d,0 ) = mm256_get_32(s, lane , lane+ 8, lane+ 16, lane+ 24,
|
||||
lane+32, lane+ 40, lane+ 48, lane+ 56 );
|
||||
if ( bit_len <= 256 ) return;
|
||||
casti_m256i( d,1 ) = mm256_get_32(s, lane+64, lane+ 72, lane+ 80, lane+ 88,
|
||||
lane+96, lane+104, lane+112, lane+120 );
|
||||
// bit_len == 512
|
||||
}
|
||||
|
||||
// Interleave 4 source buffers containing 64 bit data into the destination
|
||||
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
|
||||
static inline void mm256_intrlv_4x64( void *d, const void *s0,
|
||||
const void *s1, const void *s2, const void *s3, int bit_len )
|
||||
{
|
||||
mm256_intrlv_4x64_256( d , casti_m256i(s0,0), casti_m256i(s1,0),
|
||||
casti_m256i(s2,0), casti_m256i(s3,0) );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_intrlv_4x64_256( d+128, casti_m256i(s0,1), casti_m256i(s1,1),
|
||||
casti_m256i(s2,1), casti_m256i(s3,1) );
|
||||
if ( bit_len <= 512 ) return;
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm256_intrlv_4x64_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
|
||||
casti_m128i(s2,4), casti_m128i(s3,4) );
|
||||
return;
|
||||
}
|
||||
// bit_len == 1024
|
||||
mm256_intrlv_4x64_256( d+256, casti_m256i(s0,2), casti_m256i(s1,2),
|
||||
casti_m256i(s2,2), casti_m256i(s3,2) );
|
||||
mm256_intrlv_4x64_256( d+384, casti_m256i(s0,3), casti_m256i(s1,3),
|
||||
casti_m256i(s2,3), casti_m256i(s3,3) );
|
||||
}
|
||||
/*
|
||||
// Interleave 80 bytes of 32 bit data for 8 lanes.
|
||||
static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s )
|
||||
{
|
||||
mm256_bswap_intrlv_4x64_256( d , casti_m256i( s, 0 ) );
|
||||
mm256_bswap_intrlv_4x64_256( d+128, casti_m256i( s, 1 ) );
|
||||
mm256_bswap_intrlv_4x64_128( d+256, casti_m128i( s, 4 ) );
|
||||
}
|
||||
|
||||
// Blend 32 byte lanes of hash from 2 sources according to control mask.
|
||||
// macro due to 256 bit value arg.
|
||||
#define mm256_blend_hash_4x64( dst, a, b, mask ) \
|
||||
do { \
|
||||
dst[0] = _mm256_blendv_epi8( a[0], b[0], mask ); \
|
||||
dst[1] = _mm256_blendv_epi8( a[1], b[1], mask ); \
|
||||
dst[2] = _mm256_blendv_epi8( a[2], b[2], mask ); \
|
||||
dst[3] = _mm256_blendv_epi8( a[3], b[3], mask ); \
|
||||
dst[4] = _mm256_blendv_epi8( a[4], b[4], mask ); \
|
||||
dst[5] = _mm256_blendv_epi8( a[5], b[5], mask ); \
|
||||
dst[6] = _mm256_blendv_epi8( a[6], b[6], mask ); \
|
||||
dst[7] = _mm256_blendv_epi8( a[7], b[7], mask ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
// Deinterleave 4 buffers of 64 bit data from the source buffer.
|
||||
// bit_len must be 256, 512, 640 or 1024 bits.
|
||||
// Requires overrun padding for 640 bit len.
|
||||
static inline void mm256_dintrlv_4x64( void *d0, void *d1, void *d2,
|
||||
void *d3, const void *s, int bit_len )
|
||||
{
|
||||
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 0, s );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 1, s+128 );
|
||||
if ( bit_len <= 512 ) return;
|
||||
// short block, final 16 bytes of input data
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm128_dintrlv_4x64_128( d0, d1, d2, d3, 4, s+256 );
|
||||
return;
|
||||
}
|
||||
// bit_len == 1024
|
||||
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 2, s+256 );
|
||||
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 3, s+384 );
|
||||
}
|
||||
|
||||
// extract and deinterleave specified lane.
|
||||
#define mm256_extr_lane_4x64_256 \
|
||||
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 )
|
||||
static inline void mm256_extr_lane_4x64( void *d, const void *s,
|
||||
const int lane, const int bit_len )
|
||||
{
|
||||
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 );
|
||||
if ( bit_len <= 256 ) return;
|
||||
casti_m256i( d, 1 ) = mm256_get_64( s, lane+16, lane+20, lane+24, lane+28 );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Convert from 4x32 SSE2 interleaving to 4x64 AVX2.
|
||||
// Can't do it in place
|
||||
static inline void mm256_rintrlv_4x32_4x64( void *dst, void *src,
|
||||
int bit_len )
|
||||
{
|
||||
__m256i* d = (__m256i*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
|
||||
d[0] = _mm256_set_epi32( s[ 7],s[ 3],s[ 6],s[ 2],s[ 5],s[ 1],s[ 4],s[ 0] );
|
||||
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[ 9],s[12],s[ 8] );
|
||||
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
|
||||
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
|
||||
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
|
||||
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
|
||||
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
|
||||
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
|
||||
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
|
||||
|
||||
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
|
||||
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
|
||||
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
|
||||
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// Convert 4x64 (256 bit) vectors to 4x32 (128 bit) vectors for AVX
|
||||
// bit_len must be multiple of 64
|
||||
static inline void mm256_rintrlv_4x64_4x32( void *dst, void *src,
|
||||
int bit_len )
|
||||
{
|
||||
__m256i *d = (__m256i*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
|
||||
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
|
||||
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
|
||||
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
|
||||
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
|
||||
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
|
||||
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
|
||||
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
|
||||
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
|
||||
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
|
||||
|
||||
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
|
||||
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
|
||||
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
|
||||
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
static inline void mm256_rintrlv_4x64_2x128( void *dst0, void *dst1,
|
||||
const void *src, int bit_len )
|
||||
{
|
||||
__m256i* d0 = (__m256i*)dst0;
|
||||
__m256i* d1 = (__m256i*)dst1;
|
||||
uint64_t *s = (uint64_t*)src;
|
||||
|
||||
d0[0] = _mm256_set_epi64x( s[ 5], s[ 1], s[ 4], s[ 0] );
|
||||
d1[0] = _mm256_set_epi64x( s[ 7], s[ 3], s[ 6], s[ 2] );
|
||||
|
||||
d0[1] = _mm256_set_epi64x( s[13], s[ 9], s[12], s[ 8] );
|
||||
d1[1] = _mm256_set_epi64x( s[15], s[11], s[14], s[10] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d0[2] = _mm256_set_epi64x( s[21], s[17], s[20], s[16] );
|
||||
d1[2] = _mm256_set_epi64x( s[23], s[19], s[22], s[18] );
|
||||
|
||||
d0[3] = _mm256_set_epi64x( s[29], s[25], s[28], s[24] );
|
||||
d1[3] = _mm256_set_epi64x( s[31], s[27], s[30], s[26] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d0[4] = _mm256_set_epi64x( s[37], s[33], s[36], s[32] );
|
||||
d1[4] = _mm256_set_epi64x( s[39], s[35], s[38], s[34] );
|
||||
|
||||
d0[5] = _mm256_set_epi64x( s[45], s[41], s[44], s[40] );
|
||||
d1[5] = _mm256_set_epi64x( s[47], s[43], s[46], s[42] );
|
||||
|
||||
d0[6] = _mm256_set_epi64x( s[53], s[49], s[52], s[48] );
|
||||
d1[6] = _mm256_set_epi64x( s[55], s[51], s[54], s[50] );
|
||||
|
||||
d0[7] = _mm256_set_epi64x( s[61], s[57], s[60], s[56] );
|
||||
d1[7] = _mm256_set_epi64x( s[63], s[59], s[62], s[58] );
|
||||
}
|
||||
|
||||
static inline void mm256_rintrlv_2x128_4x64( void *dst, const void *src0,
|
||||
const void *src1, int bit_len )
|
||||
{
|
||||
__m256i* d = (__m256i*)dst;
|
||||
uint64_t *s0 = (uint64_t*)src0;
|
||||
uint64_t *s1 = (uint64_t*)src1;
|
||||
|
||||
d[ 0] = _mm256_set_epi64x( s1[2], s1[0], s0[2], s0[0] );
|
||||
d[ 1] = _mm256_set_epi64x( s1[3], s1[1], s0[3], s0[1] );
|
||||
d[ 2] = _mm256_set_epi64x( s1[6], s1[4], s0[6], s0[4] );
|
||||
d[ 3] = _mm256_set_epi64x( s1[7], s1[5], s0[7], s0[5] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[ 4] = _mm256_set_epi64x( s1[10], s1[ 8], s0[10], s0[ 8] );
|
||||
d[ 5] = _mm256_set_epi64x( s1[11], s1[ 9], s0[11], s0[ 9] );
|
||||
d[ 6] = _mm256_set_epi64x( s1[14], s1[12], s0[14], s0[12] );
|
||||
d[ 7] = _mm256_set_epi64x( s1[15], s1[13], s0[15], s0[13] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[ 8] = _mm256_set_epi64x( s1[18], s1[16], s0[18], s0[16] );
|
||||
d[ 9] = _mm256_set_epi64x( s1[19], s1[17], s0[19], s0[17] );
|
||||
d[10] = _mm256_set_epi64x( s1[22], s1[20], s0[22], s0[20] );
|
||||
d[11] = _mm256_set_epi64x( s1[23], s1[21], s0[23], s0[21] );
|
||||
|
||||
d[12] = _mm256_set_epi64x( s1[26], s1[24], s0[26], s0[24] );
|
||||
d[13] = _mm256_set_epi64x( s1[27], s1[25], s0[27], s0[25] );
|
||||
d[14] = _mm256_set_epi64x( s1[30], s1[28], s0[30], s0[28] );
|
||||
d[15] = _mm256_set_epi64x( s1[31], s1[29], s0[31], s0[29] );
|
||||
}
|
||||
|
||||
|
||||
static inline void mm256_intrlv_2x128( const void *d, const void *s0,
|
||||
void *s1, const int bit_len )
|
||||
{
|
||||
__m128i s1hi = _mm256_extracti128_si256( casti_m256i( s1,0 ), 1 );
|
||||
__m128i s0hi = _mm256_extracti128_si256( casti_m256i( s0,0 ), 1 );
|
||||
casti_m256i( d,0 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,0 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,0 ) ) );
|
||||
casti_m256i( d,1 ) = mm256_concat_128( s1hi, s0hi );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
s0hi = _mm256_extracti128_si256( casti_m256i( s0,1 ), 1 );
|
||||
s1hi = _mm256_extracti128_si256( casti_m256i( s1,1 ), 1 );
|
||||
casti_m256i( d,2 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,1 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,1 ) ) );
|
||||
casti_m256i( d,3 ) = mm256_concat_128( s1hi, s0hi );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
casti_m256i( d,4 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
|
||||
return;
|
||||
}
|
||||
|
||||
s0hi = _mm256_extracti128_si256( casti_m256i( s0,2 ), 1 );
|
||||
s1hi = _mm256_extracti128_si256( casti_m256i( s1,2 ), 1 );
|
||||
casti_m256i( d,4 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
|
||||
casti_m256i( d,5 ) = mm256_concat_128( s1hi, s0hi );
|
||||
|
||||
s0hi = _mm256_extracti128_si256( casti_m256i( s0,3 ), 1 );
|
||||
s1hi = _mm256_extracti128_si256( casti_m256i( s1,3 ), 1 );
|
||||
casti_m256i( d,6 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,3 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,3 ) ) );
|
||||
casti_m256i( d,7 ) = mm256_concat_128( s1hi, s0hi );
|
||||
}
|
||||
|
||||
// 512 is the bit len used by most, eliminate the conditionals
|
||||
static inline void mm256_dintrlv_2x128_512( void *dst0, void *dst1,
|
||||
const void *s )
|
||||
{
|
||||
__m256i *d0 = (__m256i*)dst0;
|
||||
__m256i *d1 = (__m256i*)dst1;
|
||||
|
||||
__m256i s0 = casti_m256i( s, 0 );
|
||||
__m256i s1 = casti_m256i( s, 1 );
|
||||
d0[0] = _mm256_permute2x128_si256( s0, s1, 0x20 );
|
||||
d1[0] = _mm256_permute2x128_si256( s0, s1, 0x31 );
|
||||
|
||||
s0 = casti_m256i( s, 2 );
|
||||
s1 = casti_m256i( s, 3 );
|
||||
d0[1] = _mm256_permute2x128_si256( s0, s1, 0x20 );
|
||||
d1[1] = _mm256_permute2x128_si256( s0, s1, 0x31 );
|
||||
}
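The two immediates do the whole job here: 0x20 collects the low 128 bit half of each source vector and 0x31 collects the high halves, which is exactly a two lane deinterleave. A usage sketch (illustrative only, buffer names are hypothetical):

// Recover two 512 bit lane hashes from a 2x128 interleaved buffer.
uint64_t vhash[16] __attribute__ ((aligned (32)));  // A0 B0 A1 B1 A2 B2 A3 B3 (128 bit units)
uint64_t hash0[8]  __attribute__ ((aligned (32)));  // lane 0: A0 A1 A2 A3
uint64_t hash1[8]  __attribute__ ((aligned (32)));  // lane 1: B0 B1 B2 B3
mm256_dintrlv_2x128_512( hash0, hash1, vhash );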
|
||||
|
||||
// Phase out usage for all 512 bit data lengths
|
||||
static inline void mm256_dintrlv_2x128( void *dst0, void *dst1, const void *s,
|
||||
int bit_len )
|
||||
{
|
||||
__m256i *d0 = (__m256i*)dst0;
|
||||
__m256i *d1 = (__m256i*)dst1;
|
||||
|
||||
__m256i s0 = casti_m256i( s, 0 );
|
||||
__m256i s1 = casti_m256i( s, 1 );
|
||||
d0[0] = _mm256_permute2x128_si256( s0, s1, 0x20 );
|
||||
d1[0] = _mm256_permute2x128_si256( s0, s1, 0x31 );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
s0 = casti_m256i( s, 2 );
|
||||
s1 = casti_m256i( s, 3 );
|
||||
d0[1] = _mm256_permute2x128_si256( s0, s1, 0x20 );
|
||||
d1[1] = _mm256_permute2x128_si256( s0, s1, 0x31 );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
s0 = casti_m256i( s, 4 );
|
||||
s1 = casti_m256i( s, 5 );
|
||||
d0[2] = _mm256_permute2x128_si256( s0, s1, 0x20 );
|
||||
d1[2] = _mm256_permute2x128_si256( s0, s1, 0x31 );
|
||||
|
||||
s0 = casti_m256i( s, 6 );
|
||||
s1 = casti_m256i( s, 7 );
|
||||
d0[3] = _mm256_permute2x128_si256( s0, s1, 0x20 );
|
||||
d1[3] = _mm256_permute2x128_si256( s0, s1, 0x31 );
|
||||
}
|
||||
|
||||
#undef extr64_cast128_256
|
||||
#undef extr32_cast128_256
|
||||
|
||||
#endif // AVX
|
||||
#endif // INTRLV_AVX_H__
|
@@ -1,42 +1,13 @@
|
||||
#if !defined(INTRLV_AVX22_H__)
|
||||
#if !defined(INTRLV_AVX2_H__)
|
||||
#define INTRLV_AVX2_H__ 1
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// Convenient short cuts for local use only
|
||||
|
||||
// Extract 64 bits from the low 128 bits of 256 bit vector.
|
||||
#define extr64_cast128_256( a, n ) \
|
||||
_mm_extract_epi64( _mm256_castsi256_si128( a ), n )
|
||||
|
||||
// Extract 32 bits from the low 128 bits of 256 bit vector.
|
||||
#define extr32_cast128_256( a, n ) \
|
||||
_mm_extract_epi32( _mm256_castsi256_si128( a ), n )
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
//
|
||||
// AVX2 256 Bit Vectors
|
||||
//
|
||||
|
||||
#define mm256_put_64( s0, s1, s2, s3) \
|
||||
_mm256_set_epi64x( *((const uint64_t*)(s3)), *((const uint64_t*)(s2)), \
|
||||
*((const uint64_t*)(s1)), *((const uint64_t*)(s0)) )
|
||||
|
||||
#define mm256_put_32( s00, s01, s02, s03, s04, s05, s06, s07 ) \
|
||||
_mm256_set_epi32( *((const uint32_t*)(s07)), *((const uint32_t*)(s06)), \
|
||||
*((const uint32_t*)(s05)), *((const uint32_t*)(s04)), \
|
||||
*((const uint32_t*)(s03)), *((const uint32_t*)(s02)), \
|
||||
*((const uint32_t*)(s01)), *((const uint32_t*)(s00)) )
|
||||
|
||||
#define mm256_get_64( s, i0, i1, i2, i3 ) \
|
||||
_mm256_set_epi64x( ((const uint64_t*)(s))[i3], ((const uint64_t*)(s))[i2], \
|
||||
((const uint64_t*)(s))[i1], ((const uint64_t*)(s))[i0] )
|
||||
|
||||
#define mm256_get_32( s, i0, i1, i2, i3, i4, i5, i6, i7 ) \
|
||||
_mm256_set_epi32( ((const uint32_t*)(s))[i7], ((const uint32_t*)(s))[i6], \
|
||||
((const uint32_t*)(s))[i5], ((const uint32_t*)(s))[i4], \
|
||||
((const uint32_t*)(s))[i3], ((const uint32_t*)(s))[i2], \
|
||||
((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
|
||||
// A few functions that need AVX2 for 256 bit.
|
||||
|
||||
|
||||
// Blend 2 vectors alternating hi & lo: { hi[n], lo[n-1], ... hi[1], lo[0] }
|
||||
@@ -49,82 +20,6 @@
|
||||
#define mm256_intrlv_blend_32( hi, lo ) \
|
||||
_mm256_blend_epi32( hi, lo, 0x55 )
|
||||
|
||||
// Interleave 8x32_256
|
||||
#define mm256_intrlv_8x32_256( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
|
||||
{ \
|
||||
__m128i s0hi = mm128_extr_hi128_256( s0 ); \
|
||||
__m128i s1hi = mm128_extr_hi128_256( s1 ); \
|
||||
__m128i s2hi = mm128_extr_hi128_256( s2 ); \
|
||||
__m128i s3hi = mm128_extr_hi128_256( s3 ); \
|
||||
__m128i s4hi = mm128_extr_hi128_256( s4 ); \
|
||||
__m128i s5hi = mm128_extr_hi128_256( s5 ); \
|
||||
__m128i s6hi = mm128_extr_hi128_256( s6 ); \
|
||||
__m128i s7hi = mm128_extr_hi128_256( s7 ); \
|
||||
casti_m256i( d,0 ) = _mm256_set_epi32( \
|
||||
extr32_cast128_256(s7,0), extr32_cast128_256(s6,0), \
|
||||
extr32_cast128_256(s5,0), extr32_cast128_256(s4,0), \
|
||||
extr32_cast128_256(s3,0), extr32_cast128_256(s2,0), \
|
||||
extr32_cast128_256(s1,0), extr32_cast128_256(s0,0) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set_epi32( \
|
||||
extr32_cast128_256(s7,1), extr32_cast128_256(s6,1), \
|
||||
extr32_cast128_256(s5,1), extr32_cast128_256(s4,1), \
|
||||
extr32_cast128_256(s3,1), extr32_cast128_256(s2,1), \
|
||||
extr32_cast128_256(s1,1), extr32_cast128_256(s0,1) ); \
|
||||
casti_m256i( d,2 ) = _mm256_set_epi32( \
|
||||
extr32_cast128_256(s7,2), extr32_cast128_256(s6,2), \
|
||||
extr32_cast128_256(s5,2), extr32_cast128_256(s4,2), \
|
||||
extr32_cast128_256(s3,2), extr32_cast128_256(s2,2), \
|
||||
extr32_cast128_256(s1,2), extr32_cast128_256(s0,2) ); \
|
||||
casti_m256i( d,3 ) = _mm256_set_epi32( \
|
||||
extr32_cast128_256(s7,3), extr32_cast128_256(s6,3), \
|
||||
extr32_cast128_256(s5,3), extr32_cast128_256(s4,3), \
|
||||
extr32_cast128_256(s3,3), extr32_cast128_256(s2,3), \
|
||||
extr32_cast128_256(s1,3), extr32_cast128_256(s0,3) ); \
|
||||
casti_m256i( d,4 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7hi,0), mm128_extr_32(s6hi,0), \
|
||||
mm128_extr_32(s5hi,0), mm128_extr_32(s4hi,0), \
|
||||
mm128_extr_32(s3hi,0), mm128_extr_32(s2hi,0), \
|
||||
mm128_extr_32(s1hi,0), mm128_extr_32(s0hi,0) ); \
|
||||
casti_m256i( d,5 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7hi,1), mm128_extr_32(s6hi,1), \
|
||||
mm128_extr_32(s5hi,1), mm128_extr_32(s4hi,1), \
|
||||
mm128_extr_32(s3hi,1), mm128_extr_32(s2hi,1), \
|
||||
mm128_extr_32(s1hi,1), mm128_extr_32(s0hi,1) ); \
|
||||
casti_m256i( d,6 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7hi,2), mm128_extr_32(s6hi,2), \
|
||||
mm128_extr_32(s5hi,2), mm128_extr_32(s4hi,2), \
|
||||
mm128_extr_32(s3hi,2), mm128_extr_32(s2hi,2), \
|
||||
mm128_extr_32(s1hi,2), mm128_extr_32(s0hi,2) ); \
|
||||
casti_m256i( d,7 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7hi,3), mm128_extr_32(s6hi,3), \
|
||||
mm128_extr_32(s5hi,3), mm128_extr_32(s4hi,3), \
|
||||
mm128_extr_32(s3hi,3), mm128_extr_32(s2hi,3), \
|
||||
mm128_extr_32(s1hi,3), mm128_extr_32(s0hi,3) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_intrlv_8x32_128( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
|
||||
{ \
|
||||
casti_m256i( d,0 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7,0), mm128_extr_32(s6,0), \
|
||||
mm128_extr_32(s5,0), mm128_extr_32(s4,0), \
|
||||
mm128_extr_32(s3,0), mm128_extr_32(s2,0), \
|
||||
mm128_extr_32(s1,0), mm128_extr_32(s0,0) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7,1), mm128_extr_32(s6,1), \
|
||||
mm128_extr_32(s5,1), mm128_extr_32(s4,1), \
|
||||
mm128_extr_32(s3,1), mm128_extr_32(s2,1), \
|
||||
mm128_extr_32(s1,1), mm128_extr_32(s0,1) ); \
|
||||
casti_m256i( d,2 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7,2), mm128_extr_32(s6,2), \
|
||||
mm128_extr_32(s5,2), mm128_extr_32(s4,2), \
|
||||
mm128_extr_32(s3,2), mm128_extr_32(s2,2), \
|
||||
mm128_extr_32(s1,2), mm128_extr_32(s0,2) ); \
|
||||
casti_m256i( d,3 ) = _mm256_set_epi32( \
|
||||
mm128_extr_32(s7,3), mm128_extr_32(s6,3), \
|
||||
mm128_extr_32(s5,3), mm128_extr_32(s4,3), \
|
||||
mm128_extr_32(s3,3), mm128_extr_32(s2,3), \
|
||||
mm128_extr_32(s1,3), mm128_extr_32(s0,3) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_bswap_intrlv_8x32_256( d, src ) \
|
||||
do { \
|
||||
@@ -153,128 +48,6 @@ do { \
|
||||
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 3 ) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_dintrlv_8x32_256( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
|
||||
do { \
|
||||
__m256i s0 = casti_m256i(s,0); \
|
||||
__m256i s1 = casti_m256i(s,1); \
|
||||
__m256i s2 = casti_m256i(s,2); \
|
||||
__m256i s3 = casti_m256i(s,3); \
|
||||
__m256i s4 = casti_m256i(s,4); \
|
||||
__m256i s5 = casti_m256i(s,5); \
|
||||
__m256i s6 = casti_m256i(s,6); \
|
||||
__m256i s7 = casti_m256i(s,7); \
|
||||
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
|
||||
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
|
||||
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
|
||||
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
|
||||
__m128i s4hi = _mm256_extracti128_si256( s4, 1 ); \
|
||||
__m128i s5hi = _mm256_extracti128_si256( s5, 1 ); \
|
||||
__m128i s6hi = _mm256_extracti128_si256( s6, 1 ); \
|
||||
__m128i s7hi = _mm256_extracti128_si256( s7, 1 ); \
|
||||
d0 = _mm256_set_epi32( \
|
||||
extr32_cast128_256( s7, 0 ), extr32_cast128_256( s6, 0 ), \
|
||||
extr32_cast128_256( s5, 0 ), extr32_cast128_256( s4, 0 ), \
|
||||
extr32_cast128_256( s3, 0 ), extr32_cast128_256( s2, 0 ), \
|
||||
extr32_cast128_256( s1, 0 ), extr32_cast128_256( s0, 0 ) );\
|
||||
d1 = _mm256_set_epi32( \
|
||||
extr32_cast128_256( s7, 1 ), extr32_cast128_256( s6, 1 ), \
|
||||
extr32_cast128_256( s5, 1 ), extr32_cast128_256( s4, 1 ), \
|
||||
extr32_cast128_256( s3, 1 ), extr32_cast128_256( s2, 1 ), \
|
||||
extr32_cast128_256( s1, 1 ), extr32_cast128_256( s0, 1 ) );\
|
||||
d2 = _mm256_set_epi32( \
|
||||
extr32_cast128_256( s7, 2 ), extr32_cast128_256( s6, 2 ), \
|
||||
extr32_cast128_256( s5, 2 ), extr32_cast128_256( s4, 2 ), \
|
||||
extr32_cast128_256( s3, 2 ), extr32_cast128_256( s2, 2 ), \
|
||||
extr32_cast128_256( s1, 2 ), extr32_cast128_256( s0, 2 ) );\
|
||||
d3 = _mm256_set_epi32( \
|
||||
extr32_cast128_256( s7, 3 ), extr32_cast128_256( s6, 3 ), \
|
||||
extr32_cast128_256( s5, 3 ), extr32_cast128_256( s4, 3 ), \
|
||||
extr32_cast128_256( s3, 3 ), extr32_cast128_256( s2, 3 ), \
|
||||
extr32_cast128_256( s1, 3 ), extr32_cast128_256( s0, 3 ) );\
|
||||
d4 = _mm256_set_epi32( \
|
||||
_mm_extract_epi32( s7hi, 0 ), _mm_extract_epi32( s6hi, 0 ), \
|
||||
_mm_extract_epi32( s5hi, 0 ), _mm_extract_epi32( s4hi, 0 ), \
|
||||
_mm_extract_epi32( s3hi, 0 ), _mm_extract_epi32( s2hi, 0 ), \
|
||||
_mm_extract_epi32( s1hi, 0 ), _mm_extract_epi32( s0hi, 0 ) ); \
|
||||
d5 = _mm256_set_epi32( \
|
||||
_mm_extract_epi32( s7hi, 1 ), _mm_extract_epi32( s6hi, 1 ), \
|
||||
_mm_extract_epi32( s5hi, 1 ), _mm_extract_epi32( s4hi, 1 ), \
|
||||
_mm_extract_epi32( s3hi, 1 ), _mm_extract_epi32( s2hi, 1 ), \
|
||||
_mm_extract_epi32( s1hi, 1 ), _mm_extract_epi32( s0hi, 1 ) ); \
|
||||
d6 = _mm256_set_epi32( \
|
||||
_mm_extract_epi32( s7hi, 2 ), _mm_extract_epi32( s6hi, 2 ), \
|
||||
_mm_extract_epi32( s5hi, 2 ), _mm_extract_epi32( s4hi, 2 ), \
|
||||
_mm_extract_epi32( s3hi, 2 ), _mm_extract_epi32( s2hi, 2 ), \
|
||||
_mm_extract_epi32( s1hi, 2 ), _mm_extract_epi32( s0hi, 2 ) ); \
|
||||
d7 = _mm256_set_epi32( \
|
||||
_mm_extract_epi32( s7hi, 3 ), _mm_extract_epi32( s6hi, 3 ), \
|
||||
_mm_extract_epi32( s5hi, 3 ), _mm_extract_epi32( s4hi, 3 ), \
|
||||
_mm_extract_epi32( s3hi, 3 ), _mm_extract_epi32( s2hi, 3 ), \
|
||||
_mm_extract_epi32( s1hi, 3 ), _mm_extract_epi32( s0hi, 3 ) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm128_dintrlv_8x32_128( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
|
||||
do { \
|
||||
__m128i s0 = casti_m128i(s,0); \
|
||||
__m128i s1 = casti_m128i(s,1); \
|
||||
__m128i s2 = casti_m128i(s,2); \
|
||||
__m128i s3 = casti_m128i(s,3); \
|
||||
d0 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d1 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 1 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 1 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d2 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d3 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d4 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d5 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d6 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
d7 = _mm_set_epi32( \
|
||||
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
|
||||
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_intrlv_4x64_256( d, s0, s1, s2, s3 ) \
|
||||
do { \
|
||||
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
|
||||
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
|
||||
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
|
||||
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
|
||||
casti_m256i( d,0 ) = _mm256_set_epi64x( \
|
||||
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ), \
|
||||
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set_epi64x( \
|
||||
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ), \
|
||||
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) ); \
|
||||
casti_m256i( d,2 ) = _mm256_set_epi64x( \
|
||||
_mm_extract_epi64( s3hi,0 ), _mm_extract_epi64( s2hi,0 ), \
|
||||
_mm_extract_epi64( s1hi,0 ), _mm_extract_epi64( s0hi,0 ) ); \
|
||||
casti_m256i( d,3 ) = _mm256_set_epi64x( \
|
||||
_mm_extract_epi64( s3hi,1 ), _mm_extract_epi64( s2hi,1 ), \
|
||||
_mm_extract_epi64( s1hi,1 ), _mm_extract_epi64( s0hi,1 ) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_intrlv_4x64_128( d, s0, s1, s2, s3 ) \
|
||||
do { \
|
||||
casti_m256i( d,0 ) = _mm256_set_epi64x( \
|
||||
_mm_extract_epi64( s3, 0 ), _mm_extract_epi64( s2, 0 ), \
|
||||
_mm_extract_epi64( s1, 0 ), _mm_extract_epi64( s0, 0 ) ); \
|
||||
casti_m256i( d,1 ) = _mm256_set_epi64x( \
|
||||
_mm_extract_epi64( s3, 1 ), _mm_extract_epi64( s2, 1 ), \
|
||||
_mm_extract_epi64( s1, 1 ), _mm_extract_epi64( s0, 1 ) ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_bswap_intrlv_4x64_256( d, src ) \
|
||||
do { \
|
||||
__m256i s0 = mm256_bswap_32( src ); \
|
||||
@@ -294,94 +67,6 @@ do { \
|
||||
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 1 ) ); \
|
||||
} while(0)
|
||||
|
||||
// 4 lanes of 256 bits using 64 bit interleaving (standard final hash size)
|
||||
static inline void mm256_dintrlv_4x64_256( void *d0, void *d1, void *d2,
|
||||
void *d3, const int n, const void *src )
|
||||
{
|
||||
__m256i s0 = *( (__m256i*) src ); // s[0][1:0]
|
||||
__m256i s1 = *( (__m256i*)(src+32) ); // s[1][1:0]
|
||||
__m256i s2 = *( (__m256i*)(src+64) ); // s[2][1:0]
|
||||
   __m256i s3 = *( (__m256i*)(src+96) );   // s[3][1:0]
|
||||
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); // s[0][3:2]
|
||||
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); // s[1][3:2]
|
||||
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); // s[2][3:2]
|
||||
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); // s[3][3:2]
|
||||
|
||||
casti_m256i( d0,n ) = _mm256_set_epi64x(
|
||||
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ),
|
||||
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) );
|
||||
casti_m256i( d1,n ) = _mm256_set_epi64x(
|
||||
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ),
|
||||
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) );
|
||||
casti_m256i( d2,n ) = _mm256_set_epi64x(
|
||||
_mm_extract_epi64( s3hi, 0 ), _mm_extract_epi64( s2hi, 0 ),
|
||||
_mm_extract_epi64( s1hi, 0 ), _mm_extract_epi64( s0hi, 0 ) );
|
||||
casti_m256i( d3,n ) = _mm256_set_epi64x(
|
||||
_mm_extract_epi64( s3hi, 1 ), _mm_extract_epi64( s2hi, 1 ),
|
||||
_mm_extract_epi64( s1hi, 1 ), _mm_extract_epi64( s0hi, 1 ) );
|
||||
}
|
||||
|
||||
|
||||
// quarter avx2 block, 16 bytes * 4 lanes
|
||||
// 4 lanes of 128 bits using 64 bit interleaving
|
||||
// Used for last 16 bytes of 80 byte input, only used for testing.
|
||||
static inline void mm128_dintrlv_4x64_128( void *d0, void *d1, void *d2,
|
||||
void *d3, const int n, const void *src )
|
||||
{
|
||||
__m256i s0 = *( (__m256i*) src );
|
||||
__m256i s1 = *( (__m256i*)(src+32) );
|
||||
__m128i s0hi = _mm256_extracti128_si256( s0, 1 );
|
||||
__m128i s1hi = _mm256_extracti128_si256( s1, 1 );
|
||||
|
||||
casti_m128i( d0,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 0 ),
|
||||
extr64_cast128_256( s0 , 0 ) );
|
||||
casti_m128i( d1,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 1 ),
|
||||
extr64_cast128_256( s0 , 1 ) );
|
||||
casti_m128i( d2,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 0 ),
|
||||
_mm_extract_epi64( s0hi, 0 ) );
|
||||
casti_m128i( d3,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 1 ),
|
||||
_mm_extract_epi64( s0hi, 1 ) );
|
||||
}
|
||||
|
||||
/*
|
||||
static inline void mm256_dintrlv_2x128x256( void *d0, void *d1,
|
||||
const int n, const void *s )
|
||||
{
|
||||
casti_m256i( d0,n ) = mm256_get_64( s, 0, 1, 4, 5 );
|
||||
casti_m256i( d1,n ) = mm256_get_64( s, 2, 3, 6, 7 );
|
||||
}
|
||||
*/
|
||||
//
|
||||
|
||||
// Interleave 8 source buffers containing 32 bit data into the destination
|
||||
// vector
|
||||
static inline void mm256_intrlv_8x32( void *d, const void *s0,
|
||||
const void *s1, const void *s2, const void *s3, const void *s4,
|
||||
const void *s5, const void *s6, const void *s7, int bit_len )
|
||||
{
|
||||
mm256_intrlv_8x32_256( d , casti_m256i( s0,0 ), casti_m256i( s1,0 ),
|
||||
casti_m256i( s2,0 ), casti_m256i( s3,0 ), casti_m256i( s4,0 ),
|
||||
casti_m256i( s5,0 ), casti_m256i( s6,0 ), casti_m256i( s7,0 ) );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_intrlv_8x32_256( d+256, casti_m256i( s0,1 ), casti_m256i( s1,1 ),
|
||||
casti_m256i( s2,1 ), casti_m256i( s3,1 ), casti_m256i( s4,1 ),
|
||||
casti_m256i( s5,1 ), casti_m256i( s6,1 ), casti_m256i( s7,1 ) );
|
||||
if ( bit_len <= 512 ) return;
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm256_intrlv_8x32_128( d+512, casti_m128i( s0,4 ), casti_m128i( s1,4 ),
|
||||
casti_m128i( s2,4 ), casti_m128i( s3,4 ), casti_m128i( s4,4 ),
|
||||
casti_m128i( s5,4 ), casti_m128i( s6,4 ), casti_m128i( s7,4 ) );
|
||||
return;
|
||||
}
|
||||
mm256_intrlv_8x32_256( d+512, casti_m256i( s0,2 ), casti_m256i( s1,2 ),
|
||||
casti_m256i( s2,2 ), casti_m256i( s3,2 ), casti_m256i( s4,2 ),
|
||||
casti_m256i( s5,2 ), casti_m256i( s6,2 ), casti_m256i( s7,2 ) );
|
||||
mm256_intrlv_8x32_256( d+768, casti_m256i( s0,3 ), casti_m256i( s1,3 ),
|
||||
casti_m256i( s2,3 ), casti_m256i( s3,3 ), casti_m256i( s4,3 ),
|
||||
casti_m256i( s5,3 ), casti_m256i( s6,3 ), casti_m256i( s7,3 ) );
|
||||
// bit_len == 1024
|
||||
}
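The layout these routines produce is plain word interleaving: 32 bit word i of lane n lands at offset i*8 + n of the destination. A scalar reference model (sketch only, not used by the miner) makes the target layout explicit:

static inline void intrlv_8x32_ref( uint32_t *d, const uint32_t *s0,
         const uint32_t *s1, const uint32_t *s2, const uint32_t *s3,
         const uint32_t *s4, const uint32_t *s5, const uint32_t *s6,
         const uint32_t *s7, const int bit_len )
{
   const uint32_t *s[8] = { s0, s1, s2, s3, s4, s5, s6, s7 };
   for ( int i = 0; i < bit_len/32; i++ )
      for ( int lane = 0; lane < 8; lane++ )
         d[ i*8 + lane ] = s[lane][i];
}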
|
||||
|
||||
// A couple of mining specific functions.
|
||||
|
||||
@@ -393,72 +78,6 @@ static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s )
|
||||
mm256_bswap_intrlv_8x32_128( d+512, casti_m128i( s, 4 ) );
|
||||
}
|
||||
|
||||
// Deinterleave 8 buffers of 32 bit data from the source buffer.
|
||||
// Sub-function can be called directly for 32 byte final hash.
|
||||
static inline void mm256_dintrlv_8x32( void *d0, void *d1, void *d2,
|
||||
void *d3, void *d4, void *d5, void *d6, void *d7,
|
||||
const void *s, int bit_len )
|
||||
{
|
||||
mm256_dintrlv_8x32_256( casti_m256i(d0,0), casti_m256i(d1,0),
|
||||
casti_m256i(d2,0), casti_m256i(d3,0), casti_m256i(d4,0),
|
||||
casti_m256i(d5,0), casti_m256i(d6,0), casti_m256i(d7,0), s );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_dintrlv_8x32_256( casti_m256i(d0,1), casti_m256i(d1,1),
|
||||
casti_m256i(d2,1), casti_m256i(d3,1), casti_m256i(d4,1),
|
||||
casti_m256i(d5,1), casti_m256i(d6,1), casti_m256i(d7,1), s+256 );
|
||||
if ( bit_len <= 512 ) return;
|
||||
// short block, final 16 bytes of input data
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm128_dintrlv_8x32_128( casti_m128i(d0,2), casti_m128i(d1,2),
|
||||
casti_m128i(d2,2), casti_m128i(d3,2), casti_m128i(d4,2),
|
||||
casti_m128i(d5,2), casti_m128i(d6,2), casti_m128i(d7,2), s+512 );
|
||||
return;
|
||||
}
|
||||
// bitlen == 1024
|
||||
mm256_dintrlv_8x32_256( casti_m256i(d0,2), casti_m256i(d1,2),
|
||||
casti_m256i(d2,2), casti_m256i(d3,2), casti_m256i(d4,2),
|
||||
casti_m256i(d5,2), casti_m256i(d6,2), casti_m256i(d7,2), s+512 );
|
||||
mm256_dintrlv_8x32_256( casti_m256i(d0,3), casti_m256i(d1,3),
|
||||
casti_m256i(d2,3), casti_m256i(d3,3), casti_m256i(d4,3),
|
||||
casti_m256i(d5,3), casti_m256i(d6,3), casti_m256i(d7,3), s+768 );
|
||||
}
|
||||
|
||||
static inline void mm256_extract_lane_8x32( void *d, const void *s,
|
||||
const int lane, const int bit_len )
|
||||
{
|
||||
casti_m256i( d,0 ) = mm256_get_32(s, lane , lane+ 8, lane+ 16, lane+ 24,
|
||||
lane+32, lane+ 40, lane+ 48, lane+ 56 );
|
||||
if ( bit_len <= 256 ) return;
|
||||
casti_m256i( d,1 ) = mm256_get_32(s, lane+64, lane+ 72, lane+ 80, lane+ 88,
|
||||
lane+96, lane+104, lane+112, lane+120 );
|
||||
// bit_len == 512
|
||||
}
|
||||
|
||||
// Interleave 4 source buffers containing 64 bit data into the destination
|
||||
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
|
||||
static inline void mm256_intrlv_4x64( void *d, const void *s0,
|
||||
const void *s1, const void *s2, const void *s3, int bit_len )
|
||||
{
|
||||
mm256_intrlv_4x64_256( d , casti_m256i(s0,0), casti_m256i(s1,0),
|
||||
casti_m256i(s2,0), casti_m256i(s3,0) );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_intrlv_4x64_256( d+128, casti_m256i(s0,1), casti_m256i(s1,1),
|
||||
casti_m256i(s2,1), casti_m256i(s3,1) );
|
||||
if ( bit_len <= 512 ) return;
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm256_intrlv_4x64_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
|
||||
casti_m128i(s2,4), casti_m128i(s3,4) );
|
||||
return;
|
||||
}
|
||||
// bit_len == 1024
|
||||
mm256_intrlv_4x64_256( d+256, casti_m256i(s0,2), casti_m256i(s1,2),
|
||||
casti_m256i(s2,2), casti_m256i(s3,2) );
|
||||
mm256_intrlv_4x64_256( d+384, casti_m256i(s0,3), casti_m256i(s1,3),
|
||||
casti_m256i(s2,3), casti_m256i(s3,3) );
|
||||
}
|
||||
|
||||
// Byte swap and interleave 80 bytes of data for 4 lanes using 64 bit interleaving.
|
||||
static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s )
|
||||
{
|
||||
@@ -481,258 +100,5 @@ do { \
|
||||
dst[7] = _mm256_blendv_epi8( a[7], b[7], mask ); \
|
||||
} while(0)
|
||||
|
||||
// Deinterleave 4 buffers of 64 bit data from the source buffer.
|
||||
// bit_len must be 256, 512, 640 or 1024 bits.
|
||||
// Requires overrun padding for 640 bit len.
|
||||
static inline void mm256_dintrlv_4x64( void *d0, void *d1, void *d2,
|
||||
void *d3, const void *s, int bit_len )
|
||||
{
|
||||
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 0, s );
|
||||
if ( bit_len <= 256 ) return;
|
||||
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 1, s+128 );
|
||||
if ( bit_len <= 512 ) return;
|
||||
// short block, final 16 bytes of input data
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
mm128_dintrlv_4x64_128( d0, d1, d2, d3, 4, s+256 );
|
||||
return;
|
||||
}
|
||||
// bit_len == 1024
|
||||
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 2, s+256 );
|
||||
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 3, s+384 );
|
||||
}
|
||||
|
||||
// extract and deinterleave specified lane.
|
||||
#define mm256_extract_lane_4x64_256 \
|
||||
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 )
|
||||
static inline void mm256_extract_lane_4x64( void *d, const void *s,
|
||||
const int lane, const int bit_len )
|
||||
{
|
||||
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 );
|
||||
if ( bit_len <= 256 ) return;
|
||||
casti_m256i( d, 1 ) = mm256_get_64( s, lane+16, lane+20, lane+24, lane+28 );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Convert from 4x32 SSE2 interleaving to 4x64 AVX2.
|
||||
// Can't do it in place
|
||||
static inline void mm256_rintrlv_4x32_4x64( void *dst, void *src,
|
||||
int bit_len )
|
||||
{
|
||||
__m256i* d = (__m256i*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
|
||||
d[0] = _mm256_set_epi32( s[ 7],s[ 3],s[ 6],s[ 2],s[ 5],s[ 1],s[ 4],s[ 0] );
|
||||
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[ 9],s[12],s[ 8] );
|
||||
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
|
||||
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
|
||||
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
|
||||
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
|
||||
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
|
||||
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
|
||||
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
|
||||
|
||||
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
|
||||
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
|
||||
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
|
||||
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
|
||||
// bit_len == 1024
|
||||
}
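This conversion is what lets a 32 bit interleaved stage feed a 64 bit interleaved one without fully deinterleaving in between. Call-site sketch (buffer names hypothetical, 512 bit per-lane hashes):

uint32_t vhash32[16*4] __attribute__ ((aligned (64)));  // 4 lanes x 512 bits in 4x32 order
uint64_t vhash64[ 8*4] __attribute__ ((aligned (64)));  // same data re-packed in 4x64 order
mm256_rintrlv_4x32_4x64( vhash64, vhash32, 512 );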
|
||||
|
||||
// Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
|
||||
// bit_len must be multiple of 64
|
||||
static inline void mm256_rintrlv_4x64_4x32( void *dst, void *src,
|
||||
int bit_len )
|
||||
{
|
||||
__m256i *d = (__m256i*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
|
||||
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
|
||||
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
|
||||
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
|
||||
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
|
||||
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
|
||||
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
|
||||
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
|
||||
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
|
||||
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
|
||||
|
||||
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
|
||||
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
|
||||
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
|
||||
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
static inline void mm256_rintrlv_4x64_2x128( void *dst0, void *dst1,
|
||||
const void *src, int bit_len )
|
||||
{
|
||||
__m256i* d0 = (__m256i*)dst0;
|
||||
__m256i* d1 = (__m256i*)dst1;
|
||||
uint64_t *s = (uint64_t*)src;
|
||||
|
||||
d0[0] = _mm256_set_epi64x( s[ 5], s[ 1], s[ 4], s[ 0] );
|
||||
d1[0] = _mm256_set_epi64x( s[ 7], s[ 3], s[ 6], s[ 2] );
|
||||
|
||||
d0[1] = _mm256_set_epi64x( s[13], s[ 9], s[12], s[ 8] );
|
||||
d1[1] = _mm256_set_epi64x( s[15], s[11], s[14], s[10] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d0[2] = _mm256_set_epi64x( s[21], s[17], s[20], s[16] );
|
||||
d1[2] = _mm256_set_epi64x( s[23], s[19], s[22], s[18] );
|
||||
|
||||
d0[3] = _mm256_set_epi64x( s[29], s[25], s[28], s[24] );
|
||||
d1[3] = _mm256_set_epi64x( s[31], s[27], s[30], s[26] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d0[4] = _mm256_set_epi64x( s[37], s[33], s[36], s[32] );
|
||||
d1[4] = _mm256_set_epi64x( s[39], s[35], s[38], s[34] );
|
||||
|
||||
d0[5] = _mm256_set_epi64x( s[45], s[41], s[44], s[40] );
|
||||
d1[5] = _mm256_set_epi64x( s[47], s[43], s[46], s[42] );
|
||||
|
||||
d0[6] = _mm256_set_epi64x( s[53], s[49], s[52], s[48] );
|
||||
d1[6] = _mm256_set_epi64x( s[55], s[51], s[54], s[50] );
|
||||
|
||||
d0[7] = _mm256_set_epi64x( s[61], s[57], s[60], s[56] );
|
||||
d1[7] = _mm256_set_epi64x( s[63], s[59], s[62], s[58] );
|
||||
}
|
||||
|
||||
static inline void mm256_rintrlv_2x128_4x64( void *dst, const void *src0,
|
||||
const void *src1, int bit_len )
|
||||
{
|
||||
__m256i* d = (__m256i*)dst;
|
||||
uint64_t *s0 = (uint64_t*)src0;
|
||||
uint64_t *s1 = (uint64_t*)src1;
|
||||
|
||||
d[ 0] = _mm256_set_epi64x( s1[2], s1[0], s0[2], s0[0] );
|
||||
d[ 1] = _mm256_set_epi64x( s1[3], s1[1], s0[3], s0[1] );
|
||||
d[ 2] = _mm256_set_epi64x( s1[6], s1[4], s0[6], s0[4] );
|
||||
d[ 3] = _mm256_set_epi64x( s1[7], s1[5], s0[7], s0[5] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[ 4] = _mm256_set_epi64x( s1[10], s1[ 8], s0[10], s0[ 8] );
|
||||
d[ 5] = _mm256_set_epi64x( s1[11], s1[ 9], s0[11], s0[ 9] );
|
||||
d[ 6] = _mm256_set_epi64x( s1[14], s1[12], s0[14], s0[12] );
|
||||
d[ 7] = _mm256_set_epi64x( s1[15], s1[13], s0[15], s0[13] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[ 8] = _mm256_set_epi64x( s1[18], s1[16], s0[18], s0[16] );
|
||||
d[ 9] = _mm256_set_epi64x( s1[19], s1[17], s0[19], s0[17] );
|
||||
d[10] = _mm256_set_epi64x( s1[22], s1[20], s0[22], s0[20] );
|
||||
d[11] = _mm256_set_epi64x( s1[23], s1[21], s0[23], s0[21] );
|
||||
|
||||
d[12] = _mm256_set_epi64x( s1[26], s1[24], s0[26], s0[24] );
|
||||
d[13] = _mm256_set_epi64x( s1[27], s1[25], s0[27], s0[25] );
|
||||
d[14] = _mm256_set_epi64x( s1[30], s1[28], s0[30], s0[28] );
|
||||
d[15] = _mm256_set_epi64x( s1[31], s1[29], s0[31], s0[29] );
|
||||
}
|
||||
|
||||
|
||||
static inline void mm256_intrlv_2x128( const void *d, const void *s0,
|
||||
void *s1, const int bit_len )
|
||||
{
|
||||
__m128i s1hi = _mm256_extracti128_si256( casti_m256i( s1,0), 1 );
|
||||
__m128i s0hi = _mm256_extracti128_si256( casti_m256i( s0,0), 1 );
|
||||
casti_m256i( d,0 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,0 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,0 ) ) );
|
||||
casti_m256i( d,1 ) = mm256_concat_128( s1hi, s0hi );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
s0hi = _mm256_extracti128_si256( casti_m256i( s0,1), 1 );
|
||||
s1hi = _mm256_extracti128_si256( casti_m256i( s1,1), 1 );
|
||||
casti_m256i( d,2 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,1 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,1 ) ) );
|
||||
casti_m256i( d,3 ) = mm256_concat_128( s1hi, s0hi );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
casti_m256i( d,4 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
|
||||
return;
|
||||
}
|
||||
|
||||
s0hi = _mm256_extracti128_si256( casti_m256i( s0,2), 1 );
|
||||
s1hi = _mm256_extracti128_si256( casti_m256i( s1,2), 1 );
|
||||
casti_m256i( d,4 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
|
||||
casti_m256i( d,5 ) = mm256_concat_128( s1hi, s0hi );
|
||||
|
||||
s0hi = _mm256_extracti128_si256( casti_m256i( s0,3), 1 );
|
||||
s1hi = _mm256_extracti128_si256( casti_m256i( s1,3), 1 );
|
||||
casti_m256i( d,6 ) = mm256_concat_128(
|
||||
_mm256_castsi256_si128( casti_m256i( s1,3 ) ),
|
||||
_mm256_castsi256_si128( casti_m256i( s0,3 ) ) );
|
||||
casti_m256i( d,7 ) = mm256_concat_128( s1hi, s0hi );
|
||||
}
|
||||
|
||||
static inline void mm256_dintrlv_2x128( void *dst0, void *dst1, const void *s,
|
||||
int bit_len )
|
||||
{
|
||||
__m256i *d0 = (__m256i*)dst0;
|
||||
__m256i *d1 = (__m256i*)dst1;
|
||||
|
||||
__m256i s0 = casti_m256i( s, 0 );
|
||||
__m256i s1 = casti_m256i( s, 1 );
|
||||
d0[0] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
|
||||
d1[0] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
s0 = casti_m256i( s, 2 );
|
||||
s1 = casti_m256i( s, 3 );
|
||||
d0[1] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
|
||||
d1[1] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
s0 = casti_m256i( s, 4 );
|
||||
s1 = casti_m256i( s, 5 );
|
||||
d0[2] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
|
||||
d1[2] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
|
||||
|
||||
s0 = casti_m256i( s, 6 );
|
||||
s1 = casti_m256i( s, 7 );
|
||||
d0[3] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
|
||||
d1[3] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
|
||||
}
|
||||
|
||||
#undef extr64_cast128_256
|
||||
#undef extr32_cast128_256
|
||||
|
||||
#endif // AVX2
|
||||
#endif // INTRLV_AVX22_H__
|
||||
#endif // INTRLV_AVX2_H__
|
||||
|
@@ -442,7 +442,7 @@ static inline void mm512_dintrlv_16x32( void *d00, void *d01, void *d02,
                         1, src+1024 );
}

static inline void mm512_extract_lane_16x32( void *dst, const void *src,
static inline void mm512_extr_lane_16x32( void *dst, const void *src,
                           const int lane, const int bit_len )
{
   if ( bit_len <= 256 )
@@ -506,7 +506,7 @@ static inline void mm512_dintrlv_8x64( void *d0, void *d1, void *d2,
}

// Extract one lane from 64 bit interleaved data
static inline void mm512_extract_lane_8x64( void *d, const void *s,
static inline void mm512_extr_lane_8x64( void *d, const void *s,
                           const int lane, const int bit_len )
{
   if ( bit_len <= 256 )
@@ -661,7 +661,7 @@ static inline void mm512_rintrlv_4x128_8x64( void *dst, const void *src0,

}

static inline void mm512_extract_lane_4x128( void *d, const void *s,
static inline void mm512_extr_lane_4x128( void *d, const void *s,
                           const int lane, const int bit_len )
{
   int l = lane<<1;

@@ -104,7 +104,7 @@ static inline void mm64_dintrlv_2x32( void *d00, void *d01, const int n,
   casti_m64( d01,15 ) = mm64_get_32( s, 61, 63 );
}

static inline void mm64_extract_lane_2x32( void *d, const void *s,
static inline void mm64_extr_lane_2x32( void *d, const void *s,
                           const int lane, const int bit_len )
{
   casti_m64( d, 0 ) = mm64_get_32( s, lane , lane+ 4 );
simd-utils/intrlv-selector.h (new file, 77 lines)
@@ -0,0 +1,77 @@
#if !defined(INTRLV_SELECTOR_H__)
#define INTRLV_SELECTOR_H__

//////////////////////////////////////////////////////////////
//
// Generic interface for interleaving data for parallel processing.
//
// Best tech is chosen automatically.

/*
#if defined(__AVX512F__)

#define intrlv_4x128     mm512_intrlv_4x128
#define dintrlv_4x128    mm512_dintrlv_4x128

#define intrlv_8x64      mm512_intrlv_8x64
#define dintrlv_8x64     mm512_dintrlv_8x64
#define extr_lane_8x64   mm512_extr_lane_8x64

#define intrlv_16x32     mm512_intrlv_16x32
#define dintrlv_16x32    mm512_dintrlv_16x32
#define extr_lane_16x32  mm512_extr_lane_16x32

#define intrlv_2x128     mm512_intrlv_2x128
#define dintrlv_2x128    mm512_dintrlv_2x128

#define intrlv_4x64      mm512_intrlv_4x64
#define dintrlv_4x64     mm512_dintrlv_4x64
#define extr_lane_4x64   mm512_extr_lane_4x64

#define intrlv_8x32      mm512_intrlv_8x32
#define dintrlv_8x32     mm512_dintrlv_8x32
#define extr_lane_8x32   mm512_extr_lane_8x32

#elif defined(__AVX__)
*/
#if defined(__AVX__)

#define intrlv_2x128     mm256_intrlv_2x128
#define dintrlv_2x128    mm256_dintrlv_2x128

#define intrlv_4x64      mm256_intrlv_4x64
#define dintrlv_4x64     mm256_dintrlv_4x64
#define extr_lane_4x64   mm256_extr_lane_4x64

#define intrlv_8x32      mm256_intrlv_8x32
#define dintrlv_8x32     mm256_dintrlv_8x32
#define extr_lane_8x32   mm256_extr_lane_8x32

#define intrlv_4x32      mm256_intrlv_4x32
#define dintrlv_4x32     mm256_dintrlv_4x32
#define extr_lane_4x32   mm256_extr_lane_4x32

#else

#define intrlv_2x128     mm128_intrlv_2x128
#define dintrlv_2x128    mm128_dintrlv_2x128

#define intrlv_4x64      mm128_intrlv_4x64
#define dintrlv_4x64     mm128_dintrlv_4x64
#define extr_lane_4x64   mm128_extr_lane_4x64

#define intrlv_8x32      mm128_intrlv_8x32
#define dintrlv_8x32     mm128_dintrlv_8x32
#define extr_lane_8x32   mm128_extr_lane_8x32

#define intrlv_2x64      mm128_intrlv_2x64
#define dintrlv_2x64     mm128_dintrlv_2x64
#define extr_lane_2x64   mm128_extr_lane_2x64

#define intrlv_4x32      mm128_intrlv_4x32
#define dintrlv_4x32     mm128_dintrlv_4x32
#define extr_lane_4x32   mm128_extr_lane_4x32

#endif

#endif // INTRLV_SELECTOR_H__
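The payoff is that algorithm code can be written once against the width-generic names and pick up whatever implementation the build supports. A hypothetical 4 way wrapper (sketch only; the hash step is a placeholder, names are made up):

void hash_4way_sketch( void *out, const void *in0, const void *in1,
                       const void *in2, const void *in3 )
{
   uint64_t vin [ 40 ] __attribute__ ((aligned (64)));   // 4 lanes x 80 byte block
   uint64_t vout[ 16 ] __attribute__ ((aligned (64)));   // 4 lanes x 256 bit hash

   intrlv_4x64( vin, in0, in1, in2, in3, 640 );
   // ... 4 way hash from vin to vout goes here ...
   dintrlv_4x64( out, out+32, out+64, out+96, vout, 256 );
}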
@@ -162,7 +162,7 @@ static inline void mm128_dintrlv_4x32( void *d0, void *d1, void *d2,
}

// extract and deinterleave specified lane.
static inline void mm128_extract_lane_4x32( void *d, const void *s,
static inline void mm128_extr_lane_4x32( void *d, const void *s,
                           const int lane, const int bit_len )
{
   casti_m128i( d, 0 ) =
simd-utils/simd-avx.h (new file, 243 lines)
@@ -0,0 +1,243 @@
|
||||
#if !defined(SIMD_AVX_H__)
|
||||
#define SIMD_AVX_H__ 1
|
||||
|
||||
#if defined(__AVX__)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// AVX 256 bit vectors
|
||||
//
|
||||
// Basic support for 256 bit vectors. Most of the good stuff needs AVX2.
|
||||
|
||||
// Compile time vector constants and initializers.
|
||||
//
|
||||
// The following macro constants and functions should only be used
|
||||
// for compile time initialization of constant and variable vector
|
||||
// arrays. These constants use memory, use _mm256_set at run time to
|
||||
// avoid using memory.
|
||||
|
||||
#define mm256_const_64( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
|
||||
#define mm256_const1_64( x ) {{ x,x,x,x }}
|
||||
|
||||
#define mm256_const_32( x7, x6, x5, x4, x3, x2, x1, x0 ) \
|
||||
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
|
||||
#define mm256_const1_32( x ) {{ x,x,x,x, x,x,x,x }}
|
||||
|
||||
#define mm256_const_16( x15, x14, x13, x12, x11, x10, x09, x08, \
|
||||
x07, x06, x05, x04, x03, x02, x01, x00 ) \
|
||||
{{ x15, x14, x13, x12, x11, x10, x09, x08, \
|
||||
x07, x06, x05, x04, x03, x02, x01, x00 }}
|
||||
#define mm256_const1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
||||
|
||||
#define mm256_const_8( x31, x30, x29, x28, x27, x26, x25, x24, \
|
||||
x23, x22, x21, x20, x19, x18, x17, x16, \
|
||||
x15, x14, x13, x12, x11, x10, x09, x08, \
|
||||
x07, x06, x05, x04, x03, x02, x01, x00 ) \
|
||||
{{ x31, x30, x29, x28, x27, x26, x25, x24, \
|
||||
x23, x22, x21, x20, x19, x18, x17, x16, \
|
||||
x15, x14, x13, x12, x11, x10, x09, x08, \
|
||||
x07, x06, x05, x04, x03, x02, x01, x00 }}
|
||||
#define mm256_const1_8( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
|
||||
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
||||
|
||||
// Predefined compile time constant vectors.
|
||||
// Use Pseudo constants at run time for all simple constant vectors.
|
||||
#define c256_zero mm256_const1_64( 0ULL )
|
||||
#define c256_one_256 mm256_const_64( 0ULL, 0ULL, 0ULL, 1ULL )
|
||||
#define c256_one_128 mm256_const_64( 0ULL, 1ULL, 0ULL, 1ULL )
|
||||
#define c256_one_64 mm256_const1_64( 1ULL )
|
||||
#define c256_one_32 mm256_const1_32( 1UL )
|
||||
#define c256_one_16 mm256_const1_16( 1U )
|
||||
#define c256_one_8 mm256_const1_8( 1U )
|
||||
#define c256_neg1 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
|
||||
#define c256_neg1_64 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
|
||||
#define c256_neg1_32 mm256_const1_32( 0xFFFFFFFFUL )
|
||||
#define c256_neg1_16 mm256_const1_16( 0xFFFFU )
|
||||
#define c256_neg1_8 mm256_const1_8( 0xFFU )
|
||||
|
||||
//
|
||||
// Pseudo constants.
|
||||
// These can't be used for compile time initialization but are preferable
|
||||
// for simple constant vectors at run time.
|
||||
|
||||
#define m256_zero _mm256_setzero_si256()
|
||||
#define m256_one_256 _mm256_set_epi64x( 0ULL, 0ULL, 0ULL, 1ULL )
|
||||
#define m256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
|
||||
#define m256_one_64 _mm256_set1_epi64x( 1ULL )
|
||||
#define m256_one_32 _mm256_set1_epi32( 1UL )
|
||||
#define m256_one_16 _mm256_set1_epi16( 1U )
|
||||
#define m256_one_8 _mm256_set1_epi8( 1U )
|
||||
#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
|
||||
|
||||
//
|
||||
// Vector size conversion.
|
||||
//
|
||||
// Allows operations on either or both halves of a 256 bit vector serially.
|
||||
// Handy for parallel AES.
|
||||
// Caveats:
|
||||
// _mm256_castsi256_si128 is free and without side effects.
|
||||
// _mm256_castsi128_si256 is also free but leaves the high half
|
||||
// undefined. That's ok if the hi half will be subsequently assigned.
// If assigning both, do lo first. If assigning only 1, use
|
||||
// _mm256_inserti128_si256.
|
||||
//
|
||||
// What to do about extractf128 (AVX) and extracti128 (AVX2)?
|
||||
#define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a )
|
||||
#define mm128_extr_hi128_256( a ) _mm256_extractf128_si256( a, 1 )
|
||||
|
||||
// Extract 4 u64 from 256 bit vector.
|
||||
#define mm256_extr_4x64( a0, a1, a2, a3, src ) \
|
||||
do { \
|
||||
__m128i hi = _mm256_extractf128_si256( src, 1 ); \
|
||||
a0 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 0 ); \
|
||||
a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \
|
||||
a2 = _mm_extract_epi64( hi, 0 ); \
|
||||
a3 = _mm_extract_epi64( hi, 1 ); \
|
||||
} while(0)
|
||||
|
||||
#define mm256_extr_8x32( a0, a1, a2, a3, a4, a5, a6, a7, src ) \
|
||||
do { \
|
||||
__m128i hi = _mm256_extractf128_si256( src, 1 ); \
|
||||
a0 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 0 ); \
|
||||
a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \
|
||||
a2 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 2 ); \
|
||||
a3 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 3 ); \
|
||||
a4 = _mm_extract_epi32( hi, 0 ); \
|
||||
a5 = _mm_extract_epi32( hi, 1 ); \
|
||||
a6 = _mm_extract_epi32( hi, 2 ); \
|
||||
a7 = _mm_extract_epi32( hi, 3 ); \
|
||||
} while(0)
|
||||
|
||||
// input __m128i, returns __m256i
|
||||
// To build a 256 bit vector from 2 128 bit vectors lo must be done first.
|
||||
// lo alone leaves hi undefined, hi alone leaves lo unchanged.
|
||||
// Both cost one clock while preserving the other half.
|
||||
// Insert b into specified half of a leaving other half of a unchanged.
|
||||
#define mm256_ins_lo128_256( a, b ) _mm256_insertf128_si256( a, b, 0 )
|
||||
#define mm256_ins_hi128_256( a, b ) _mm256_insertf128_si256( a, b, 1 )
|
||||
|
||||
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
|
||||
#define mm256_concat_128( hi, lo ) \
|
||||
mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi )
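Round tripping through the halves with the macros above looks like this (sketch, assuming an AVX build):

__m256i v  = _mm256_set1_epi64x( 0x0123456789abcdefULL );
__m128i lo = mm128_extr_lo128_256( v );      // free cast
__m128i hi = mm128_extr_hi128_256( v );      // one extract
__m256i r  = mm256_concat_128( hi, lo );     // rebuild: lo first, then insert hi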
|
||||
|
||||
// Horizontal vector testing
|
||||
|
||||
// Needs int128 support
|
||||
// Bit-wise test of entire vector, useful to test results of cmp.
|
||||
#define mm256_anybits0( a ) \
|
||||
( (uint128_t)mm128_extr_hi128_256( a ) \
|
||||
| (uint128_t)mm128_extr_lo128_256( a ) )
|
||||
|
||||
#define mm256_anybits1( a ) \
|
||||
( ( (uint128_t)mm128_extr_hi128_256( a ) + 1 ) \
|
||||
| ( (uint128_t)mm128_extr_lo128_256( a ) + 1 ) )
|
||||
|
||||
#define mm256_allbits0_256( a ) ( !mm256_anybits1(a) )
|
||||
#define mm256_allbits1_256( a ) ( !mm256_anybits0(a) )
|
||||
|
||||
// Parallel AES, for when x is expected to be in a 256 bit register.
|
||||
#define mm256_aesenc_2x128( x ) \
|
||||
mm256_concat_128( \
|
||||
_mm_aesenc_si128( mm128_extr_hi128_256( x ), m128_zero ), \
|
||||
_mm_aesenc_si128( mm128_extr_lo128_256( x ), m128_zero ) )
|
||||
|
||||
#define mm256_aesenckey_2x128( x, k ) \
   mm256_concat_128( \
        _mm_aesenc_si128( mm128_extr_hi128_256( x ), \
                          mm128_extr_hi128_256( k ) ), \
        _mm_aesenc_si128( mm128_extr_lo128_256( x ), \
                          mm128_extr_lo128_256( k ) ) )
|
||||
|
||||
#define mm256_paesenc_2x128( y, x ) do \
{ \
  __m128i *X = (__m128i*)x; \
  __m128i *Y = (__m128i*)y; \
  Y[0] = _mm_aesenc_si128( X[0], m128_zero ); \
  Y[1] = _mm_aesenc_si128( X[1], m128_zero ); \
} while(0);

// With pointers.
#define mm256_paesenckey_2x128( y, x, k ) do \
{ \
  __m128i *X = (__m128i*)x; \
  __m128i *Y = (__m128i*)y; \
  __m128i *K = (__m128i*)k; \
  Y[0] = _mm_aesenc_si128( X[0], K[0] ); \
  Y[1] = _mm_aesenc_si128( X[1], K[1] ); \
} while(0);
|
||||
|
||||
//
|
||||
// Pointer casting
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns p as pointer to vector type, not very useful
|
||||
#define castp_m256i(p) ((__m256i*)(p))
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns *p, watch your pointer arithmetic
|
||||
#define cast_m256i(p) (*((__m256i*)(p)))
|
||||
|
||||
// p = any aligned pointer, i = scaled array index
|
||||
// returns value p[i]
|
||||
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
|
||||
|
||||
// p = any aligned pointer, o = scaled offset
|
||||
// returns pointer p+o
|
||||
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
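These casts are how the interleave and hash code indexes 32 byte slots inside plain byte buffers, for example (sketch):

uint32_t vhash[16] __attribute__ ((aligned (32)));     // two 256 bit slots
casti_m256i( vhash, 0 ) = m256_zero;                   // clear slot 0
casti_m256i( vhash, 1 ) = casti_m256i( vhash, 0 );     // copy slot 0 into slot 1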
|
||||
|
||||
|
||||
// Gather scatter
|
||||
|
||||
#define mm256_gather_64( d, s0, s1, s2, s3 ) \
|
||||
((uint64_t*)(d))[0] = (uint64_t)(s0); \
|
||||
((uint64_t*)(d))[1] = (uint64_t)(s1); \
|
||||
((uint64_t*)(d))[2] = (uint64_t)(s2); \
|
||||
((uint64_t*)(d))[3] = (uint64_t)(s3);
|
||||
|
||||
#define mm256_gather_32( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
|
||||
((uint32_t*)(d))[0] = (uint32_t)(s0); \
|
||||
((uint32_t*)(d))[1] = (uint32_t)(s1); \
|
||||
((uint32_t*)(d))[2] = (uint32_t)(s2); \
|
||||
((uint32_t*)(d))[3] = (uint32_t)(s3); \
|
||||
((uint32_t*)(d))[4] = (uint32_t)(s4); \
|
||||
((uint32_t*)(d))[5] = (uint32_t)(s5); \
|
||||
((uint32_t*)(d))[6] = (uint32_t)(s6); \
|
||||
((uint32_t*)(d))[7] = (uint32_t)(s7);
|
||||
|
||||
|
||||
// Scatter data from contiguous memory.
|
||||
// All arguments are pointers
|
||||
#define mm256_scatter_64( d0, d1, d2, d3, s ) \
|
||||
*((uint64_t*)(d0)) = ((uint64_t*)(s))[0]; \
|
||||
*((uint64_t*)(d1)) = ((uint64_t*)(s))[1]; \
|
||||
*((uint64_t*)(d2)) = ((uint64_t*)(s))[2]; \
|
||||
*((uint64_t*)(d3)) = ((uint64_t*)(s))[3];
|
||||
|
||||
#define mm256_scatter_32( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
|
||||
*((uint32_t*)(d0)) = ((uint32_t*)(s))[0]; \
|
||||
*((uint32_t*)(d1)) = ((uint32_t*)(s))[1]; \
|
||||
*((uint32_t*)(d2)) = ((uint32_t*)(s))[2]; \
|
||||
*((uint32_t*)(d3)) = ((uint32_t*)(s))[3]; \
|
||||
*((uint32_t*)(d4)) = ((uint32_t*)(s))[4]; \
|
||||
*((uint32_t*)(d5)) = ((uint32_t*)(s))[5]; \
|
||||
*((uint32_t*)(d6)) = ((uint32_t*)(s))[6]; \
|
||||
*((uint32_t*)(d7)) = ((uint32_t*)(s))[7];
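Despite the names these are scalar moves, not SIMD gather instructions: a gather packs four 64 bit values into one contiguous 32 byte block and a scatter spreads a block back out. Sketch with hypothetical buffers:

uint64_t lane_hash[4][8];     // per-lane hash words
uint64_t block[4];            // one contiguous 32 byte block

mm256_gather_64( block, lane_hash[0][0], lane_hash[1][0],
                        lane_hash[2][0], lane_hash[3][0] );
mm256_scatter_64( &lane_hash[0][0], &lane_hash[1][0],
                  &lane_hash[2][0], &lane_hash[3][0], block );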
|
||||
|
||||
|
||||
//
|
||||
// Memory functions
|
||||
// n = number of 256 bit (32 byte) vectors
|
||||
|
||||
static inline void memset_zero_256( __m256i *dst, int n )
|
||||
{ for ( int i = 0; i < n; i++ ) dst[i] = m256_zero; }
|
||||
|
||||
static inline void memset_256( __m256i *dst, const __m256i a, int n )
|
||||
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
|
||||
|
||||
static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
|
||||
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
|
||||
|
||||
|
||||
#endif // __AVX__
|
||||
#endif // SIMD_AVX_H__
|
||||
|
@@ -17,66 +17,6 @@
|
||||
// Constants of these types reside in memory.
|
||||
|
||||
|
||||
// Compile time vector constants and initializers.
|
||||
//
|
||||
// The following macro constants and functions should only be used
|
||||
// for compile time initialization of constant and variable vector
|
||||
// arrays. These constants use memory, use _mm256_set at run time to
|
||||
// avoid using memory.
|
||||
|
||||
#define mm256_const_64( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
|
||||
#define mm256_const1_64( x ) {{ x,x,x,x }}
|
||||
|
||||
#define mm256_const_32( x7, x6, x5, x4, x3, x2, x1, x0 ) \
|
||||
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
|
||||
#define mm256_const1_32( x ) {{ x,x,x,x, x,x,x,x }}
|
||||
|
||||
#define mm256_const_16( x15, x14, x13, x12, x11, x10, x09, x08, \
|
||||
x07, x06, x05, x04, x03, x02, x01, x00 ) \
|
||||
{{ x15, x14, x13, x12, x11, x10, x09, x08, \
|
||||
x07, x06, x05, x04, x03, x02, x01, x00 }}
|
||||
#define mm256_const1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
||||
|
||||
#define mm256_const_8( x31, x30, x29, x28, x27, x26, x25, x24, \
|
||||
x23, x22, x21, x20, x19, x18, x17, x16, \
|
||||
x15, x14, x13, x12, x11, x10, x09, x08, \
|
||||
x07, x06, x05, x04, x03, x02, x01, x00 ) \
|
||||
{{ x31, x30, x29, x28, x27, x26, x25, x24, \
|
||||
x23, x22, x21, x20, x19, x18, x17, x16, \
|
||||
x15, x14, x13, x12, x11, x10, x09, x08, \
|
||||
x07, x06, x05, x04, x03, x02, x01, x00 }}
|
||||
#define mm256_const1_8( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
|
||||
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
||||
|
||||
// Predefined compile time constant vectors.
|
||||
// Use Pseudo constants at run time for all simple constant vectors.
|
||||
#define c256_zero mm256_const1_64( 0ULL )
|
||||
#define c256_one_256 mm256_const_64( 0ULL, 0ULL, 0ULL, 1ULL )
|
||||
#define c256_one_128 mm256_const_64( 0ULL, 1ULL, 0ULL, 1ULL )
|
||||
#define c256_one_64 mm256_const1_64( 1ULL )
|
||||
#define c256_one_32 mm256_const1_32( 1UL )
|
||||
#define c256_one_16 mm256_const1_16( 1U )
|
||||
#define c256_one_8 mm256_const1_8( 1U )
|
||||
#define c256_neg1 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
|
||||
#define c256_neg1_64 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
|
||||
#define c256_neg1_32 mm256_const1_32( 0xFFFFFFFFUL )
|
||||
#define c256_neg1_16 mm256_const1_16( 0xFFFFU )
|
||||
#define c256_neg1_8 mm256_const1_8( 0xFFU )
|
||||
|
||||
//
|
||||
// Pseudo constants.
|
||||
// These can't be used for compile time initialization but are preferable
|
||||
// for simple constant vectors at run time.
|
||||
|
||||
#define m256_zero _mm256_setzero_si256()
|
||||
#define m256_one_256 _mm256_set_epi64x( 0ULL, 0ULL, 0ULL, 1ULL )
|
||||
#define m256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
|
||||
#define m256_one_64 _mm256_set1_epi64x( 1ULL )
|
||||
#define m256_one_32 _mm256_set1_epi32( 1UL )
|
||||
#define m256_one_16 _mm256_set1_epi16( 1U )
|
||||
#define m256_one_8 _mm256_set1_epi8( 1U )
|
||||
#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
|
||||
|
||||
//
|
||||
// Basic operations without SIMD equivalent
|
||||
|
||||
@@ -88,6 +28,11 @@
|
||||
#define mm256_negate_32( a ) _mm256_sub_epi32( m256_zero, a )
|
||||
#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a )
|
||||
|
||||
/***************************
|
||||
*
|
||||
* extracti128 (AVX2) vs extractf128 (AVX)???
|
||||
|
||||
|
||||
//
|
||||
// Vector size conversion.
|
||||
//
|
||||
@@ -133,7 +78,9 @@ do { \
|
||||
// Insert b into specified half of a leaving other half of a unchanged.
|
||||
#define mm256_ins_lo128_256( a, b ) _mm256_inserti128_si256( a, b, 0 )
|
||||
#define mm256_ins_hi128_256( a, b ) _mm256_inserti128_si256( a, b, 1 )
|
||||
*/
|
||||
|
||||
/*
|
||||
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
|
||||
#define mm256_concat_128( hi, lo ) \
|
||||
mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi )
|
||||
@@ -254,6 +201,8 @@ static inline void memset_256( __m256i *dst, const __m256i a, int n )
|
||||
static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
|
||||
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
|
||||
|
||||
*************************************/
|
||||
|
||||
//
|
||||
// Bit rotations.
|
||||
//
|
||||
|
@@ -14,11 +14,13 @@
#ifndef WIN32

#define HWMON_PATH \
 "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
 "/sys/class/hwmon/hwmon2/temp1_input"
#define HWMON_ALT \
 "/sys/class/hwmon/hwmon1/temp1_input"
#define HWMON_ALT2 \
 "/sys/class/hwmon/hwmon0/temp1_input"
#define HWMON_ALT1 \
 "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
#define HWMON_ALT2 \
 "/sys/class/hwmon/hwmon1/temp1_input"
#define HWMON_ALT3 \
 "/sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input"
#define HWMON_ALT4 \
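Only one of these paths needs to exist on a given system; a reader that tries them in order until one opens and parses is sufficient. Sketch (hypothetical helper, not the actual sysinfos.c code; assumes <stdio.h>; the hwmon files report millidegrees C):

static float cpu_temp_sketch( void )
{
   static const char* paths[] = { HWMON_PATH, HWMON_ALT, HWMON_ALT1,
                                  HWMON_ALT2, HWMON_ALT3, HWMON_ALT4 };
   for ( size_t i = 0; i < sizeof(paths)/sizeof(paths[0]); i++ )
   {
      FILE *fp = fopen( paths[i], "r" );
      if ( !fp ) continue;
      long milli = 0;
      int n = fscanf( fp, "%ld", &milli );
      fclose( fp );
      if ( n == 1 ) return (float)milli / 1000.0f;
   }
   return 0.0f;
}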
util.c (5 changed lines)
@@ -1631,7 +1631,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
      hashrate += thr_hashrates[i];
   pthread_mutex_unlock(&stats_lock);
   double diff = trunc( ( ((double)0xffffffff) / target ) );
   if ( opt_showdiff )
   if ( !opt_quiet )
      // xmr pool diff can change a lot...
      applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
   stratum_diff = diff;
@@ -1813,7 +1813,8 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
   /* store for api stats */
   stratum_diff = diff;

   applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
   if ( !opt_quiet )
      applog(LOG_BLUE, "Stratum difficulty set to %g", diff);

   return true;
}