This commit is contained in:
Jay D Dee
2019-07-02 15:10:38 -04:00
parent 0d48d573ce
commit 0d769ee0fe
53 changed files with 1755 additions and 1170 deletions

View File

@@ -38,9 +38,31 @@ supported.
Change Log
----------
v3.9.5.1
Fixed skein2 crash on Windows.
Fixed CPU temperature reading on Ubuntu 19.04.
Realigned log message colours: blue is used to report normal activity and
yellow is only used to report abnormal activity.
Changed stats colours: yellow now means below average, white is the average
range. Tweaked colour thresholds.
Changed colour of stratum difficulty change messages to blue to match other
normal protocol messages. Blue messages (block, stratum, submit) will no
longer be displayed when using the -q option.
Added the job id to new block, share submit, and share result messages, and
added a new message when a new job is received for an existing block. This
allows better troubleshooting of the invalid job id rejects seen at zergpool.
Some more restructuring.
v3.9.5
New share reporting information includes calculation of equivalent hhashrate
New share reporting information includes calculation of equivalent hashrate
based on share difficulty, network latency, 5 minute summary.
Per-thread hash rate reports are disabled by default.
New command line option --hash-meter added to enable per-thread hash rates.
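
For reference, the equivalent hashrate reported per share is derived from the share
difficulty and the time since the previous submit, using the diff2hash constant
(0x40000000) that appears in cpu-miner.c later in this commit. A minimal sketch of
that arithmetic; the helper name is illustrative, not the project's exact code path.

#include <stdint.h>

// One share at difficulty D represents roughly D * 2^30 hashes
// (diff2hash = 0x40000000ULL in cpu-miner.c).  Dividing by the time since
// the previous submit gives the equivalent hashrate for that share.
static double share_equiv_hashrate( double share_diff, double share_time_sec )
{
   const double diff2hash = (double)0x40000000ULL;
   if ( share_time_sec <= 0. ) return 0.;
   return share_diff * diff2hash / share_time_sec;
}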

View File

@@ -368,9 +368,9 @@ bool submit_solution( struct work *work, void *hash,
if ( submit_work( thr, work ) )
{
if ( !opt_quiet )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
accepted_share_count + rejected_share_count + 1,
thr->id );
thr->id, work->job_id );
return true;
}
else
@@ -385,9 +385,12 @@ bool submit_lane_solution( struct work *work, void *hash,
if ( submit_work( thr, work ) )
{
if ( !opt_quiet )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
// applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d.",
// accepted_share_count + rejected_share_count + 1,
// thr->id, lane );
applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d, job %s.",
accepted_share_count + rejected_share_count + 1, thr->id,
lane );
lane, work->job_id );
return true;
}
else

View File

@@ -15,7 +15,7 @@ void blakehash_4way(void *state, const void *input)
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blake_4way( struct work *work, uint32_t max_nonce,

View File

@@ -83,7 +83,7 @@ void blake2s_4way_hash( void *output, const void *input )
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
mm128_dintrlv_4x32( output, output+32, output+64, output+96,
dintrlv_4x32( output, output+32, output+64, output+96,
vhash, 256 );
}

View File

@@ -17,7 +17,7 @@ void blakecoin_4way_hash(void *state, const void *input)
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,

View File

@@ -23,7 +23,7 @@ void decred_hash_4way( void *state, const void *input )
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,

View File

@@ -33,7 +33,7 @@ void myriad_4way_hash( void *output, const void *input )
myrgr_4way_ctx_holder ctx;
memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );
dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
@@ -43,7 +43,7 @@ void myriad_4way_hash( void *output, const void *input )
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, output );
@@ -89,7 +89,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
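
Many of the hunks in this commit simply rename the interleave helpers
(mm128_intrlv_4x32 to intrlv_4x32, mm128_dintrlv_4x32 to dintrlv_4x32,
mm128_extract_lane_4x32 to extr_lane_4x32, and so on). For readers following the
renames, a plain-C sketch of the assumed 4x32 lane layout; the _ref names are
illustrative and not the optimized implementations.

#include <stdint.h>

// 4-way 32-bit interleave: word j of lane i is stored at dst[ j*4 + i ].
// bit_len is the length of one lane in bits (256, 512, 640, ...).
static void intrlv_4x32_ref( uint32_t *dst, const uint32_t *s0,
                const uint32_t *s1, const uint32_t *s2, const uint32_t *s3,
                int bit_len )
{
   for ( int j = 0; j < bit_len/32; j++ )
   {
      dst[ j*4   ] = s0[j];
      dst[ j*4+1 ] = s1[j];
      dst[ j*4+2 ] = s2[j];
      dst[ j*4+3 ] = s3[j];
   }
}

// Pull one lane back out, e.g. to run fulltest() on a candidate hash.
static void extr_lane_4x32_ref( uint32_t *dst, const uint32_t *src,
                                int lane, int bit_len )
{
   for ( int j = 0; j < bit_len/32; j++ )
      dst[j] = src[ j*4 + lane ];
}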

View File

@@ -89,7 +89,7 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -143,7 +143,7 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
// && fulltest( hash+(i<<3), ptarget ) )
for ( int i = 0; i < 4; i++ ) if ( !( (hash7[i] & mask ) == 0 ) )
{
mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
mm256_extr_lane_4x64( lane_hash, hash, i, 256 );
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
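
Another recurring change is adding __attribute__ ((aligned (32))) to the lane_hash
buffers. A hedged guess at the reason, shown with a plain intrinsic: if the lane
extract writes its result with a single aligned 256-bit store, an unaligned
destination can fault, so the destination buffer is forced to 32-byte alignment.

#include <immintrin.h>
#include <stdint.h>

// Illustration only: an aligned 256-bit store into lane_hash requires the
// destination to be 32-byte aligned, hence the added attribute.
static void store_lane_256( uint32_t *dst, __m256i lane )
{
   _mm256_store_si256( (__m256i*)dst, lane );  // faults if dst is misaligned
}

// uint32_t lane_hash[8] __attribute__ ((aligned (32)));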

View File

@@ -21,8 +21,8 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
{
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -41,7 +41,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
{
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;

View File

@@ -36,7 +36,7 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
16, 16, 16 );

View File

@@ -78,7 +78,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
@@ -90,7 +90,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -116,7 +116,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -92,7 +92,7 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -115,7 +115,7 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
@@ -161,7 +161,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -181,7 +181,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
@@ -192,7 +192,7 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -214,7 +214,7 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -36,7 +36,7 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );

View File

@@ -168,7 +168,7 @@ int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
uint32_t _ALIGN(128) edata[36];
uint32_t vdata[4][36] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -217,7 +217,7 @@ int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[ lane<<1 ] < Htarg )
{
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -207,6 +207,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
SHA512_Update( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );
@@ -222,18 +223,18 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );
mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
mpz_set(bns1, bns0);
mpz_set(product, bns0);
for ( i=1; i < 7; i++ )
mpz_set(product, bns0);
for ( i=1; i < 7; i++ )
{
mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
mpz_add(bns1, bns1, bns0);
mpz_mul(product, product, bns0);
mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
mpz_add(bns1, bns1, bns0);
mpz_mul(product, product, bns0);
}
mpz_mul(product, product, bns1);
mpz_mul(product, product, product);
mpz_mul(product, product, product);
bytes = mpz_sizeinbase(product, 256);
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
@@ -243,27 +244,27 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
mpf_set_prec_raw(magifpi, prec);
mpf_set_prec_raw(mptmp, prec);
mpf_set_prec_raw(mpt1, prec);
mpf_set_prec_raw(mpt2, prec);
mpf_set_prec_raw(magifpi, prec);
mpf_set_prec_raw(mptmp, prec);
mpf_set_prec_raw(mpt1, prec);
mpf_set_prec_raw(mpt2, prec);
usw_ = sw2_(n/2);
mpzscale = 1;
mpzscale = 1;
mpz_set_ui(magisw, usw_);
for ( i = 0; i < 5; i++ )
{
mpf_set_d(mpt1, 0.25*mpzscale);
mpf_sub(mpt1, mpt1, mpt2);
mpf_sub(mpt1, mpt1, mpt2);
mpf_abs(mpt1, mpt1);
mpf_div(magifpi, magifpi0, mpt1);
mpf_pow_ui(mptmp, mpten, digits >> 1);
mpf_mul(magifpi, magifpi, mptmp);
mpz_set_f(magipi, magifpi);
mpz_set_f(magipi, magifpi);
mpz_add(magipi,magipi,magisw);
mpz_add(product,product,magipi);
mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
mpz_add(bns1, bns1, bns0);
mpz_mul(product,product,bns1);
mpz_cdiv_q (product, product, bns0);
@@ -275,18 +276,18 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
SHA256_Init( &ctxf_sha256 );
SHA256_Update( &ctxf_sha256, bdata, bytes );
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
}
}
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for ( i = 31; i >= 0; i-- )
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for ( i = 31; i >= 0; i-- )
{
if ( hash_[i] != target_[i] )
{
rc = hash_[i] < target_[i];
break;
}
}
if ( hash_[i] != target_[i] )
{
rc = hash_[i] < target_[i];
break;
}
}
if ( unlikely(rc) )
{
if ( opt_debug )
@@ -299,15 +300,15 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
hash_str,
target_str);
}
work_set_target_ratio( work, hash );
pdata[19] = data[19];
goto out;
}
submit_solution( work, hash, mythr );
}
} while (n < max_nonce && !work_restart[thr_id].restart);
pdata[19] = n;
out:
// can this be skipped after finding a share? Seems to work ok.
//out:
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);
mpf_set_prec_raw(mptmp, prec0);

View File

@@ -70,7 +70,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
@@ -122,7 +122,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -575,7 +575,7 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
// uint32_t *hash7 = &(hash[25]);
// uint32_t lane_hash[8];
// uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;

View File

@@ -191,7 +191,7 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
for ( int i = 0; i < 4; i++ )
if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
{
mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
mm256_extr_lane_4x64( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;

View File

@@ -118,7 +118,7 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
for ( int i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hash, i, 256 );
mm256_extr_lane_8x32( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[27] = n + i;

View File

@@ -36,6 +36,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -79,8 +80,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
if ( !( hash7[ lane ] & mask ) )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
@@ -130,7 +130,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -168,7 +168,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{

View File

@@ -99,7 +99,7 @@ int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
mm256_extr_lane_8x32( lane_hash, hashx, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + i;
@@ -111,7 +111,7 @@ int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
{
mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
mm64_extr_lane_2x32( lane_hash, hashy, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + 8 + i;
@@ -204,7 +204,7 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
if ( !( hash7[ lane ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
@@ -287,7 +287,7 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -48,7 +48,7 @@ void skeinhash_4way( void *state, const void *input )
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
#else
mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
@@ -63,7 +63,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -84,7 +84,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;

View File

@@ -23,29 +23,41 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
// __m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t *noncep = vdata + 73; // 9*8 + 1
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
// *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
// _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
skein2hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
uint32_t lane_hash[8];
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
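
The skein2 fix above drops the vectorised header setup in favour of a scalar path:
byte-swap the 80-byte header, interleave it 4x64, then poke each lane's nonce in
with be32enc. The magic offset 73 comes straight from the layout, as sketched below
(the helper name is illustrative):

#include <stdint.h>

// In a 4-lane x 64-bit interleave, 64-bit word w of lane l sits at 32-bit
// index (w*4 + l)*2.  The nonce is 32-bit word 19 of the header, the high
// half of 64-bit word 9, so lane 0's nonce is at 9*8 + 1 = 73 and each
// further lane is 2 words along -- matching vdata + 73 and noncep+2/+4/+6.
static inline uint32_t *nonce_ptr_4x64( uint32_t *vdata, int lane )
{
   return vdata + 9*8 + 1 + 2*lane;
}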

View File

@@ -171,18 +171,14 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
@@ -190,10 +186,8 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
c11_4way_hash( hash, vdata );
pdata[19] = n;
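
The c11/x11/tribus style loops go the other way and replace the four be32enc calls
with one vector update of *noncev. A rough illustration with plain AVX2 intrinsics
of what the mm256_bswap_32 plus mm256_intrlv_blend_32 combination is assumed to
accomplish; these are not the project's helpers, just standard intrinsics showing
the idea.

#include <immintrin.h>
#include <stdint.h>

// Byte-swap each 32-bit element (big-endian encode).
static inline __m256i bswap_32x8( __m256i v )
{
   const __m256i idx = _mm256_set_epi8(
        12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3,
        12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3 );
   return _mm256_shuffle_epi8( v, idx );
}

// Write bswap32(n), bswap32(n+1), ... into the nonce (odd) 32-bit positions
// of __m256i slot 9 of the 4x64-interleaved header, keeping word 18 intact.
static inline void set_nonces_4x64( uint32_t *vdata, uint32_t n )
{
   __m256i *noncev = (__m256i*)vdata + 9;
   __m256i nn = _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 );
   *noncev = _mm256_blend_epi32( *noncev, bswap_32x8( nn ), 0xAA );
}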

View File

@@ -64,13 +64,12 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0,
@@ -87,14 +86,7 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
0xFFFF0000,
0 };
// we need bigendian data...
for ( int i = 0; i < 20; i++ )
{
be32enc( &endiandata[i], pdata[i] );
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
// precalc midstate
// doing it one way then interleaving would be faster but too
@@ -108,10 +100,8 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
{
uint32_t mask = masks[m];
do {
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
tribus_hash_4way( hash, vdata );

View File

@@ -170,18 +170,14 @@ int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
@@ -189,10 +185,8 @@ int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x11_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -171,24 +171,19 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
@@ -196,10 +191,8 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x11gost_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -78,29 +78,23 @@ int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
((uint32_t*)ptarget)[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
skunk_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -189,12 +189,11 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -202,11 +201,7 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
@@ -214,10 +209,8 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x13_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -166,7 +166,7 @@ void x13sm3_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
// SM3 parallel 32 bit
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
@@ -182,7 +182,7 @@ void x13sm3_4way_hash( void *state, const void *input )
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
// Hamsi parallel 4x32x2
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
@@ -214,12 +214,11 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -227,11 +226,7 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
blake512_4way_init( &x13sm3_ctx_mid );
blake512_4way( &x13sm3_ctx_mid, vdata, 64 );
@@ -242,10 +237,8 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x13sm3_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -43,7 +43,7 @@ void polytimos_4way_hash( void *output, const void *input )
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash32, 64 );
shabal512_4way_close( &ctx.shabal, vhash32 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,

View File

@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
@@ -82,31 +82,24 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int i=0; i < 19; i++ )
{
be32enc( &endiandata[i], pdata[i] );
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
veltor_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -183,10 +183,9 @@ void x14_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, state );
}
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
@@ -194,12 +193,11 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -207,11 +205,7 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
@@ -219,10 +213,8 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x14_4way_hash( hash, vdata );
pdata[19] = n;
@@ -234,7 +226,7 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{

View File

@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -218,12 +218,11 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -231,11 +230,7 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
@@ -243,10 +238,8 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x15_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -248,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input )
sph_fugue512_close( &ctx.fugue, hash3 );
break;
case SHABAL:
mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );

View File

@@ -390,7 +390,7 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
@@ -438,7 +438,7 @@ void sonoa_4way_hash( void *state, const void *input )
shabal512_4way( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -522,13 +522,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -635,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -769,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -807,9 +807,9 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -837,7 +837,7 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -124,8 +124,8 @@ void x17_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128_512( hash0, hash1, vhashA );
mm256_dintrlv_2x128_512( hash2, hash3, vhashB );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
@@ -165,13 +165,13 @@ void x17_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 4 way 32 bit
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
@@ -206,9 +206,9 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -235,7 +235,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -161,13 +161,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// Parallel 4way 32 bit
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
// Serial
sph_whirlpool_init( &ctx.whirlpool );
@@ -295,13 +295,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
@@ -333,9 +333,9 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
@@ -357,7 +357,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -18,7 +18,7 @@ rm -f config.status
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.5.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.5.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.5'
PACKAGE_STRING='cpuminer-opt 3.9.5'
PACKAGE_VERSION='3.9.5.1'
PACKAGE_STRING='cpuminer-opt 3.9.5.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.9.5.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.9.5.1:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.9.5
cpuminer-opt configure 3.9.5.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.5, which was
It was created by cpuminer-opt $as_me 3.9.5.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.9.5'
VERSION='3.9.5.1'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.9.5, which was
This file was extended by cpuminer-opt $as_me 3.9.5.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.9.5
cpuminer-opt config.status 3.9.5.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.5])
AC_INIT([cpuminer-opt], [3.9.5.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -843,8 +843,6 @@ void scale_hash_for_display ( double* hashrate, char* units )
const uint64_t diff2hash = 0x40000000ULL;
static struct timeval submit_time, prev_submit_time;
static struct timeval submit_interval;
static struct timeval five_min_start;
static double shash_sum = 0.;
static double bhash_sum = 0.;
@@ -852,40 +850,92 @@ static double time_sum = 0.;
static double latency_sum = 0.;
static uint64_t submits_sum = 0;
static int share_result( int result, struct work *work, const char *reason )
struct share_stats_t
{
char hr[16];
const char *sres;
struct timeval submit_time;
double net_diff;
double share_diff;
char job_id[32];
};
// with more and more parallelism the chance of submitting multiple
// shares in a very short time grows.
#define s_stats_size 4
static struct share_stats_t share_stats[ s_stats_size ];
static int s_get_ptr = 0, s_put_ptr = 0;
static struct timeval last_submit_time = {0};
static int share_result( int result, struct work *null_work,
const char *reason )
{
double share_time, share_hash, block_hash, share_size;
double hashcount = 0.;
double hashrate = 0.;
uint64_t latency;
struct share_stats_t my_stats = {0};
struct timeval ack_time, latency_tv, et;
char hr[32];
char hr_units[4] = {0};
bool solved;
char shr[16];
char shr[32];
char shr_units[4] = {0};
char diffstr[32];
struct timeval ack_time, latency_tv;
uint64_t latency;
double share_time, share_hash, block_hash;
double sharediff = work ? work->sharediff : stratum.sharediff;
double share_size;
const char *sres;
bool solved;
// Mutex while accessing global counters.
pthread_mutex_lock( &stats_lock );
// There is a window where a second share could be submitted
// before receiving the response for this one. When this happens,
// the second share will be processed from [1] on the next pass.
memcpy( &my_stats, &share_stats[ s_get_ptr], sizeof my_stats );
memset( &share_stats[ s_get_ptr ], 0, sizeof my_stats );
s_get_ptr++;
if ( s_get_ptr >= s_stats_size )
s_get_ptr = 0;
/*
if ( share_stats[0].submit_time.tv_sec )
{
memcpy( &my_stats, &share_stats[0], sizeof my_stats );
memset( &share_stats[0], 0, sizeof my_stats );
}
else if ( share_stats[1].submit_time.tv_sec )
{
memcpy( &my_stats, &share_stats[1], sizeof my_stats );
memset( &share_stats[1], 0, sizeof my_stats );
}
else
{
memcpy( &my_stats, &share_stats[2], sizeof my_stats );
memset( &share_stats[2], 0, sizeof my_stats );
}
*/
for ( int i = 0; i < opt_n_threads; i++ )
{
hashcount += thr_hashcount[i];
hashrate += thr_hashrates[i];
}
global_hashcount = hashcount;
global_hashrate = hashrate;
// calculate latency
// calculate latency and share time.
gettimeofday( &ack_time, NULL );
timeval_subtract( &latency_tv, &ack_time, &submit_time );
timeval_subtract( &latency_tv, &ack_time, &my_stats.submit_time );
latency = ( latency_tv.tv_sec * 1000 + latency_tv.tv_usec / 1000 );
timeval_subtract( &et, &my_stats.submit_time, &last_submit_time );
share_time = (double)et.tv_sec + ( (double)et.tv_usec / 1000000. );
memcpy( &last_submit_time, &my_stats.submit_time, sizeof last_submit_time );
// calculate share hashrate and size
share_time = submit_interval.tv_sec + ( submit_interval.tv_usec / 1000000. );
share_hash = sharediff * diff2hash;
block_hash = net_diff * diff2hash;
share_size = block_hash == 0. ? 0. : share_hash / block_hash;
share_hash = my_stats.share_diff * diff2hash;
block_hash = my_stats.net_diff * diff2hash;
share_size = block_hash == 0. ? 0. : share_hash / block_hash * 100.;
// check result
result ? accepted_share_count++ : rejected_share_count++;
solved = result && (my_stats.net_diff > 0.0 )
&& ( my_stats.share_diff >= net_diff );
solved_block_count += solved ? 1 : 0 ;
// update counters for 5 minute summary report
shash_sum += share_hash;
@@ -897,38 +947,46 @@ static int share_result( int result, struct work *work, const char *reason )
pthread_mutex_unlock( &stats_lock );
double share_hash_rate = share_time == 0. ? 0. : share_hash / share_time;
double scaled_shr;
result ? accepted_share_count++ : rejected_share_count++;
global_hashcount = hashcount;
global_hashrate = hashrate;
// check for solved block
solved = result && (net_diff > 0.0 ) && ( sharediff >= net_diff );
solved_block_count += solved ? 1 : 0 ;
scaled_shr = share_hash_rate;
scale_hash_for_display ( &scaled_shr, shr_units );
if ( use_colors )
{
sres = ( solved ? ( CL_MAG "BLOCK SOLVED" CL_WHT )
: result ? ( CL_GRN "Accepted" CL_WHT )
: ( CL_RED "Rejected" CL_WHT ) );
: result ? ( CL_GRN "Accepted" CL_WHT )
: ( CL_RED "Rejected" CL_WHT ) );
// colour code the share diff to highlight high value.
if ( solved )
sprintf( diffstr, "%s%.3g%s", CL_MAG, sharediff, CL_WHT );
else if ( share_size > 0.01 )
sprintf( diffstr, "%s%.3g%s", CL_GRN, sharediff, CL_WHT );
else if ( share_size > 0.001 )
sprintf( diffstr, "%s%.3g%s", CL_CYN, sharediff, CL_WHT );
else if ( share_hash_rate > hashrate )
sprintf( diffstr, "%s%.3g%s", CL_YLW, sharediff, CL_WHT );
sprintf( diffstr, "%s%.3g%s", CL_MAG, my_stats.share_diff, CL_WHT );
else if ( my_stats.share_diff > (my_stats.net_diff*0.1) )
sprintf( diffstr, "%s%.3g%s", CL_GRN, my_stats.share_diff, CL_WHT );
else if ( my_stats.share_diff > (my_stats.net_diff*0.01) )
sprintf( diffstr, "%s%.3g%s", CL_CYN, my_stats.share_diff, CL_WHT );
else
sprintf( diffstr, "%.3g", sharediff );
sprintf( diffstr, "%.3g", my_stats.share_diff );
if ( hashrate && share_hash_rate > (768.*hashrate) )
sprintf( shr, "%s%.2f %sH/s%s", CL_MAG, scaled_shr, shr_units,
CL_WHT );
else if ( share_hash_rate > (32.*hashrate) )
sprintf( shr, "%s%.2f %sH/s%s", CL_GRN, scaled_shr, shr_units,
CL_WHT );
else if ( share_hash_rate > 2.0*hashrate )
sprintf( shr, "%s%.2f %sH/s%s", CL_CYN, scaled_shr, shr_units,
CL_WHT );
else if ( share_hash_rate > 0.5*hashrate )
sprintf( shr, "%.2f %sH/s", scaled_shr, shr_units );
else
sprintf( shr, "%s%.2f %sH/s%s", CL_YLW, scaled_shr, shr_units,
CL_WHT );
}
else
{
sres = ( solved ? "BLOCK SOLVED"
: result ? "Accepted" : "Rejected" );
sprintf( diffstr, "%3g", sharediff );
sres = ( solved ? "BLOCK SOLVED" : result ? "Accepted" : "Rejected" );
sprintf( diffstr, "%.3g", my_stats.share_diff );
}
scale_hash_for_display ( &hashrate, hr_units );
@@ -941,36 +999,20 @@ static int share_result( int result, struct work *work, const char *reason )
sres, diffstr, share_time, accepted_share_count,
rejected_share_count, solved_block_count );
if ( have_stratum && result && sharediff && net_diff && !opt_quiet )
if ( have_stratum && result && my_stats.share_diff && my_stats.net_diff
&& !opt_quiet )
{
// double share_hash_rate = share_time == 0. ? 0. : share_hash / share_time;
scale_hash_for_display ( &share_hash_rate, shr_units );
if ( share_hash_rate < 10 )
// very low hashrate, add digits
sprintf( shr, "%.4f", share_hash_rate );
else
sprintf( shr, "%.2f", share_hash_rate );
applog( LOG_NOTICE, "Miner %s %sH/s, Share %s %sH/s, Latency %d ms.",
hr, hr_units, shr, shr_units, latency );
applog( LOG_NOTICE, "Height %d, Block share %.5f%%.",
stratum.bloc_height, share_size*100. );
applog( LOG_NOTICE, "Miner %s %sH/s, Share %s, Latency %d ms.",
hr, hr_units, shr, latency );
applog( LOG_NOTICE, "Height %d, job %s, %.5f%% block share.",
stratum.bloc_height, my_stats.job_id, share_size );
applog(LOG_INFO,"- - - - - - - - - - - - - - - - - - - - - - - - - - -");
}
if ( reason )
{
applog( LOG_WARNING, "reject reason: %s.", reason );
}
return 1;
}
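// Illustrative sketch, not part of this commit: one conventional way to turn
// a submitted share's difficulty into an equivalent hash rate, assuming the
// common bitcoin-style scaling of roughly 2^32 hashes per difficulty-1 share.
// The factor the miner actually applies can differ per algorithm.
static inline double equiv_share_hashrate( double share_diff,
                                           double elapsed_secs )
{
   if ( elapsed_secs <= 0. ) return 0.;
   // expected hashes to find a share at this difficulty, over the time taken
   return share_diff * 4294967296.0 / elapsed_secs;
}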
void std_le_build_stratum_request( char *req, struct work *work )
@@ -1557,9 +1599,42 @@ bool submit_work(struct thr_info *thr, const struct work *work_in)
{
struct workio_cmd *wc;
// collect some share stats
pthread_mutex_lock( &stats_lock );
gettimeofday( &share_stats[ s_put_ptr ].submit_time, NULL );
share_stats[ s_put_ptr ].share_diff = work_in->sharediff;
share_stats[ s_put_ptr ].net_diff = net_diff;
strcpy( share_stats[ s_put_ptr ].job_id, work_in->job_id );
s_put_ptr++;
if ( s_put_ptr >= s_stats_size )
s_put_ptr = 0;
/*
if ( share_stats[0].submit_time.tv_sec == 0 )
{
gettimeofday( &share_stats[0].submit_time, NULL );
share_stats[0].share_diff = work_in->sharediff;
share_stats[0].net_diff = net_diff;
strcpy( share_stats[0].job_id, work_in->job_id );
}
else if ( share_stats[1].submit_time.tv_sec == 0 )
{ // previous share hasn't been confirmed yet.
gettimeofday( &share_stats[1].submit_time, NULL );
share_stats[1].share_diff = work_in->sharediff;
share_stats[1].net_diff = net_diff;
strcpy( share_stats[1].job_id, work_in->job_id );
}
else
{ // previous share hasn't been confirmed yet.
gettimeofday( &share_stats[2].submit_time, NULL );
share_stats[2].share_diff = work_in->sharediff;
share_stats[2].net_diff = net_diff;
strcpy( share_stats[2].job_id, work_in->job_id );
}
*/
pthread_mutex_unlock( &stats_lock );
/* fill out work request message */
wc = (struct workio_cmd *) calloc(1, sizeof(*wc));
@@ -1723,6 +1798,7 @@ uint32_t* jr2_get_nonceptr( uint32_t *work_data )
return (uint32_t*) ( ((uint8_t*) work_data) + algo_gate.nonce_index );
}
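// Illustrative sketch, not part of this commit: how a consumer of the
// circular share-stats buffer filled in submit_work() above could pop the
// oldest pending entry. The read index (s_get_ptr_example) and the
// zeroed-entry "slot free" convention are assumptions for illustration only.
static int s_get_ptr_example = 0;              // hypothetical read index
static bool pop_share_stats_example( struct share_stats_t *out )
{
   bool found = false;
   pthread_mutex_lock( &stats_lock );
   if ( share_stats[ s_get_ptr_example ].submit_time.tv_sec != 0 )
   {
      *out = share_stats[ s_get_ptr_example ];                   // oldest entry
      memset( &share_stats[ s_get_ptr_example ], 0, sizeof *out ); // free slot
      s_get_ptr_example++;
      if ( s_get_ptr_example >= s_stats_size ) s_get_ptr_example = 0;
      found = true;
   }
   pthread_mutex_unlock( &stats_lock );
   return found;
}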
void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
@@ -1912,7 +1988,7 @@ static void *miner_thread( void *userdata )
if ( have_stratum )
{
algo_gate.wait_for_diff( &stratum );
pthread_mutex_lock( &g_work_lock );
if ( *algo_gate.get_nonceptr( work.data ) >= end_nonce )
algo_gate.stratum_gen_work( &stratum, &g_work );
algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce,
@@ -1922,20 +1998,20 @@ static void *miner_thread( void *userdata )
else
{
int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
pthread_mutex_lock( &g_work_lock );
if ( time(NULL) - g_work_time >= min_scantime
|| *algo_gate.get_nonceptr( work.data ) >= end_nonce )
{
if ( unlikely( !get_work( mythr, &g_work ) ) )
{
applog( LOG_ERR, "work retrieval failed, exiting "
"mining thread %d", thr_id );
applog( LOG_ERR, "work retrieval failed, exiting "
"mining thread %d", thr_id );
pthread_mutex_unlock( &g_work_lock );
goto out;
}
g_work_time = time(NULL);
}
}
algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce, true );
pthread_mutex_unlock( &g_work_lock );
@@ -2023,7 +2099,12 @@ static void *miner_thread( void *userdata )
break;
}
if ( !opt_quiet )
applog( LOG_NOTICE, "Share submitted." );
// applog( LOG_BLUE, "Share %d submitted by thread %d.",
// accepted_share_count + rejected_share_count + 1,
// mythr->id );
applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
accepted_share_count + rejected_share_count + 1,
mythr->id, work.job_id );
// prevent stale work in solo
// we can't submit twice a block!
@@ -2035,6 +2116,7 @@ static void *miner_thread( void *userdata )
pthread_mutex_unlock( &g_work_lock );
}
}
// Check for 5 minute summary report, mutex until global counters
// are read and reset. It's bad form to unlock inside a conditional
// block but more efficient. The logic is reversed to make the mutex
@@ -2057,43 +2139,56 @@ static void *miner_thread( void *userdata )
pthread_mutex_unlock( &stats_lock );
char hr[16];
char hr_units[4] = {0};
double ghrate = global_hashrate;
double shrate = time == 0. ? 0. : hash / time;
double scaled_shrate = shrate;
double avg_share = bhash == 0. ? 0. : hash / bhash * 100.;
char shr[32];
char shr_units[4] = {0};
int temp = cpu_temp(0);
char timestr[32];
latency = submits ? latency / submits : 0;
scale_hash_for_display( &scaled_shrate, shr_units );
if ( use_colors )
{
if ( shrate > (32.*ghrate) )
sprintf( shr, "%s%.2f %sH/s%s", CL_MAG, scaled_shrate,
shr_units, CL_WHT );
else if ( shrate > (8.*ghrate) )
sprintf( shr, "%s%.2f %sH/s%s", CL_GRN, scaled_shrate,
shr_units, CL_WHT );
else if ( shrate > 2.0*ghrate )
sprintf( shr, "%s%.2f %sH/s%s", CL_CYN, scaled_shrate,
shr_units, CL_WHT );
else if ( shrate > 0.5*ghrate )
sprintf( shr, "%.2f %sH/s", scaled_shrate, shr_units );
else
sprintf( shr, "%s%.2f %sH/s%s", CL_YLW, scaled_shrate,
shr_units, CL_WHT );
applog(LOG_NOTICE,"Summary: %d submits in %dm%02ds, block share %s%%.",
(uint64_t)submits, et.tv_sec / 60,
et.tv_sec % 60, bshstr );
if ( temp >= 80 ) sprintf( timestr, "%s%d C%s",
CL_RED, temp, CL_WHT );
else if (temp >=70 ) sprintf( timestr, "%s%d C%s",
CL_YLW, temp, CL_WHT );
else sprintf( timestr, "%d C", temp );
}
else
sprintf( shr, "%.2f %sH/s", scaled_shrate, shr_units );
applog(LOG_NOTICE,"Submitted %d shares in %dm%02ds, %.5f%% block share.",
(uint64_t)submits, et.tv_sec / 60, et.tv_sec % 60, avg_share );
#if ((defined(_WIN64) || defined(__WINDOWS__)))
applog(LOG_NOTICE,"Share hashrate %s %sH/s, latency %d ms.",
hr, hr_units, latency );
applog(LOG_NOTICE,"Share hashrate %s, latency %d ms.",
shr, latency );
#else
applog(LOG_NOTICE,"Share hashrate %s %sH/s, latency %d ms, temp %dC.",
hr, hr_units, latency, (uint32_t)cpu_temp(0) );
applog(LOG_NOTICE,"Share hashrate %s, latency %d ms, temp %s.",
shr, latency, timestr );
#endif
// applog(LOG_NOTICE,"Performance index: %s.", hixstr );
applog(LOG_INFO,"- - - - - - - - - - - - - - - - - - - - - - - - - - -");
}
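// Illustrative sketch, not part of this commit: the kind of unit scaling
// scale_hash_for_display() performs before the summary applogs above. The
// real helper's prefixes and rounding may differ.
static void scale_hashrate_sketch( double *rate, char *units /* >= 2 chars */ )
{
   const char prefix[] = { 0, 'k', 'M', 'G', 'T', 'P' };
   int i = 0;
   while ( *rate >= 1000.0 && i < 5 ) { *rate /= 1000.0; i++; }
   units[0] = prefix[i];      // empty string means plain H/s
   units[1] = 0;
}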
@@ -2294,7 +2389,7 @@ start:
sprintf(netinfo, ", diff %.3f", net_diff);
}
if (opt_showdiff)
sprintf( &netinfo[strlen(netinfo)], ", target %.3f",
g_work.targetdiff );
applog(LOG_BLUE, "%s detected new block%s", short_url, netinfo);
}
@@ -2457,6 +2552,9 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
algo_gate.set_work_data_endian( g_work );
pthread_mutex_unlock( &sctx->work_lock );
// if ( !opt_quiet )
// applog( LOG_BLUE,"New job %s.", g_work->job_id );
if ( opt_debug )
{
unsigned char *xnonce2str = abin2hex( g_work->xnonce2,
@@ -2470,14 +2568,14 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
if ( stratum_diff != sctx->job.diff )
{
// char sdiff[32] = { 0 };
// store for api stats
stratum_diff = sctx->job.diff;
if ( !opt_quiet && opt_showdiff && g_work->targetdiff != stratum_diff )
{
// snprintf( sdiff, 32, " (%.5f)", g_work->targetdiff );
applog( LOG_BLUE, "Stratum difficulty set to %g", stratum_diff );
// sdiff );
}
}
}
@@ -2492,114 +2590,118 @@ void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
static void *stratum_thread(void *userdata )
{
struct thr_info *mythr = (struct thr_info *) userdata;
char *s;
stratum.url = (char*) tq_pop(mythr->q, NULL);
if (!stratum.url)
goto out;
applog(LOG_INFO, "Starting Stratum on %s", stratum.url);
while (1)
{
int failures = 0;
if ( stratum_need_reset )
{
stratum_need_reset = false;
stratum_disconnect( &stratum );
if ( strcmp( stratum.url, rpc_url ) )
{
free( stratum.url );
stratum.url = strdup( rpc_url );
applog(LOG_BLUE, "Connection changed to %s", short_url);
}
else if ( !opt_quiet )
applog(LOG_DEBUG, "Stratum connection reset");
}
while ( !stratum.curl )
{
pthread_mutex_lock( &g_work_lock );
g_work_time = 0;
pthread_mutex_unlock( &g_work_lock );
restart_threads();
if ( !stratum_connect( &stratum, stratum.url )
|| !stratum_subscribe( &stratum )
|| !stratum_authorize( &stratum, rpc_user, rpc_pass ) )
{
stratum_disconnect( &stratum );
if (opt_retries >= 0 && ++failures > opt_retries)
{
applog(LOG_ERR, "...terminating workio thread");
tq_push(thr_info[work_thr_id].q, NULL);
goto out;
}
if (!opt_benchmark)
applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
sleep(opt_fail_pause);
}
if (jsonrpc_2)
{
work_free(&g_work);
work_copy(&g_work, &stratum.work);
}
}
if ( stratum.job.job_id
&& ( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) )
{
pthread_mutex_lock(&g_work_lock);
algo_gate.stratum_gen_work( &stratum, &g_work );
time(&g_work_time);
pthread_mutex_unlock(&g_work_lock);
restart_threads();
if ( stratum.job.clean || jsonrpc_2 )
{
static uint32_t last_bloc_height;
if ( last_bloc_height != stratum.bloc_height )
{
last_bloc_height = stratum.bloc_height;
if ( !opt_quiet )
{
if ( net_diff > 0. )
applog( LOG_BLUE,
"%s block %d, job %s, network diff %.4f",
algo_names[opt_algo], stratum.bloc_height,
g_work.job_id, net_diff);
else
applog( LOG_BLUE, "%s %s block %d, job %s",
short_url, algo_names[opt_algo],
stratum.bloc_height, g_work.job_id );
}
}
else if ( !opt_quiet )
applog( LOG_BLUE,"New job %s.", g_work.job_id );
}
else if (opt_debug && !opt_quiet)
{
applog( LOG_BLUE, "%s asks job %d for block %d", short_url,
strtoul( stratum.job.job_id, NULL, 16 ), stratum.bloc_height );
}
} // stratum.job.job_id
if ( !stratum_socket_full( &stratum, opt_timeout ) )
{
applog(LOG_ERR, "Stratum connection timeout");
s = NULL;
}
else
s = stratum_recv_line(&stratum);
if ( !s )
{
stratum_disconnect(&stratum);
// applog(LOG_WARNING, "Stratum connection interrupted");
continue;
}
if (!stratum_handle_method(&stratum, s))
stratum_handle_response(s);
free(s);
} // loop
out:
return NULL;
}
void show_version_and_exit(void)
@@ -3402,23 +3504,23 @@ int main(int argc, char *argv[])
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
// Initialize stats times and counters
memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) );
gettimeofday( &last_submit_time, NULL );
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
if ( !check_cpu_capability() ) exit(1);
pthread_mutex_init( &stats_lock, NULL );
pthread_mutex_init( &g_work_lock, NULL );
pthread_mutex_init( &rpc2_job_lock, NULL );
pthread_mutex_init( &rpc2_login_lock, NULL );
pthread_mutex_init( &stratum.sock_lock, NULL );
pthread_mutex_init( &stratum.work_lock, NULL );
flags = !opt_benchmark && strncmp( rpc_url, "https:", 6 )
? ( CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL )
: CURL_GLOBAL_ALL;
if ( curl_global_init( flags ) )
{
applog(LOG_ERR, "CURL initialization failed");
return 1;

View File

@@ -183,9 +183,15 @@
#include "simd-utils/simd-sse2.h"
#include "simd-utils/intrlv-sse2.h"
#if defined(__AVX__)
// 256 bit vector basics
#include "simd-utils/simd-avx.h"
#include "simd-utils/intrlv-avx.h"
#if defined(__AVX2__)
// 256 bit everything else
#include "simd-utils/simd-avx2.h"
#include "simd-utils/intrlv-avx2.h"
@@ -198,6 +204,11 @@
#endif // MMX
#endif // SSE2
#endif // AVX
#endif // AVX2
#endif // AVX512
// Picks implementation based on available CPU features.
#include "simd-utils/intrlv-selector.h"
#endif // SIMD_UTILS_H__

867
simd-utils/intrlv-avx.h Normal file
View File

@@ -0,0 +1,867 @@
#if !defined(INTRLV_AVX_H__)
#define INTRLV_AVX_H__ 1
#if defined(__AVX__)
// Convenient short cuts for local use only
// Extract 64 bits from the low 128 bits of 256 bit vector.
#define extr64_cast128_256( a, n ) \
_mm_extract_epi64( _mm256_castsi256_si128( a ), n )
// Extract 32 bits from the low 128 bits of 256 bit vector.
#define extr32_cast128_256( a, n ) \
_mm_extract_epi32( _mm256_castsi256_si128( a ), n )
///////////////////////////////////////////////////////////
//
// AVX 256 Bit Vectors
//
// 256 bit interleaving can be done with AVX.
#define mm256_put_64( s0, s1, s2, s3) \
_mm256_set_epi64x( *((const uint64_t*)(s3)), *((const uint64_t*)(s2)), \
*((const uint64_t*)(s1)), *((const uint64_t*)(s0)) )
#define mm256_put_32( s00, s01, s02, s03, s04, s05, s06, s07 ) \
_mm256_set_epi32( *((const uint32_t*)(s07)), *((const uint32_t*)(s06)), \
*((const uint32_t*)(s05)), *((const uint32_t*)(s04)), \
*((const uint32_t*)(s03)), *((const uint32_t*)(s02)), \
*((const uint32_t*)(s01)), *((const uint32_t*)(s00)) )
#define mm256_get_64( s, i0, i1, i2, i3 ) \
_mm256_set_epi64x( ((const uint64_t*)(s))[i3], ((const uint64_t*)(s))[i2], \
((const uint64_t*)(s))[i1], ((const uint64_t*)(s))[i0] )
#define mm256_get_32( s, i0, i1, i2, i3, i4, i5, i6, i7 ) \
_mm256_set_epi32( ((const uint32_t*)(s))[i7], ((const uint32_t*)(s))[i6], \
((const uint32_t*)(s))[i5], ((const uint32_t*)(s))[i4], \
((const uint32_t*)(s))[i3], ((const uint32_t*)(s))[i2], \
((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
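// Illustrative scalar reference, not used by this code: 64 bit interleaving
// for 4 lanes places word j of lane i at d[ 4*j + i ], so one 256 bit vector
// holds the same word of all 4 lanes and the lanes are hashed in lock step.
static inline void intrlv_4x64_ref( uint64_t *d, const uint64_t *s0,
       const uint64_t *s1, const uint64_t *s2, const uint64_t *s3, int words )
{
   for ( int j = 0; j < words; j++ )
   {
      d[ 4*j     ] = s0[ j ];
      d[ 4*j + 1 ] = s1[ j ];
      d[ 4*j + 2 ] = s2[ j ];
      d[ 4*j + 3 ] = s3[ j ];
   }
}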
/*
// Blend 2 vectors alternating hi & lo: { hi[n], lo[n-1], ... hi[1], lo[0] }
#define mm256_intrlv_blend_128( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x0f )
#define mm256_intrlv_blend_64( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x33 )
#define mm256_intrlv_blend_32( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x55 )
*/
// Interleave 8x32_256
#define mm256_intrlv_8x32_256( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
do { \
__m128i s0hi = mm128_extr_hi128_256( s0 ); \
__m128i s1hi = mm128_extr_hi128_256( s1 ); \
__m128i s2hi = mm128_extr_hi128_256( s2 ); \
__m128i s3hi = mm128_extr_hi128_256( s3 ); \
__m128i s4hi = mm128_extr_hi128_256( s4 ); \
__m128i s5hi = mm128_extr_hi128_256( s5 ); \
__m128i s6hi = mm128_extr_hi128_256( s6 ); \
__m128i s7hi = mm128_extr_hi128_256( s7 ); \
casti_m256i( d,0 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,0), extr32_cast128_256(s6,0), \
extr32_cast128_256(s5,0), extr32_cast128_256(s4,0), \
extr32_cast128_256(s3,0), extr32_cast128_256(s2,0), \
extr32_cast128_256(s1,0), extr32_cast128_256(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,1), extr32_cast128_256(s6,1), \
extr32_cast128_256(s5,1), extr32_cast128_256(s4,1), \
extr32_cast128_256(s3,1), extr32_cast128_256(s2,1), \
extr32_cast128_256(s1,1), extr32_cast128_256(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,2), extr32_cast128_256(s6,2), \
extr32_cast128_256(s5,2), extr32_cast128_256(s4,2), \
extr32_cast128_256(s3,2), extr32_cast128_256(s2,2), \
extr32_cast128_256(s1,2), extr32_cast128_256(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,3), extr32_cast128_256(s6,3), \
extr32_cast128_256(s5,3), extr32_cast128_256(s4,3), \
extr32_cast128_256(s3,3), extr32_cast128_256(s2,3), \
extr32_cast128_256(s1,3), extr32_cast128_256(s0,3) ); \
casti_m256i( d,4 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,0), mm128_extr_32(s6hi,0), \
mm128_extr_32(s5hi,0), mm128_extr_32(s4hi,0), \
mm128_extr_32(s3hi,0), mm128_extr_32(s2hi,0), \
mm128_extr_32(s1hi,0), mm128_extr_32(s0hi,0) ); \
casti_m256i( d,5 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,1), mm128_extr_32(s6hi,1), \
mm128_extr_32(s5hi,1), mm128_extr_32(s4hi,1), \
mm128_extr_32(s3hi,1), mm128_extr_32(s2hi,1), \
mm128_extr_32(s1hi,1), mm128_extr_32(s0hi,1) ); \
casti_m256i( d,6 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,2), mm128_extr_32(s6hi,2), \
mm128_extr_32(s5hi,2), mm128_extr_32(s4hi,2), \
mm128_extr_32(s3hi,2), mm128_extr_32(s2hi,2), \
mm128_extr_32(s1hi,2), mm128_extr_32(s0hi,2) ); \
casti_m256i( d,7 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,3), mm128_extr_32(s6hi,3), \
mm128_extr_32(s5hi,3), mm128_extr_32(s4hi,3), \
mm128_extr_32(s3hi,3), mm128_extr_32(s2hi,3), \
mm128_extr_32(s1hi,3), mm128_extr_32(s0hi,3) ); \
} while(0)
#define mm256_intrlv_8x32_128( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi32( \
mm128_extr_32(s7,0), mm128_extr_32(s6,0), \
mm128_extr_32(s5,0), mm128_extr_32(s4,0), \
mm128_extr_32(s3,0), mm128_extr_32(s2,0), \
mm128_extr_32(s1,0), mm128_extr_32(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
mm128_extr_32(s7,1), mm128_extr_32(s6,1), \
mm128_extr_32(s5,1), mm128_extr_32(s4,1), \
mm128_extr_32(s3,1), mm128_extr_32(s2,1), \
mm128_extr_32(s1,1), mm128_extr_32(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
mm128_extr_32(s7,2), mm128_extr_32(s6,2), \
mm128_extr_32(s5,2), mm128_extr_32(s4,2), \
mm128_extr_32(s3,2), mm128_extr_32(s2,2), \
mm128_extr_32(s1,2), mm128_extr_32(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
mm128_extr_32(s7,3), mm128_extr_32(s6,3), \
mm128_extr_32(s5,3), mm128_extr_32(s4,3), \
mm128_extr_32(s3,3), mm128_extr_32(s2,3), \
mm128_extr_32(s1,3), mm128_extr_32(s0,3) ); \
} while(0)
/*
#define mm256_bswap_intrlv_8x32_256( d, src ) \
do { \
__m256i s0 = mm256_bswap_32( src ); \
__m128i s1 = _mm256_extracti128_si256( s0, 1 ); \
casti_m256i( d, 0 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 0 ) ); \
casti_m256i( d, 1 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 1 ) ); \
casti_m256i( d, 2 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 2 ) ); \
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 3 ) ); \
casti_m256i( d, 4 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 0 ) ); \
casti_m256i( d, 5 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 1 ) ); \
casti_m256i( d, 6 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 2 ) ); \
casti_m256i( d, 7 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 3 ) ); \
} while(0)
#define mm256_bswap_intrlv_8x32_128( d, src ) \
do { \
__m128i ss = mm128_bswap_32( src ); \
casti_m256i( d, 0 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 0 ) ); \
casti_m256i( d, 1 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 1 ) ); \
casti_m256i( d, 2 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 2 ) ); \
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 3 ) ); \
} while(0)
*/
#define mm256_dintrlv_8x32_256( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
__m256i s0 = casti_m256i(s,0); \
__m256i s1 = casti_m256i(s,1); \
__m256i s2 = casti_m256i(s,2); \
__m256i s3 = casti_m256i(s,3); \
__m256i s4 = casti_m256i(s,4); \
__m256i s5 = casti_m256i(s,5); \
__m256i s6 = casti_m256i(s,6); \
__m256i s7 = casti_m256i(s,7); \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
__m128i s4hi = _mm256_extracti128_si256( s4, 1 ); \
__m128i s5hi = _mm256_extracti128_si256( s5, 1 ); \
__m128i s6hi = _mm256_extracti128_si256( s6, 1 ); \
__m128i s7hi = _mm256_extracti128_si256( s7, 1 ); \
d0 = _mm256_set_epi32( \
extr32_cast128_256( s7, 0 ), extr32_cast128_256( s6, 0 ), \
extr32_cast128_256( s5, 0 ), extr32_cast128_256( s4, 0 ), \
extr32_cast128_256( s3, 0 ), extr32_cast128_256( s2, 0 ), \
extr32_cast128_256( s1, 0 ), extr32_cast128_256( s0, 0 ) );\
d1 = _mm256_set_epi32( \
extr32_cast128_256( s7, 1 ), extr32_cast128_256( s6, 1 ), \
extr32_cast128_256( s5, 1 ), extr32_cast128_256( s4, 1 ), \
extr32_cast128_256( s3, 1 ), extr32_cast128_256( s2, 1 ), \
extr32_cast128_256( s1, 1 ), extr32_cast128_256( s0, 1 ) );\
d2 = _mm256_set_epi32( \
extr32_cast128_256( s7, 2 ), extr32_cast128_256( s6, 2 ), \
extr32_cast128_256( s5, 2 ), extr32_cast128_256( s4, 2 ), \
extr32_cast128_256( s3, 2 ), extr32_cast128_256( s2, 2 ), \
extr32_cast128_256( s1, 2 ), extr32_cast128_256( s0, 2 ) );\
d3 = _mm256_set_epi32( \
extr32_cast128_256( s7, 3 ), extr32_cast128_256( s6, 3 ), \
extr32_cast128_256( s5, 3 ), extr32_cast128_256( s4, 3 ), \
extr32_cast128_256( s3, 3 ), extr32_cast128_256( s2, 3 ), \
extr32_cast128_256( s1, 3 ), extr32_cast128_256( s0, 3 ) );\
d4 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 0 ), _mm_extract_epi32( s6hi, 0 ), \
_mm_extract_epi32( s5hi, 0 ), _mm_extract_epi32( s4hi, 0 ), \
_mm_extract_epi32( s3hi, 0 ), _mm_extract_epi32( s2hi, 0 ), \
_mm_extract_epi32( s1hi, 0 ), _mm_extract_epi32( s0hi, 0 ) ); \
d5 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 1 ), _mm_extract_epi32( s6hi, 1 ), \
_mm_extract_epi32( s5hi, 1 ), _mm_extract_epi32( s4hi, 1 ), \
_mm_extract_epi32( s3hi, 1 ), _mm_extract_epi32( s2hi, 1 ), \
_mm_extract_epi32( s1hi, 1 ), _mm_extract_epi32( s0hi, 1 ) ); \
d6 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 2 ), _mm_extract_epi32( s6hi, 2 ), \
_mm_extract_epi32( s5hi, 2 ), _mm_extract_epi32( s4hi, 2 ), \
_mm_extract_epi32( s3hi, 2 ), _mm_extract_epi32( s2hi, 2 ), \
_mm_extract_epi32( s1hi, 2 ), _mm_extract_epi32( s0hi, 2 ) ); \
d7 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 3 ), _mm_extract_epi32( s6hi, 3 ), \
_mm_extract_epi32( s5hi, 3 ), _mm_extract_epi32( s4hi, 3 ), \
_mm_extract_epi32( s3hi, 3 ), _mm_extract_epi32( s2hi, 3 ), \
_mm_extract_epi32( s1hi, 3 ), _mm_extract_epi32( s0hi, 3 ) ); \
} while(0)
// 8 lanes of 128 bits using 32 bit interleaving: lane i's word j is at
// 32 bit index 8*j + i of the interleaved source.
#define mm128_dintrlv_8x32_128( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
__m128i s0 = casti_m128i(s,0); \
__m128i s1 = casti_m128i(s,1); \
__m128i s2 = casti_m128i(s,2); \
__m128i s3 = casti_m128i(s,3); \
__m128i s4 = casti_m128i(s,4); \
__m128i s5 = casti_m128i(s,5); \
__m128i s6 = casti_m128i(s,6); \
__m128i s7 = casti_m128i(s,7); \
d0 = _mm_set_epi32( \
_mm_extract_epi32( s6, 0 ), _mm_extract_epi32( s4, 0 ), \
_mm_extract_epi32( s2, 0 ), _mm_extract_epi32( s0, 0 ) ); \
d1 = _mm_set_epi32( \
_mm_extract_epi32( s6, 1 ), _mm_extract_epi32( s4, 1 ), \
_mm_extract_epi32( s2, 1 ), _mm_extract_epi32( s0, 1 ) ); \
d2 = _mm_set_epi32( \
_mm_extract_epi32( s6, 2 ), _mm_extract_epi32( s4, 2 ), \
_mm_extract_epi32( s2, 2 ), _mm_extract_epi32( s0, 2 ) ); \
d3 = _mm_set_epi32( \
_mm_extract_epi32( s6, 3 ), _mm_extract_epi32( s4, 3 ), \
_mm_extract_epi32( s2, 3 ), _mm_extract_epi32( s0, 3 ) ); \
d4 = _mm_set_epi32( \
_mm_extract_epi32( s7, 0 ), _mm_extract_epi32( s5, 0 ), \
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s1, 0 ) ); \
d5 = _mm_set_epi32( \
_mm_extract_epi32( s7, 1 ), _mm_extract_epi32( s5, 1 ), \
_mm_extract_epi32( s3, 1 ), _mm_extract_epi32( s1, 1 ) ); \
d6 = _mm_set_epi32( \
_mm_extract_epi32( s7, 2 ), _mm_extract_epi32( s5, 2 ), \
_mm_extract_epi32( s3, 2 ), _mm_extract_epi32( s1, 2 ) ); \
d7 = _mm_set_epi32( \
_mm_extract_epi32( s7, 3 ), _mm_extract_epi32( s5, 3 ), \
_mm_extract_epi32( s3, 3 ), _mm_extract_epi32( s1, 3 ) ); \
} while(0)
#define mm256_intrlv_4x64_256( d, s0, s1, s2, s3 ) \
do { \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ), \
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ), \
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) ); \
casti_m256i( d,2 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,0 ), _mm_extract_epi64( s2hi,0 ), \
_mm_extract_epi64( s1hi,0 ), _mm_extract_epi64( s0hi,0 ) ); \
casti_m256i( d,3 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,1 ), _mm_extract_epi64( s2hi,1 ), \
_mm_extract_epi64( s1hi,1 ), _mm_extract_epi64( s0hi,1 ) ); \
} while(0)
#define mm256_intrlv_4x64_128( d, s0, s1, s2, s3 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 0 ), _mm_extract_epi64( s2, 0 ), \
_mm_extract_epi64( s1, 0 ), _mm_extract_epi64( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 1 ), _mm_extract_epi64( s2, 1 ), \
_mm_extract_epi64( s1, 1 ), _mm_extract_epi64( s0, 1 ) ); \
} while(0)
/*
#define mm256_bswap_intrlv_4x64_256( d, src ) \
do { \
__m256i s0 = mm256_bswap_32( src ); \
__m128i s1 = _mm256_extracti128_si256( s0, 1 ); \
casti_m256i( d,0 ) = _mm256_set1_epi64x( _mm_extract_epi64( \
_mm256_castsi256_si128( s0 ), 0 ) ); \
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( \
_mm256_castsi256_si128( s0 ), 1 ) ); \
casti_m256i( d,2 ) = _mm256_set1_epi64x( _mm_extract_epi64( s1, 0 ) ); \
casti_m256i( d,3 ) = _mm256_set1_epi64x( _mm_extract_epi64( s1, 1 ) ); \
} while(0)
#define mm256_bswap_intrlv_4x64_128( d, src ) \
do { \
__m128i ss = mm128_bswap_32( src ); \
casti_m256i( d,0 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 1 ) ); \
} while(0)
*/
// 4 lanes of 256 bits using 64 bit interleaving (standard final hash size)
static inline void mm256_dintrlv_4x64_256( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src ); // s[0][1:0]
__m256i s1 = *( (__m256i*)(src+32) ); // s[1][1:0]
__m256i s2 = *( (__m256i*)(src+64) ); // s[2][1:0]
__m256i s3 = *( (__m256i*)(src+96) ); // s[3][2:0]
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); // s[0][3:2]
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); // s[1][3:2]
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); // s[2][3:2]
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); // s[3][3:2]
casti_m256i( d0,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ),
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) );
casti_m256i( d1,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ),
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) );
casti_m256i( d2,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 0 ), _mm_extract_epi64( s2hi, 0 ),
_mm_extract_epi64( s1hi, 0 ), _mm_extract_epi64( s0hi, 0 ) );
casti_m256i( d3,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 1 ), _mm_extract_epi64( s2hi, 1 ),
_mm_extract_epi64( s1hi, 1 ), _mm_extract_epi64( s0hi, 1 ) );
}
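// Illustrative scalar reference, not used by this code: the inverse of the
// 4x64 interleave, recovering lane i's 64 bit words from s[ 4*j + i ].
static inline void dintrlv_4x64_ref( uint64_t *d0, uint64_t *d1, uint64_t *d2,
                         uint64_t *d3, const uint64_t *s, int words )
{
   for ( int j = 0; j < words; j++ )
   {
      d0[ j ] = s[ 4*j     ];
      d1[ j ] = s[ 4*j + 1 ];
      d2[ j ] = s[ 4*j + 2 ];
      d3[ j ] = s[ 4*j + 3 ];
   }
}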
// quarter avx2 block, 16 bytes * 4 lanes
// 4 lanes of 128 bits using 64 bit interleaving
// Used for last 16 bytes of 80 byte input, only used for testing.
static inline void mm128_dintrlv_4x64_128( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src );
__m256i s1 = *( (__m256i*)(src+32) );
__m128i s0hi = _mm256_extracti128_si256( s0, 1 );
__m128i s1hi = _mm256_extracti128_si256( s1, 1 );
casti_m128i( d0,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 0 ),
extr64_cast128_256( s0 , 0 ) );
casti_m128i( d1,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 1 ),
extr64_cast128_256( s0 , 1 ) );
casti_m128i( d2,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 0 ),
_mm_extract_epi64( s0hi, 0 ) );
casti_m128i( d3,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 1 ),
_mm_extract_epi64( s0hi, 1 ) );
}
/*
static inline void mm256_dintrlv_2x128x256( void *d0, void *d1,
const int n, const void *s )
{
casti_m256i( d0,n ) = mm256_get_64( s, 0, 1, 4, 5 );
casti_m256i( d1,n ) = mm256_get_64( s, 2, 3, 6, 7 );
}
*/
//
#define mm256_intrlv_4x32_256( d, s0, s1, s2, s3 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi32( \
mm128_extr_32( s3, 1 ), mm128_extr_32( s2, 1 ), \
mm128_extr_32( s1, 1 ), mm128_extr_32( s0, 1 ), \
mm128_extr_32( s3, 0 ), mm128_extr_32( s2, 0 ), \
mm128_extr_32( s1, 0 ), mm128_extr_32( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
mm128_extr_32( s3, 3 ), mm128_extr_32( s2, 3 ), \
mm128_extr_32( s1, 3 ), mm128_extr_32( s0, 3 ), \
mm128_extr_32( s3, 2 ), mm128_extr_32( s2, 2 ), \
mm128_extr_32( s1, 2 ), mm128_extr_32( s0, 2 ) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
mm128_extr_32( s3, 5 ), mm128_extr_32( s2, 5 ), \
mm128_extr_32( s1, 5 ), mm128_extr_32( s0, 5 ), \
mm128_extr_32( s3, 4 ), mm128_extr_32( s2, 4 ), \
mm128_extr_32( s1, 4 ), mm128_extr_32( s0, 4 ) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
mm128_extr_32( s3, 7 ), mm128_extr_32( s2, 7 ), \
mm128_extr_32( s1, 7 ), mm128_extr_32( s0, 7 ), \
mm128_extr_32( s3, 6 ), mm128_extr_32( s2, 6 ), \
mm128_extr_32( s1, 6 ), mm128_extr_32( s0, 6 ) ); \
} while(0)
// 256 bit versions of common 128 bit functions.
static inline void mm256_intrlv_4x32( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, int bit_len )
{
mm256_intrlv_4x32_256( d ,casti_m256i(s0,0), casti_m256i(s1,0),
casti_m256i(s2,0), casti_m256i(s3,0) );
if ( bit_len <= 256 ) return;
mm256_intrlv_4x32_256( d+128 ,casti_m256i(s0,1), casti_m256i(s1,1),
casti_m256i(s2,1), casti_m256i(s3,1) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm128_intrlv_4x32_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
casti_m128i(s2,4), casti_m128i(s3,4) );
return;
}
mm256_intrlv_4x32_256( d+256 ,casti_m256i(s0,2), casti_m256i(s1,2),
casti_m256i(s2,2), casti_m256i(s3,2) );
mm256_intrlv_4x32_256( d+384 ,casti_m256i(s0,3), casti_m256i(s1,3),
casti_m256i(s2,3), casti_m256i(s3,3) );
}
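// Usage sketch (illustrative only, assumes this header is included and the
// buffers are large enough): interleave four 80 byte block headers,
// bit_len 640, for 4-way 32 bit hashing.
static inline void intrlv_4_headers_sketch( void *vdata, const void *h0,
                    const void *h1, const void *h2, const void *h3 )
{
   // 4 lanes * 80 bytes = 320 bytes of interleaved data written to vdata.
   mm256_intrlv_4x32( vdata, h0, h1, h2, h3, 640 );
}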
static inline void mm256_dintrlv_4x32_256( void *d0, void *d1, void *d2,
void *d3, const void *src )
{
__m256i s0 = *(__m256i*) src;
__m256i s1 = *(__m256i*)(src+32);
__m256i s2 = *(__m256i*)(src+64);
__m256i s3 = *(__m256i*)(src+96);
*(__m256i*)d0 = _mm256_set_epi32(
_mm256_extract_epi32( s3,4 ), _mm256_extract_epi32( s3,0 ),
_mm256_extract_epi32( s2,4 ), _mm256_extract_epi32( s2,0 ),
_mm256_extract_epi32( s1,4 ), _mm256_extract_epi32( s1,0 ),
_mm256_extract_epi32( s0,4 ), _mm256_extract_epi32( s0,0 ) );
*(__m256i*)d1 = _mm256_set_epi32(
_mm256_extract_epi32( s3,5 ), _mm256_extract_epi32( s3,1 ),
_mm256_extract_epi32( s2,5 ), _mm256_extract_epi32( s2,1 ),
_mm256_extract_epi32( s1,5 ), _mm256_extract_epi32( s1,1 ),
_mm256_extract_epi32( s0,5 ), _mm256_extract_epi32( s0,1 ) );
*(__m256i*)d2 = _mm256_set_epi32(
_mm256_extract_epi32( s3,6 ), _mm256_extract_epi32( s3,2 ),
_mm256_extract_epi32( s2,6 ), _mm256_extract_epi32( s2,2 ),
_mm256_extract_epi32( s1,6 ), _mm256_extract_epi32( s1,2 ),
_mm256_extract_epi32( s0,6 ), _mm256_extract_epi32( s0,2 ) );
*(__m256i*)d3 = _mm256_set_epi32(
_mm256_extract_epi32( s3,7 ), _mm256_extract_epi32( s3,3 ),
_mm256_extract_epi32( s2,7 ), _mm256_extract_epi32( s2,3 ),
_mm256_extract_epi32( s1,7 ), _mm256_extract_epi32( s1,3 ),
_mm256_extract_epi32( s0,7 ), _mm256_extract_epi32( s0,3 ) );
}
static inline void mm256_dintrlv_4x32( void *d0, void *d1, void *d2,
void *d3, const void *s, int bit_len )
{
mm256_dintrlv_4x32_256( d0 , d1 , d2 , d3 , s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_4x32_256( d0+ 32, d1+ 32, d2+ 32, d3+ 32, s+128 );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm128_dintrlv_4x32_128( d0+ 64, d1+ 64, d2+ 64, d3+ 64, s+256 );
return;
}
mm256_dintrlv_4x32_256( d0+ 64, d1+ 64, d2+ 64, d3+ 64, s+256 );
mm256_dintrlv_4x32_256( d0+ 96, d1+ 96, d2+ 96, d3+ 96, s+384 );
}
static inline void mm256_extr_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d, 0 ) = mm256_get_32( s, lane , lane+ 4, lane+ 8, lane+12,
lane+16, lane+20, lane+24, lane+28 );
if ( bit_len <= 256 ) return;
casti_m256i( d, 1 ) = mm256_get_32( s, lane+32, lane+36, lane+40, lane+44,
lane+48, lane+52, lane+56, lane+60 );
}
// Interleave 8 source buffers containing 32 bit data into the destination
// vector
static inline void mm256_intrlv_8x32( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, const void *s4,
const void *s5, const void *s6, const void *s7, int bit_len )
{
mm256_intrlv_8x32_256( d , casti_m256i( s0,0 ), casti_m256i( s1,0 ),
casti_m256i( s2,0 ), casti_m256i( s3,0 ), casti_m256i( s4,0 ),
casti_m256i( s5,0 ), casti_m256i( s6,0 ), casti_m256i( s7,0 ) );
if ( bit_len <= 256 ) return;
mm256_intrlv_8x32_256( d+256, casti_m256i( s0,1 ), casti_m256i( s1,1 ),
casti_m256i( s2,1 ), casti_m256i( s3,1 ), casti_m256i( s4,1 ),
casti_m256i( s5,1 ), casti_m256i( s6,1 ), casti_m256i( s7,1 ) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_8x32_128( d+512, casti_m128i( s0,4 ), casti_m128i( s1,4 ),
casti_m128i( s2,4 ), casti_m128i( s3,4 ), casti_m128i( s4,4 ),
casti_m128i( s5,4 ), casti_m128i( s6,4 ), casti_m128i( s7,4 ) );
return;
}
mm256_intrlv_8x32_256( d+512, casti_m256i( s0,2 ), casti_m256i( s1,2 ),
casti_m256i( s2,2 ), casti_m256i( s3,2 ), casti_m256i( s4,2 ),
casti_m256i( s5,2 ), casti_m256i( s6,2 ), casti_m256i( s7,2 ) );
mm256_intrlv_8x32_256( d+768, casti_m256i( s0,3 ), casti_m256i( s1,3 ),
casti_m256i( s2,3 ), casti_m256i( s3,3 ), casti_m256i( s4,3 ),
casti_m256i( s5,3 ), casti_m256i( s6,3 ), casti_m256i( s7,3 ) );
// bit_len == 1024
}
// A couple of mining specific functions.
/*
// Interleave 80 bytes of 32 bit data for 8 lanes.
static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s )
{
mm256_bswap_intrlv_8x32_256( d , casti_m256i( s, 0 ) );
mm256_bswap_intrlv_8x32_256( d+256, casti_m256i( s, 1 ) );
mm256_bswap_intrlv_8x32_128( d+512, casti_m128i( s, 4 ) );
}
*/
// Deinterleave 8 buffers of 32 bit data from the source buffer.
// Sub-function can be called directly for 32 byte final hash.
static inline void mm256_dintrlv_8x32( void *d0, void *d1, void *d2,
void *d3, void *d4, void *d5, void *d6, void *d7,
const void *s, int bit_len )
{
mm256_dintrlv_8x32_256( casti_m256i(d0,0), casti_m256i(d1,0),
casti_m256i(d2,0), casti_m256i(d3,0), casti_m256i(d4,0),
casti_m256i(d5,0), casti_m256i(d6,0), casti_m256i(d7,0), s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_8x32_256( casti_m256i(d0,1), casti_m256i(d1,1),
casti_m256i(d2,1), casti_m256i(d3,1), casti_m256i(d4,1),
casti_m256i(d5,1), casti_m256i(d6,1), casti_m256i(d7,1), s+256 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
mm128_dintrlv_8x32_128( casti_m128i(d0,2), casti_m128i(d1,2),
casti_m128i(d2,2), casti_m128i(d3,2), casti_m128i(d4,2),
casti_m128i(d5,2), casti_m128i(d6,2), casti_m128i(d7,2), s+512 );
return;
}
// bitlen == 1024
mm256_dintrlv_8x32_256( casti_m256i(d0,2), casti_m256i(d1,2),
casti_m256i(d2,2), casti_m256i(d3,2), casti_m256i(d4,2),
casti_m256i(d5,2), casti_m256i(d6,2), casti_m256i(d7,2), s+512 );
mm256_dintrlv_8x32_256( casti_m256i(d0,3), casti_m256i(d1,3),
casti_m256i(d2,3), casti_m256i(d3,3), casti_m256i(d4,3),
casti_m256i(d5,3), casti_m256i(d6,3), casti_m256i(d7,3), s+768 );
}
static inline void mm256_extr_lane_8x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d,0 ) = mm256_get_32(s, lane , lane+ 8, lane+ 16, lane+ 24,
lane+32, lane+ 40, lane+ 48, lane+ 56 );
if ( bit_len <= 256 ) return;
casti_m256i( d,1 ) = mm256_get_32(s, lane+64, lane+ 72, lane+ 80, lane+ 88,
lane+96, lane+104, lane+112, lane+120 );
// bit_len == 512
}
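// Usage sketch (illustrative only): after an 8-way 32 bit hash, pull one
// lane's 256 bit result out of the interleaved output so it can be checked
// against the target.
static inline void get_lane_hash_sketch( uint32_t *hash /* 8 words */,
                                 const void *vhash, int lane )
{
   mm256_extr_lane_8x32( hash, vhash, lane, 256 );
}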
// Interleave 4 source buffers containing 64 bit data into the destination
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
static inline void mm256_intrlv_4x64( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, int bit_len )
{
mm256_intrlv_4x64_256( d , casti_m256i(s0,0), casti_m256i(s1,0),
casti_m256i(s2,0), casti_m256i(s3,0) );
if ( bit_len <= 256 ) return;
mm256_intrlv_4x64_256( d+128, casti_m256i(s0,1), casti_m256i(s1,1),
casti_m256i(s2,1), casti_m256i(s3,1) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_4x64_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
casti_m128i(s2,4), casti_m128i(s3,4) );
return;
}
// bit_len == 1024
mm256_intrlv_4x64_256( d+256, casti_m256i(s0,2), casti_m256i(s1,2),
casti_m256i(s2,2), casti_m256i(s3,2) );
mm256_intrlv_4x64_256( d+384, casti_m256i(s0,3), casti_m256i(s1,3),
casti_m256i(s2,3), casti_m256i(s3,3) );
}
/*
// Interleave 80 bytes of 32 bit data for 8 lanes.
static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s )
{
mm256_bswap_intrlv_4x64_256( d , casti_m256i( s, 0 ) );
mm256_bswap_intrlv_4x64_256( d+128, casti_m256i( s, 1 ) );
mm256_bswap_intrlv_4x64_128( d+256, casti_m128i( s, 4 ) );
}
// Blend 32 byte lanes of hash from 2 sources according to control mask.
// macro due to 256 bit value arg.
#define mm256_blend_hash_4x64( dst, a, b, mask ) \
do { \
dst[0] = _mm256_blendv_epi8( a[0], b[0], mask ); \
dst[1] = _mm256_blendv_epi8( a[1], b[1], mask ); \
dst[2] = _mm256_blendv_epi8( a[2], b[2], mask ); \
dst[3] = _mm256_blendv_epi8( a[3], b[3], mask ); \
dst[4] = _mm256_blendv_epi8( a[4], b[4], mask ); \
dst[5] = _mm256_blendv_epi8( a[5], b[5], mask ); \
dst[6] = _mm256_blendv_epi8( a[6], b[6], mask ); \
dst[7] = _mm256_blendv_epi8( a[7], b[7], mask ); \
} while(0)
*/
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.
static inline void mm256_dintrlv_4x64( void *d0, void *d1, void *d2,
void *d3, const void *s, int bit_len )
{
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 0, s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 1, s+128 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
mm128_dintrlv_4x64_128( d0, d1, d2, d3, 4, s+256 );
return;
}
// bit_len == 1024
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 2, s+256 );
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 3, s+384 );
}
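// Usage sketch (illustrative only): split the interleaved output of a 4-way
// 64 bit hash into four separate 32 byte hashes.
static inline void split_4way_hash_sketch( void *h0, void *h1, void *h2,
                                   void *h3, const void *vhash )
{
   mm256_dintrlv_4x64( h0, h1, h2, h3, vhash, 256 );
}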
// extract and deinterleave specified lane.
#define mm256_extr_lane_4x64_256 \
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 )
static inline void mm256_extr_lane_4x64( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 );
if ( bit_len <= 256 ) return;
casti_m256i( d, 1 ) = mm256_get_64( s, lane+16, lane+20, lane+24, lane+28 );
return;
}
// Convert from 4x32 SSE2 interleaving to 4x64 AVX2.
// Can't do it in place
static inline void mm256_rintrlv_4x32_4x64( void *dst, void *src,
int bit_len )
{
__m256i* d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 3],s[ 6],s[ 2],s[ 5],s[ 1],s[ 4],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[ 9],s[12],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
// bit_len == 1024
}
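// Illustrative scalar reference, not used by this code: the 4x32 -> 4x64
// re-interleave above pairs each lane's adjacent 32 bit words into one
// 64 bit word while keeping the 4-lane order.
static inline void rintrlv_4x32_4x64_ref( uint64_t *d, const uint32_t *s,
                                          int bit_len )
{
   const int words64 = bit_len / 64;        // 64 bit words per lane
   for ( int j = 0; j < words64; j++ )
      for ( int i = 0; i < 4; i++ )
         d[ 4*j + i ] = ( (uint64_t)s[ 4*(2*j+1) + i ] << 32 )
                      |   (uint64_t)s[ 4*(2*j  ) + i ];
}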
// Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
// bit_len must be multiple of 64
static inline void mm256_rintrlv_4x64_4x32( void *dst, void *src,
int bit_len )
{
__m256i *d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
// bit_len == 1024
}
static inline void mm256_rintrlv_4x64_2x128( void *dst0, void *dst1,
const void *src, int bit_len )
{
__m256i* d0 = (__m256i*)dst0;
__m256i* d1 = (__m256i*)dst1;
uint64_t *s = (uint64_t*)src;
d0[0] = _mm256_set_epi64x( s[ 5], s[ 1], s[ 4], s[ 0] );
d1[0] = _mm256_set_epi64x( s[ 7], s[ 3], s[ 6], s[ 2] );
d0[1] = _mm256_set_epi64x( s[13], s[ 9], s[12], s[ 8] );
d1[1] = _mm256_set_epi64x( s[15], s[11], s[14], s[10] );
if ( bit_len <= 256 ) return;
d0[2] = _mm256_set_epi64x( s[21], s[17], s[20], s[16] );
d1[2] = _mm256_set_epi64x( s[23], s[19], s[22], s[18] );
d0[3] = _mm256_set_epi64x( s[29], s[25], s[28], s[24] );
d1[3] = _mm256_set_epi64x( s[31], s[27], s[30], s[26] );
if ( bit_len <= 512 ) return;
d0[4] = _mm256_set_epi64x( s[37], s[33], s[36], s[32] );
d1[4] = _mm256_set_epi64x( s[39], s[35], s[38], s[34] );
d0[5] = _mm256_set_epi64x( s[45], s[41], s[44], s[40] );
d1[5] = _mm256_set_epi64x( s[47], s[43], s[46], s[42] );
d0[6] = _mm256_set_epi64x( s[53], s[49], s[52], s[48] );
d1[6] = _mm256_set_epi64x( s[55], s[51], s[54], s[50] );
d0[7] = _mm256_set_epi64x( s[61], s[57], s[60], s[56] );
d1[7] = _mm256_set_epi64x( s[63], s[59], s[62], s[58] );
}
static inline void mm256_rintrlv_2x128_4x64( void *dst, const void *src0,
const void *src1, int bit_len )
{
__m256i* d = (__m256i*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
d[ 0] = _mm256_set_epi64x( s1[2], s1[0], s0[2], s0[0] );
d[ 1] = _mm256_set_epi64x( s1[3], s1[1], s0[3], s0[1] );
d[ 2] = _mm256_set_epi64x( s1[6], s1[4], s0[6], s0[4] );
d[ 3] = _mm256_set_epi64x( s1[7], s1[5], s0[7], s0[5] );
if ( bit_len <= 256 ) return;
d[ 4] = _mm256_set_epi64x( s1[10], s1[ 8], s0[10], s0[ 8] );
d[ 5] = _mm256_set_epi64x( s1[11], s1[ 9], s0[11], s0[ 9] );
d[ 6] = _mm256_set_epi64x( s1[14], s1[12], s0[14], s0[12] );
d[ 7] = _mm256_set_epi64x( s1[15], s1[13], s0[15], s0[13] );
if ( bit_len <= 512 ) return;
d[ 8] = _mm256_set_epi64x( s1[18], s1[16], s0[18], s0[16] );
d[ 9] = _mm256_set_epi64x( s1[19], s1[17], s0[19], s0[17] );
d[10] = _mm256_set_epi64x( s1[22], s1[20], s0[22], s0[20] );
d[11] = _mm256_set_epi64x( s1[23], s1[21], s0[23], s0[21] );
d[12] = _mm256_set_epi64x( s1[26], s1[24], s0[26], s0[24] );
d[13] = _mm256_set_epi64x( s1[27], s1[25], s0[27], s0[25] );
d[14] = _mm256_set_epi64x( s1[30], s1[28], s0[30], s0[28] );
d[15] = _mm256_set_epi64x( s1[31], s1[29], s0[31], s0[29] );
}
static inline void mm256_intrlv_2x128( const void *d, const void *s0,
void *s1, const int bit_len )
{
__m128i s1hi = _mm256_extracti128_si256( casti_m256i( s1,0 ), 1 );
__m128i s0hi = _mm256_extracti128_si256( casti_m256i( s0,0 ), 1 );
casti_m256i( d,0 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,0 ) ),
_mm256_castsi256_si128( casti_m256i( s0,0 ) ) );
casti_m256i( d,1 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 256 ) return;
s0hi = _mm256_extracti128_si256( casti_m256i( s0,1 ), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,1 ), 1 );
casti_m256i( d,2 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,1 ) ),
_mm256_castsi256_si128( casti_m256i( s0,1 ) ) );
casti_m256i( d,3 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
casti_m256i( d,4 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
return;
}
s0hi = _mm256_extracti128_si256( casti_m256i( s0,2 ), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,2 ), 1 );
casti_m256i( d,4 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
casti_m256i( d,5 ) = mm256_concat_128( s1hi, s0hi );
s0hi = _mm256_extracti128_si256( casti_m256i( s0,3 ), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,3 ), 1 );
casti_m256i( d,6 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,3 ) ),
_mm256_castsi256_si128( casti_m256i( s0,3 ) ) );
casti_m256i( d,7 ) = mm256_concat_128( s1hi, s0hi );
}
// 512 is the bit len used by most, eliminate the conditionals
static inline void mm256_dintrlv_2x128_512( void *dst0, void *dst1,
const void *s )
{
__m256i *d0 = (__m256i*)dst0;
__m256i *d1 = (__m256i*)dst1;
__m256i s0 = casti_m256i( s, 0 );
__m256i s1 = casti_m256i( s, 1 );
d0[0] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[0] = _mm256_permute2x128_si256( s0, s1, 0x31 );
s0 = casti_m256i( s, 2 );
s1 = casti_m256i( s, 3 );
d0[1] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[1] = _mm256_permute2x128_si256( s0, s1, 0x31 );
}
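// Illustrative scalar view, not used by this code: in 2x128 interleaving,
// 128 bit chunk j of lane i sits at chunk index 2*j + i of the interleaved
// buffer; the permutes above simply gather the even or odd chunks.
static inline void dintrlv_2x128_ref( uint64_t *d0, uint64_t *d1,
                                      const uint64_t *s, int bit_len )
{
   const int chunks = bit_len / 128;            // 128 bit chunks per lane
   for ( int j = 0; j < chunks; j++ )
   {
      d0[ 2*j ] = s[ 4*j     ];  d0[ 2*j + 1 ] = s[ 4*j + 1 ];   // lane 0
      d1[ 2*j ] = s[ 4*j + 2 ];  d1[ 2*j + 1 ] = s[ 4*j + 3 ];   // lane 1
   }
}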
// Phase out usage for all 512 bit data lengths
static inline void mm256_dintrlv_2x128( void *dst0, void *dst1, const void *s,
int bit_len )
{
__m256i *d0 = (__m256i*)dst0;
__m256i *d1 = (__m256i*)dst1;
__m256i s0 = casti_m256i( s, 0 );
__m256i s1 = casti_m256i( s, 1 );
d0[0] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[0] = _mm256_permute2x128_si256( s0, s1, 0x31 );
if ( bit_len <= 256 ) return;
s0 = casti_m256i( s, 2 );
s1 = casti_m256i( s, 3 );
d0[1] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[1] = _mm256_permute2x128_si256( s0, s1, 0x31 );
if ( bit_len <= 512 ) return;
s0 = casti_m256i( s, 4 );
s1 = casti_m256i( s, 5 );
d0[2] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[2] = _mm256_permute2x128_si256( s0, s1, 0x31 );
s0 = casti_m256i( s, 6 );
s1 = casti_m256i( s, 7 );
d0[3] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[3] = _mm256_permute2x128_si256( s0, s1, 0x31 );
}
#undef extr64_cast128_256
#undef extr32_cast128_256
#endif // AVX
#endif // INTRLV_AVX_H__

View File

@@ -1,42 +1,13 @@
#if !defined(INTRLV_AVX2_H__)
#define INTRLV_AVX2_H__ 1
#if defined(__AVX2__)
// Convenient short cuts for local use only
// Extract 64 bits from the low 128 bits of 256 bit vector.
#define extr64_cast128_256( a, n ) \
_mm_extract_epi64( _mm256_castsi256_si128( a ), n )
// Extract 32 bits from the low 128 bits of 256 bit vector.
#define extr32_cast128_256( a, n ) \
_mm_extract_epi32( _mm256_castsi256_si128( a ), n )
///////////////////////////////////////////////////////////
//
// AVX2 256 Bit Vectors
//
#define mm256_put_64( s0, s1, s2, s3) \
_mm256_set_epi64x( *((const uint64_t*)(s3)), *((const uint64_t*)(s2)), \
*((const uint64_t*)(s1)), *((const uint64_t*)(s0)) )
#define mm256_put_32( s00, s01, s02, s03, s04, s05, s06, s07 ) \
_mm256_set_epi32( *((const uint32_t*)(s07)), *((const uint32_t*)(s06)), \
*((const uint32_t*)(s05)), *((const uint32_t*)(s04)), \
*((const uint32_t*)(s03)), *((const uint32_t*)(s02)), \
*((const uint32_t*)(s01)), *((const uint32_t*)(s00)) )
#define mm256_get_64( s, i0, i1, i2, i3 ) \
_mm256_set_epi64x( ((const uint64_t*)(s))[i3], ((const uint64_t*)(s))[i2], \
((const uint64_t*)(s))[i1], ((const uint64_t*)(s))[i0] )
#define mm256_get_32( s, i0, i1, i2, i3, i4, i5, i6, i7 ) \
_mm256_set_epi32( ((const uint32_t*)(s))[i7], ((const uint32_t*)(s))[i6], \
((const uint32_t*)(s))[i5], ((const uint32_t*)(s))[i4], \
((const uint32_t*)(s))[i3], ((const uint32_t*)(s))[i2], \
((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
// A few functions that need AVX2 for 256 bit.
// Blend 2 vectors alternating hi & lo: { hi[n], lo[n-1], ... hi[1], lo[0] }
@@ -49,82 +20,6 @@
#define mm256_intrlv_blend_32( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x55 )
// Interleave 8x32_256
#define mm256_intrlv_8x32_256( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
{ \
__m128i s0hi = mm128_extr_hi128_256( s0 ); \
__m128i s1hi = mm128_extr_hi128_256( s1 ); \
__m128i s2hi = mm128_extr_hi128_256( s2 ); \
__m128i s3hi = mm128_extr_hi128_256( s3 ); \
__m128i s4hi = mm128_extr_hi128_256( s4 ); \
__m128i s5hi = mm128_extr_hi128_256( s5 ); \
__m128i s6hi = mm128_extr_hi128_256( s6 ); \
__m128i s7hi = mm128_extr_hi128_256( s7 ); \
casti_m256i( d,0 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,0), extr32_cast128_256(s6,0), \
extr32_cast128_256(s5,0), extr32_cast128_256(s4,0), \
extr32_cast128_256(s3,0), extr32_cast128_256(s2,0), \
extr32_cast128_256(s1,0), extr32_cast128_256(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,1), extr32_cast128_256(s6,1), \
extr32_cast128_256(s5,1), extr32_cast128_256(s4,1), \
extr32_cast128_256(s3,1), extr32_cast128_256(s2,1), \
extr32_cast128_256(s1,1), extr32_cast128_256(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,2), extr32_cast128_256(s6,2), \
extr32_cast128_256(s5,2), extr32_cast128_256(s4,2), \
extr32_cast128_256(s3,2), extr32_cast128_256(s2,2), \
extr32_cast128_256(s1,2), extr32_cast128_256(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,3), extr32_cast128_256(s6,3), \
extr32_cast128_256(s5,3), extr32_cast128_256(s4,3), \
extr32_cast128_256(s3,3), extr32_cast128_256(s2,3), \
extr32_cast128_256(s1,3), extr32_cast128_256(s0,3) ); \
casti_m256i( d,4 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,0), mm128_extr_32(s6hi,0), \
mm128_extr_32(s5hi,0), mm128_extr_32(s4hi,0), \
mm128_extr_32(s3hi,0), mm128_extr_32(s2hi,0), \
mm128_extr_32(s1hi,0), mm128_extr_32(s0hi,0) ); \
casti_m256i( d,5 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,1), mm128_extr_32(s6hi,1), \
mm128_extr_32(s5hi,1), mm128_extr_32(s4hi,1), \
mm128_extr_32(s3hi,1), mm128_extr_32(s2hi,1), \
mm128_extr_32(s1hi,1), mm128_extr_32(s0hi,1) ); \
casti_m256i( d,6 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,2), mm128_extr_32(s6hi,2), \
mm128_extr_32(s5hi,2), mm128_extr_32(s4hi,2), \
mm128_extr_32(s3hi,2), mm128_extr_32(s2hi,2), \
mm128_extr_32(s1hi,2), mm128_extr_32(s0hi,2) ); \
casti_m256i( d,7 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,3), mm128_extr_32(s6hi,3), \
mm128_extr_32(s5hi,3), mm128_extr_32(s4hi,3), \
mm128_extr_32(s3hi,3), mm128_extr_32(s2hi,3), \
mm128_extr_32(s1hi,3), mm128_extr_32(s0hi,3) ); \
} while(0)
#define mm256_intrlv_8x32_128( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
{ \
casti_m256i( d,0 ) = _mm256_set_epi32( \
mm128_extr_32(s7,0), mm128_extr_32(s6,0), \
mm128_extr_32(s5,0), mm128_extr_32(s4,0), \
mm128_extr_32(s3,0), mm128_extr_32(s2,0), \
mm128_extr_32(s1,0), mm128_extr_32(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
mm128_extr_32(s7,1), mm128_extr_32(s6,1), \
mm128_extr_32(s5,1), mm128_extr_32(s4,1), \
mm128_extr_32(s3,1), mm128_extr_32(s2,1), \
mm128_extr_32(s1,1), mm128_extr_32(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
mm128_extr_32(s7,2), mm128_extr_32(s6,2), \
mm128_extr_32(s5,2), mm128_extr_32(s4,2), \
mm128_extr_32(s3,2), mm128_extr_32(s2,2), \
mm128_extr_32(s1,2), mm128_extr_32(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
mm128_extr_32(s7,3), mm128_extr_32(s6,3), \
mm128_extr_32(s5,3), mm128_extr_32(s4,3), \
mm128_extr_32(s3,3), mm128_extr_32(s2,3), \
mm128_extr_32(s1,3), mm128_extr_32(s0,3) ); \
} while(0)
#define mm256_bswap_intrlv_8x32_256( d, src ) \
do { \
@@ -153,128 +48,6 @@ do { \
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 3 ) ); \
} while(0)
#define mm256_dintrlv_8x32_256( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
__m256i s0 = casti_m256i(s,0); \
__m256i s1 = casti_m256i(s,1); \
__m256i s2 = casti_m256i(s,2); \
__m256i s3 = casti_m256i(s,3); \
__m256i s4 = casti_m256i(s,4); \
__m256i s5 = casti_m256i(s,5); \
__m256i s6 = casti_m256i(s,6); \
__m256i s7 = casti_m256i(s,7); \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
__m128i s4hi = _mm256_extracti128_si256( s4, 1 ); \
__m128i s5hi = _mm256_extracti128_si256( s5, 1 ); \
__m128i s6hi = _mm256_extracti128_si256( s6, 1 ); \
__m128i s7hi = _mm256_extracti128_si256( s7, 1 ); \
d0 = _mm256_set_epi32( \
extr32_cast128_256( s7, 0 ), extr32_cast128_256( s6, 0 ), \
extr32_cast128_256( s5, 0 ), extr32_cast128_256( s4, 0 ), \
extr32_cast128_256( s3, 0 ), extr32_cast128_256( s2, 0 ), \
extr32_cast128_256( s1, 0 ), extr32_cast128_256( s0, 0 ) );\
d1 = _mm256_set_epi32( \
extr32_cast128_256( s7, 1 ), extr32_cast128_256( s6, 1 ), \
extr32_cast128_256( s5, 1 ), extr32_cast128_256( s4, 1 ), \
extr32_cast128_256( s3, 1 ), extr32_cast128_256( s2, 1 ), \
extr32_cast128_256( s1, 1 ), extr32_cast128_256( s0, 1 ) );\
d2 = _mm256_set_epi32( \
extr32_cast128_256( s7, 2 ), extr32_cast128_256( s6, 2 ), \
extr32_cast128_256( s5, 2 ), extr32_cast128_256( s4, 2 ), \
extr32_cast128_256( s3, 2 ), extr32_cast128_256( s2, 2 ), \
extr32_cast128_256( s1, 2 ), extr32_cast128_256( s0, 2 ) );\
d3 = _mm256_set_epi32( \
extr32_cast128_256( s7, 3 ), extr32_cast128_256( s6, 3 ), \
extr32_cast128_256( s5, 3 ), extr32_cast128_256( s4, 3 ), \
extr32_cast128_256( s3, 3 ), extr32_cast128_256( s2, 3 ), \
extr32_cast128_256( s1, 3 ), extr32_cast128_256( s0, 3 ) );\
d4 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 0 ), _mm_extract_epi32( s6hi, 0 ), \
_mm_extract_epi32( s5hi, 0 ), _mm_extract_epi32( s4hi, 0 ), \
_mm_extract_epi32( s3hi, 0 ), _mm_extract_epi32( s2hi, 0 ), \
_mm_extract_epi32( s1hi, 0 ), _mm_extract_epi32( s0hi, 0 ) ); \
d5 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 1 ), _mm_extract_epi32( s6hi, 1 ), \
_mm_extract_epi32( s5hi, 1 ), _mm_extract_epi32( s4hi, 1 ), \
_mm_extract_epi32( s3hi, 1 ), _mm_extract_epi32( s2hi, 1 ), \
_mm_extract_epi32( s1hi, 1 ), _mm_extract_epi32( s0hi, 1 ) ); \
d6 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 2 ), _mm_extract_epi32( s6hi, 2 ), \
_mm_extract_epi32( s5hi, 2 ), _mm_extract_epi32( s4hi, 2 ), \
_mm_extract_epi32( s3hi, 2 ), _mm_extract_epi32( s2hi, 2 ), \
_mm_extract_epi32( s1hi, 2 ), _mm_extract_epi32( s0hi, 2 ) ); \
d7 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 3 ), _mm_extract_epi32( s6hi, 3 ), \
_mm_extract_epi32( s5hi, 3 ), _mm_extract_epi32( s4hi, 3 ), \
_mm_extract_epi32( s3hi, 3 ), _mm_extract_epi32( s2hi, 3 ), \
_mm_extract_epi32( s1hi, 3 ), _mm_extract_epi32( s0hi, 3 ) ); \
} while(0)
// Deinterleave the final 128 bits per lane of 8x32 interleaved data
// (128 bytes of source).
#define mm128_dintrlv_8x32_128( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
   __m128i s0 = casti_m128i(s,0); \
   __m128i s1 = casti_m128i(s,1); \
   __m128i s2 = casti_m128i(s,2); \
   __m128i s3 = casti_m128i(s,3); \
   __m128i s4 = casti_m128i(s,4); \
   __m128i s5 = casti_m128i(s,5); \
   __m128i s6 = casti_m128i(s,6); \
   __m128i s7 = casti_m128i(s,7); \
   d0 = _mm_set_epi32( \
         _mm_extract_epi32( s6, 0 ), _mm_extract_epi32( s4, 0 ), \
         _mm_extract_epi32( s2, 0 ), _mm_extract_epi32( s0, 0 ) ); \
   d1 = _mm_set_epi32( \
         _mm_extract_epi32( s6, 1 ), _mm_extract_epi32( s4, 1 ), \
         _mm_extract_epi32( s2, 1 ), _mm_extract_epi32( s0, 1 ) ); \
   d2 = _mm_set_epi32( \
         _mm_extract_epi32( s6, 2 ), _mm_extract_epi32( s4, 2 ), \
         _mm_extract_epi32( s2, 2 ), _mm_extract_epi32( s0, 2 ) ); \
   d3 = _mm_set_epi32( \
         _mm_extract_epi32( s6, 3 ), _mm_extract_epi32( s4, 3 ), \
         _mm_extract_epi32( s2, 3 ), _mm_extract_epi32( s0, 3 ) ); \
   d4 = _mm_set_epi32( \
         _mm_extract_epi32( s7, 0 ), _mm_extract_epi32( s5, 0 ), \
         _mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s1, 0 ) ); \
   d5 = _mm_set_epi32( \
         _mm_extract_epi32( s7, 1 ), _mm_extract_epi32( s5, 1 ), \
         _mm_extract_epi32( s3, 1 ), _mm_extract_epi32( s1, 1 ) ); \
   d6 = _mm_set_epi32( \
         _mm_extract_epi32( s7, 2 ), _mm_extract_epi32( s5, 2 ), \
         _mm_extract_epi32( s3, 2 ), _mm_extract_epi32( s1, 2 ) ); \
   d7 = _mm_set_epi32( \
         _mm_extract_epi32( s7, 3 ), _mm_extract_epi32( s5, 3 ), \
         _mm_extract_epi32( s3, 3 ), _mm_extract_epi32( s1, 3 ) ); \
} while(0)
#define mm256_intrlv_4x64_256( d, s0, s1, s2, s3 ) \
do { \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ), \
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ), \
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) ); \
casti_m256i( d,2 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,0 ), _mm_extract_epi64( s2hi,0 ), \
_mm_extract_epi64( s1hi,0 ), _mm_extract_epi64( s0hi,0 ) ); \
casti_m256i( d,3 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,1 ), _mm_extract_epi64( s2hi,1 ), \
_mm_extract_epi64( s1hi,1 ), _mm_extract_epi64( s0hi,1 ) ); \
} while(0)
#define mm256_intrlv_4x64_128( d, s0, s1, s2, s3 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 0 ), _mm_extract_epi64( s2, 0 ), \
_mm_extract_epi64( s1, 0 ), _mm_extract_epi64( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 1 ), _mm_extract_epi64( s2, 1 ), \
_mm_extract_epi64( s1, 1 ), _mm_extract_epi64( s0, 1 ) ); \
} while(0)
#define mm256_bswap_intrlv_4x64_256( d, src ) \
do { \
__m256i s0 = mm256_bswap_32( src ); \
@@ -294,94 +67,6 @@ do { \
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 1 ) ); \
} while(0)
// 4 lanes of 256 bits using 64 bit interleaving (standard final hash size)
static inline void mm256_dintrlv_4x64_256( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src ); // s[0][1:0]
__m256i s1 = *( (__m256i*)(src+32) ); // s[1][1:0]
__m256i s2 = *( (__m256i*)(src+64) ); // s[2][1:0]
__m256i s3 = *( (__m256i*)(src+96) ); // s[3][1:0]
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); // s[0][3:2]
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); // s[1][3:2]
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); // s[2][3:2]
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); // s[3][3:2]
casti_m256i( d0,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ),
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) );
casti_m256i( d1,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ),
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) );
casti_m256i( d2,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 0 ), _mm_extract_epi64( s2hi, 0 ),
_mm_extract_epi64( s1hi, 0 ), _mm_extract_epi64( s0hi, 0 ) );
casti_m256i( d3,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 1 ), _mm_extract_epi64( s2hi, 1 ),
_mm_extract_epi64( s1hi, 1 ), _mm_extract_epi64( s0hi, 1 ) );
}
// quarter avx2 block, 16 bytes * 4 lanes
// 4 lanes of 128 bits using 64 bit interleaving
// Used for last 16 bytes of 80 byte input, only used for testing.
static inline void mm128_dintrlv_4x64_128( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src );
__m256i s1 = *( (__m256i*)(src+32) );
__m128i s0hi = _mm256_extracti128_si256( s0, 1 );
__m128i s1hi = _mm256_extracti128_si256( s1, 1 );
casti_m128i( d0,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 0 ),
extr64_cast128_256( s0 , 0 ) );
casti_m128i( d1,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 1 ),
extr64_cast128_256( s0 , 1 ) );
casti_m128i( d2,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 0 ),
_mm_extract_epi64( s0hi, 0 ) );
casti_m128i( d3,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 1 ),
_mm_extract_epi64( s0hi, 1 ) );
}
/*
static inline void mm256_dintrlv_2x128x256( void *d0, void *d1,
const int n, const void *s )
{
casti_m256i( d0,n ) = mm256_get_64( s, 0, 1, 4, 5 );
casti_m256i( d1,n ) = mm256_get_64( s, 2, 3, 6, 7 );
}
*/
//
// Interleave 8 source buffers containing 32 bit data into the destination
// vector
static inline void mm256_intrlv_8x32( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, const void *s4,
const void *s5, const void *s6, const void *s7, int bit_len )
{
mm256_intrlv_8x32_256( d , casti_m256i( s0,0 ), casti_m256i( s1,0 ),
casti_m256i( s2,0 ), casti_m256i( s3,0 ), casti_m256i( s4,0 ),
casti_m256i( s5,0 ), casti_m256i( s6,0 ), casti_m256i( s7,0 ) );
if ( bit_len <= 256 ) return;
mm256_intrlv_8x32_256( d+256, casti_m256i( s0,1 ), casti_m256i( s1,1 ),
casti_m256i( s2,1 ), casti_m256i( s3,1 ), casti_m256i( s4,1 ),
casti_m256i( s5,1 ), casti_m256i( s6,1 ), casti_m256i( s7,1 ) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_8x32_128( d+512, casti_m128i( s0,4 ), casti_m128i( s1,4 ),
casti_m128i( s2,4 ), casti_m128i( s3,4 ), casti_m128i( s4,4 ),
casti_m128i( s5,4 ), casti_m128i( s6,4 ), casti_m128i( s7,4 ) );
return;
}
mm256_intrlv_8x32_256( d+512, casti_m256i( s0,2 ), casti_m256i( s1,2 ),
casti_m256i( s2,2 ), casti_m256i( s3,2 ), casti_m256i( s4,2 ),
casti_m256i( s5,2 ), casti_m256i( s6,2 ), casti_m256i( s7,2 ) );
mm256_intrlv_8x32_256( d+768, casti_m256i( s0,3 ), casti_m256i( s1,3 ),
casti_m256i( s2,3 ), casti_m256i( s3,3 ), casti_m256i( s4,3 ),
casti_m256i( s5,3 ), casti_m256i( s6,3 ), casti_m256i( s7,3 ) );
// bit_len == 1024
}
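// Editorial usage sketch, not part of the original header; function and
// buffer names are hypothetical. Interleave eight 64 byte lane buffers
// (512 bits per lane) for an 8 way 32 bit algorithm.
static inline void example_intrlv_8x32_512( void *vdata, const void *lane0,
     const void *lane1, const void *lane2, const void *lane3,
     const void *lane4, const void *lane5, const void *lane6,
     const void *lane7 )
{
   mm256_intrlv_8x32( vdata, lane0, lane1, lane2, lane3, lane4, lane5,
                      lane6, lane7, 512 );
}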
// A couple of mining specific functions.
@@ -393,72 +78,6 @@ static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s )
mm256_bswap_intrlv_8x32_128( d+512, casti_m128i( s, 4 ) );
}
// Deinterleave 8 buffers of 32 bit data from the source buffer.
// Sub-function can be called directly for 32 byte final hash.
static inline void mm256_dintrlv_8x32( void *d0, void *d1, void *d2,
void *d3, void *d4, void *d5, void *d6, void *d7,
const void *s, int bit_len )
{
mm256_dintrlv_8x32_256( casti_m256i(d0,0), casti_m256i(d1,0),
casti_m256i(d2,0), casti_m256i(d3,0), casti_m256i(d4,0),
casti_m256i(d5,0), casti_m256i(d6,0), casti_m256i(d7,0), s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_8x32_256( casti_m256i(d0,1), casti_m256i(d1,1),
casti_m256i(d2,1), casti_m256i(d3,1), casti_m256i(d4,1),
casti_m256i(d5,1), casti_m256i(d6,1), casti_m256i(d7,1), s+256 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
mm128_dintrlv_8x32_128( casti_m128i(d0,4), casti_m128i(d1,4),
casti_m128i(d2,4), casti_m128i(d3,4), casti_m128i(d4,4),
casti_m128i(d5,4), casti_m128i(d6,4), casti_m128i(d7,4), s+512 );
return;
}
// bitlen == 1024
mm256_dintrlv_8x32_256( casti_m256i(d0,2), casti_m256i(d1,2),
casti_m256i(d2,2), casti_m256i(d3,2), casti_m256i(d4,2),
casti_m256i(d5,2), casti_m256i(d6,2), casti_m256i(d7,2), s+512 );
mm256_dintrlv_8x32_256( casti_m256i(d0,3), casti_m256i(d1,3),
casti_m256i(d2,3), casti_m256i(d3,3), casti_m256i(d4,3),
casti_m256i(d5,3), casti_m256i(d6,3), casti_m256i(d7,3), s+768 );
}
static inline void mm256_extract_lane_8x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d,0 ) = mm256_get_32(s, lane , lane+ 8, lane+ 16, lane+ 24,
lane+32, lane+ 40, lane+ 48, lane+ 56 );
if ( bit_len <= 256 ) return;
casti_m256i( d,1 ) = mm256_get_32(s, lane+64, lane+ 72, lane+ 80, lane+ 88,
lane+96, lane+104, lane+112, lane+120 );
// bit_len == 512
}
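// Editorial usage sketch, not part of the original header; names are
// hypothetical. Pull a single lane's 256 bit hash out of an 8 way 32 bit
// interleaved result, e.g. to re-test one candidate share.
static inline void example_extr_lane_8x32_256( void *lane_hash,
     const void *vhash, const int lane )
{
   mm256_extract_lane_8x32( lane_hash, vhash, lane, 256 );
}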
// Interleave 4 source buffers containing 64 bit data into the destination
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
static inline void mm256_intrlv_4x64( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, int bit_len )
{
mm256_intrlv_4x64_256( d , casti_m256i(s0,0), casti_m256i(s1,0),
casti_m256i(s2,0), casti_m256i(s3,0) );
if ( bit_len <= 256 ) return;
mm256_intrlv_4x64_256( d+128, casti_m256i(s0,1), casti_m256i(s1,1),
casti_m256i(s2,1), casti_m256i(s3,1) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_4x64_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
casti_m128i(s2,4), casti_m128i(s3,4) );
return;
}
// bit_len == 1024
mm256_intrlv_4x64_256( d+256, casti_m256i(s0,2), casti_m256i(s1,2),
casti_m256i(s2,2), casti_m256i(s3,2) );
mm256_intrlv_4x64_256( d+384, casti_m256i(s0,3), casti_m256i(s1,3),
casti_m256i(s2,3), casti_m256i(s3,3) );
}
// Byte swap and interleave 80 bytes of 32 bit data for 4 lanes (64 bit interleave).
static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s )
{
@@ -481,258 +100,5 @@ do { \
dst[7] = _mm256_blendv_epi8( a[7], b[7], mask ); \
} while(0)
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.
static inline void mm256_dintrlv_4x64( void *d0, void *d1, void *d2,
void *d3, const void *s, int bit_len )
{
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 0, s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 1, s+128 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
mm128_dintrlv_4x64_128( d0, d1, d2, d3, 4, s+256 );
return;
}
// bit_len == 1024
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 2, s+256 );
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 3, s+384 );
}
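// Editorial usage sketch, not part of the original header; buffer names are
// hypothetical. Split a 4 way 64 bit interleaved result back into four
// separate 32 byte lane hashes.
static inline void example_dintrlv_4x64_256( void *hash0, void *hash1,
     void *hash2, void *hash3, const void *vhash )
{
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
}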
// extract and deinterleave specified lane.
#define mm256_extract_lane_4x64_256 \
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 )
static inline void mm256_extract_lane_4x64( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 );
if ( bit_len <= 256 ) return;
casti_m256i( d, 1 ) = mm256_get_64( s, lane+16, lane+20, lane+24, lane+28 );
return;
}
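// Editorial usage sketch, not part of the original header; names are
// hypothetical. Extract only one lane's 256 bit hash from a 4x64 interleaved
// buffer instead of deinterleaving all four lanes.
static inline void example_extr_lane_4x64_256( void *lane_hash,
     const void *vhash, const int lane )
{
   mm256_extract_lane_4x64( lane_hash, vhash, lane, 256 );
}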
// Convert from 4x32 SSE2 interleaving to 4x64 AVX2.
// Can't do it in place
static inline void mm256_rintrlv_4x32_4x64( void *dst, void *src,
int bit_len )
{
__m256i* d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 3],s[ 6],s[ 2],s[ 5],s[ 1],s[ 4],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[ 9],s[12],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
// bit_len == 1024
}
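// Editorial usage sketch, not part of the original header; buffer names are
// hypothetical. Hand a 512 bit intermediate result from a 4 way 32 bit stage
// directly to a 4 way 64 bit stage. Separate buffers are required because
// the conversion can't be done in place.
static inline void example_rintrlv_4x32_4x64_512( void *vhash64,
     void *vhash32 )
{
   mm256_rintrlv_4x32_4x64( vhash64, vhash32, 512 );
}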
// Convert from 4x64 (256 bit) interleaving to 4x32 (128 bit) for AVX.
// bit_len must be multiple of 64
static inline void mm256_rintrlv_4x64_4x32( void *dst, void *src,
int bit_len )
{
__m256i *d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
// bit_len == 1024
}
static inline void mm256_rintrlv_4x64_2x128( void *dst0, void *dst1,
const void *src, int bit_len )
{
__m256i* d0 = (__m256i*)dst0;
__m256i* d1 = (__m256i*)dst1;
uint64_t *s = (uint64_t*)src;
d0[0] = _mm256_set_epi64x( s[ 5], s[ 1], s[ 4], s[ 0] );
d1[0] = _mm256_set_epi64x( s[ 7], s[ 3], s[ 6], s[ 2] );
d0[1] = _mm256_set_epi64x( s[13], s[ 9], s[12], s[ 8] );
d1[1] = _mm256_set_epi64x( s[15], s[11], s[14], s[10] );
if ( bit_len <= 256 ) return;
d0[2] = _mm256_set_epi64x( s[21], s[17], s[20], s[16] );
d1[2] = _mm256_set_epi64x( s[23], s[19], s[22], s[18] );
d0[3] = _mm256_set_epi64x( s[29], s[25], s[28], s[24] );
d1[3] = _mm256_set_epi64x( s[31], s[27], s[30], s[26] );
if ( bit_len <= 512 ) return;
d0[4] = _mm256_set_epi64x( s[37], s[33], s[36], s[32] );
d1[4] = _mm256_set_epi64x( s[39], s[35], s[38], s[34] );
d0[5] = _mm256_set_epi64x( s[45], s[41], s[44], s[40] );
d1[5] = _mm256_set_epi64x( s[47], s[43], s[46], s[42] );
d0[6] = _mm256_set_epi64x( s[53], s[49], s[52], s[48] );
d1[6] = _mm256_set_epi64x( s[55], s[51], s[54], s[50] );
d0[7] = _mm256_set_epi64x( s[61], s[57], s[60], s[56] );
d1[7] = _mm256_set_epi64x( s[63], s[59], s[62], s[58] );
}
static inline void mm256_rintrlv_2x128_4x64( void *dst, const void *src0,
const void *src1, int bit_len )
{
__m256i* d = (__m256i*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
d[ 0] = _mm256_set_epi64x( s1[2], s1[0], s0[2], s0[0] );
d[ 1] = _mm256_set_epi64x( s1[3], s1[1], s0[3], s0[1] );
d[ 2] = _mm256_set_epi64x( s1[6], s1[4], s0[6], s0[4] );
d[ 3] = _mm256_set_epi64x( s1[7], s1[5], s0[7], s0[5] );
if ( bit_len <= 256 ) return;
d[ 4] = _mm256_set_epi64x( s1[10], s1[ 8], s0[10], s0[ 8] );
d[ 5] = _mm256_set_epi64x( s1[11], s1[ 9], s0[11], s0[ 9] );
d[ 6] = _mm256_set_epi64x( s1[14], s1[12], s0[14], s0[12] );
d[ 7] = _mm256_set_epi64x( s1[15], s1[13], s0[15], s0[13] );
if ( bit_len <= 512 ) return;
d[ 8] = _mm256_set_epi64x( s1[18], s1[16], s0[18], s0[16] );
d[ 9] = _mm256_set_epi64x( s1[19], s1[17], s0[19], s0[17] );
d[10] = _mm256_set_epi64x( s1[22], s1[20], s0[22], s0[20] );
d[11] = _mm256_set_epi64x( s1[23], s1[21], s0[23], s0[21] );
d[12] = _mm256_set_epi64x( s1[26], s1[24], s0[26], s0[24] );
d[13] = _mm256_set_epi64x( s1[27], s1[25], s0[27], s0[25] );
d[14] = _mm256_set_epi64x( s1[30], s1[28], s0[30], s0[28] );
d[15] = _mm256_set_epi64x( s1[31], s1[29], s0[31], s0[29] );
}
static inline void mm256_intrlv_2x128( void *d, const void *s0,
       const void *s1, const int bit_len )
{
__m128i s1hi = _mm256_extracti128_si256( casti_m256i( s1,0), 1 );
__m128i s0hi = _mm256_extracti128_si256( casti_m256i( s0,0), 1 );
casti_m256i( d,0 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,0 ) ),
_mm256_castsi256_si128( casti_m256i( s0,0 ) ) );
casti_m256i( d,1 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 256 ) return;
s0hi = _mm256_extracti128_si256( casti_m256i( s0,1), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,1), 1 );
casti_m256i( d,2 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,1 ) ),
_mm256_castsi256_si128( casti_m256i( s0,1 ) ) );
casti_m256i( d,3 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
casti_m256i( d,4 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
return;
}
s0hi = _mm256_extracti128_si256( casti_m256i( s0,2), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,2), 1 );
casti_m256i( d,4 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
casti_m256i( d,5 ) = mm256_concat_128( s1hi, s0hi );
s0hi = _mm256_extracti128_si256( casti_m256i( s0,3), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,3), 1 );
casti_m256i( d,6 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,3 ) ),
_mm256_castsi256_si128( casti_m256i( s0,3 ) ) );
casti_m256i( d,7 ) = mm256_concat_128( s1hi, s0hi );
}
static inline void mm256_dintrlv_2x128( void *dst0, void *dst1, const void *s,
int bit_len )
{
__m256i *d0 = (__m256i*)dst0;
__m256i *d1 = (__m256i*)dst1;
__m256i s0 = casti_m256i( s, 0 );
__m256i s1 = casti_m256i( s, 1 );
d0[0] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[0] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
if ( bit_len <= 256 ) return;
s0 = casti_m256i( s, 2 );
s1 = casti_m256i( s, 3 );
d0[1] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[1] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
if ( bit_len <= 512 ) return;
s0 = casti_m256i( s, 4 );
s1 = casti_m256i( s, 5 );
d0[2] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[2] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
s0 = casti_m256i( s, 6 );
s1 = casti_m256i( s, 7 );
d0[3] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[3] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
}
#undef extr64_cast128_256
#undef extr32_cast128_256
#endif // AVX2
#endif // INTRLV_AVX22_H__
#endif // INTRLV_AVX2_H__

View File

@@ -442,7 +442,7 @@ static inline void mm512_dintrlv_16x32( void *d00, void *d01, void *d02,
1, src+1024 );
}
static inline void mm512_extract_lane_16x32( void *dst, const void *src,
static inline void mm512_extr_lane_16x32( void *dst, const void *src,
const int lane, const int bit_len )
{
if ( bit_len <= 256 )
@@ -506,7 +506,7 @@ static inline void mm512_dintrlv_8x64( void *d0, void *d1, void *d2,
}
// Extract one lane from 64 bit interleaved data
static inline void mm512_extract_lane_8x64( void *d, const void *s,
static inline void mm512_extr_lane_8x64( void *d, const void *s,
const int lane, const int bit_len )
{
if ( bit_len <= 256 )
@@ -661,7 +661,7 @@ static inline void mm512_rintrlv_4x128_8x64( void *dst, const void *src0,
}
static inline void mm512_extract_lane_4x128( void *d, const void *s,
static inline void mm512_extr_lane_4x128( void *d, const void *s,
const int lane, const int bit_len )
{
int l = lane<<1;

View File

@@ -104,7 +104,7 @@ static inline void mm64_dintrlv_2x32( void *d00, void *d01, const int n,
casti_m64( d01,15 ) = mm64_get_32( s, 61, 63 );
}
static inline void mm64_extract_lane_2x32( void *d, const void *s,
static inline void mm64_extr_lane_2x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m64( d, 0 ) = mm64_get_32( s, lane , lane+ 4 );

View File

@@ -0,0 +1,77 @@
#if !defined(INTRLV_SELECTOR_H__)
#define INTRLV_SELECTOR_H__
//////////////////////////////////////////////////////////////
//
// Generic interface for interleaving data for parallel processing.
//
// Best tech is chosen automatically.
/*
#if defined(__AVX512F__)
#define intrlv_4x128 mm512_intrlv_4x128
#define dintrlv_4x128 mm512_dintrlv_4x128
#define intrlv_8x64 mm512_intrlv_8x64
#define dintrlv_8x64 mm512_dintrlv_8x64
#define extr_lane_8x64 mm512_extr_lane_8x64
#define intrlv_16x32 mm512_intrlv_16x32
#define dintrlv_16x32 mm512_dintrlv_16x32
#define extr_lane_16x32 mm512_extr_lane_16x32
#define intrlv_2x128 mm512_intrlv_2x128
#define dintrlv_2x128 mm512_dintrlv_2x128
#define intrlv_4x64 mm512_intrlv_4x64
#define dintrlv_4x64 mm512_dintrlv_4x64
#define extr_lane_4x64 mm512_extr_lane_4x64
#define intrlv_8x32 mm512_intrlv_8x32
#define dintrlv_8x32 mm512_dintrlv_8x32
#define extr_lane_8x32 mm512_extr_lane_8x32
#elif defined(__AVX__)
*/
#if defined(__AVX__)
#define intrlv_2x128 mm256_intrlv_2x128
#define dintrlv_2x128 mm256_dintrlv_2x128
#define intrlv_4x64 mm256_intrlv_4x64
#define dintrlv_4x64 mm256_dintrlv_4x64
#define extr_lane_4x64 mm256_extr_lane_4x64
#define intrlv_8x32 mm256_intrlv_8x32
#define dintrlv_8x32 mm256_dintrlv_8x32
#define extr_lane_8x32 mm256_extr_lane_8x32
#define intrlv_4x32 mm256_intrlv_4x32
#define dintrlv_4x32 mm256_dintrlv_4x32
#define extr_lane_4x32 mm256_extr_lane_4x32
#else
#define intrlv_2x128 mm128_intrlv_2x128
#define dintrlv_2x128 mm128_dintrlv_2x128
#define intrlv_4x64 mm128_intrlv_4x64
#define dintrlv_4x64 mm128_dintrlv_4x64
#define extr_lane_4x64 mm128_extr_lane_4x64
#define intrlv_8x32 mm128_intrlv_8x32
#define dintrlv_8x32 mm128_dintrlv_8x32
#define extr_lane_8x32 mm128_extr_lane_8x32
#define intrlv_2x64 mm128_intrlv_2x64
#define dintrlv_2x64 mm128_dintrlv_2x64
#define extr_lane_2x64 mm128_extr_lane_2x64
#define intrlv_4x32 mm128_intrlv_4x32
#define dintrlv_4x32 mm128_dintrlv_4x32
#define extr_lane_4x32 mm128_extr_lane_4x32
#endif
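// Editorial usage sketch, not part of the original header; "vhash" and
// "hash0".."hash3" are hypothetical buffers. Algorithm code written against
// the generic names compiles unchanged whichever implementation is selected
// above:
//
//   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
//   // ... run the 4 way compression on vhash ...
//   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );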
#endif // INTRLV_SELECTOR_H__

View File

@@ -162,8 +162,8 @@ static inline void mm128_dintrlv_4x32( void *d0, void *d1, void *d2,
}
// extract and deinterleave specified lane.
static inline void mm128_extract_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
static inline void mm128_extr_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m128i( d, 0 ) =
mm128_get_32( s, lane , lane+ 4, lane+ 8, lane+12 );

243
simd-utils/simd-avx.h Normal file
View File

@@ -0,0 +1,243 @@
#if !defined(SIMD_AVX_H__)
#define SIMD_AVX_H__ 1
#if defined(__AVX__)
/////////////////////////////////////////////////////////////////////
//
// AVX 256 bit vectors
//
// Basic support for 256 bit vectors. Most of the good stuff needs AVX2.
// Compile time vector constants and initializers.
//
// The following macro constants and functions should only be used
// for compile time initialization of constant and variable vector
// arrays. These constants use memory, use _mm256_set at run time to
// avoid using memory.
#define mm256_const_64( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
#define mm256_const1_64( x ) {{ x,x,x,x }}
#define mm256_const_32( x7, x6, x5, x4, x3, x2, x1, x0 ) \
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
#define mm256_const1_32( x ) {{ x,x,x,x, x,x,x,x }}
#define mm256_const_16( x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm256_const1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
#define mm256_const_8( x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm256_const1_8( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
// Predefined compile time constant vectors.
// Use Pseudo constants at run time for all simple constant vectors.
#define c256_zero mm256_const1_64( 0ULL )
#define c256_one_256 mm256_const_64( 0ULL, 0ULL, 0ULL, 1ULL )
#define c256_one_128 mm256_const_64( 0ULL, 1ULL, 0ULL, 1ULL )
#define c256_one_64 mm256_const1_64( 1ULL )
#define c256_one_32 mm256_const1_32( 1UL )
#define c256_one_16 mm256_const1_16( 1U )
#define c256_one_8 mm256_const1_8( 1U )
#define c256_neg1 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c256_neg1_64 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c256_neg1_32 mm256_const1_32( 0xFFFFFFFFUL )
#define c256_neg1_16 mm256_const1_16( 0xFFFFU )
#define c256_neg1_8 mm256_const1_8( 0xFFU )
//
// Pseudo constants.
// These can't be used for compile time initialization but are preferable
// for simple constant vectors at run time.
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 _mm256_set_epi64x( 0ULL, 0ULL, 0ULL, 1ULL )
#define m256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
#define m256_one_64 _mm256_set1_epi64x( 1ULL )
#define m256_one_32 _mm256_set1_epi32( 1UL )
#define m256_one_16 _mm256_set1_epi16( 1U )
#define m256_one_8 _mm256_set1_epi8( 1U )
#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
//
// Vector size conversion.
//
// Allows operations on either or both halves of a 256 bit vector serially.
// Handy for parallel AES.
// Caveats:
// _mm256_castsi256_si128 is free and without side effects.
// _mm256_castsi128_si256 is also free but leaves the high half
// undefined. That's ok if the hi half will be subsequently assigned.
// If assigning both, do lo first. If assigning only one, use
// _mm256_insertf128_si256.
//
// What to do about extractf128 (AVX) and extracti128 (AVX2)?
#define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a )
#define mm128_extr_hi128_256( a ) _mm256_extractf128_si256( a, 1 )
// Extract 4 u64 from 256 bit vector.
#define mm256_extr_4x64( a0, a1, a2, a3, src ) \
do { \
__m128i hi = _mm256_extractf128_si256( src, 1 ); \
a0 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 0 ); \
a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \
a2 = _mm_extract_epi64( hi, 0 ); \
a3 = _mm_extract_epi64( hi, 1 ); \
} while(0)
#define mm256_extr_8x32( a0, a1, a2, a3, a4, a5, a6, a7, src ) \
do { \
__m128i hi = _mm256_extractf128_si256( src, 1 ); \
a0 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 0 ); \
a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \
a2 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 2 ); \
a3 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 3 ); \
a4 = _mm_extract_epi32( hi, 0 ); \
a5 = _mm_extract_epi32( hi, 1 ); \
a6 = _mm_extract_epi32( hi, 2 ); \
a7 = _mm_extract_epi32( hi, 3 ); \
} while(0)
// input __m128i, returns __m256i
// To build a 256 bit vector from 2 128 bit vectors lo must be done first.
// lo alone leaves hi undefined, hi alone leaves lo unchanged.
// Both cost one clock while preserving the other half.
// Insert b into specified half of a leaving other half of a unchanged.
#define mm256_ins_lo128_256( a, b ) _mm256_insertf128_si256( a, b, 0 )
#define mm256_ins_hi128_256( a, b ) _mm256_insertf128_si256( a, b, 1 )
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi )
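// Editorial sketch, not part of the original header: pack two independent
// 128 bit values into one 256 bit register. The low half is placed first via
// the cast, then the high half is inserted.
static inline __m256i example_concat_128( __m128i hi, __m128i lo )
{
   return mm256_concat_128( hi, lo );   // returns { hi, lo }
}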
// Horizontal vector testing
// Needs int128 support
// Bit-wise test of entire vector, useful to test results of cmp.
#define mm256_anybits0( a ) \
( (uint128_t)mm128_extr_hi128_256( a ) \
| (uint128_t)mm128_extr_lo128_256( a ) )
#define mm256_anybits1( a ) \
( ( (uint128_t)mm128_extr_hi128_256( a ) + 1 ) \
| ( (uint128_t)mm128_extr_lo128_256( a ) + 1 ) )
#define mm256_allbits0_256( a ) ( !mm256_anybits1(a) )
#define mm256_allbits1_256( a ) ( !mm256_anybits0(a) )
// Parallel AES, for when x is expected to be in a 256 bit register.
#define mm256_aesenc_2x128( x ) \
mm256_concat_128( \
_mm_aesenc_si128( mm128_extr_hi128_256( x ), m128_zero ), \
_mm_aesenc_si128( mm128_extr_lo128_256( x ), m128_zero ) )
#define mm256_aesenckey_2x128( x, k ) \
   mm256_concat_128( \
        _mm_aesenc_si128( mm128_extr_hi128_256( x ), \
                          mm128_extr_hi128_256( k ) ), \
        _mm_aesenc_si128( mm128_extr_lo128_256( x ), \
                          mm128_extr_lo128_256( k ) ) )
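// Editorial usage sketch, not part of the original header: one AES round
// applied to two independent 128 bit states held in the halves of a single
// 256 bit register, with a zero round key. Assumes AES-NI is available and
// that m128_zero from the companion SSE2 header is in scope.
static inline __m256i example_aesenc_2x128( __m256i two_states )
{
   return mm256_aesenc_2x128( two_states );
}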
#define mm256_paesenc_2x128( y, x ) do \
{ \
  __m128i *X = (__m128i*)x; \
  __m128i *Y = (__m128i*)y; \
  Y[0] = _mm_aesenc_si128( X[0], m128_zero ); \
  Y[1] = _mm_aesenc_si128( X[1], m128_zero ); \
} while(0);
// With pointers.
#define mm256_paesenckey_2x128( y, x, k ) do \
{ \
  __m128i *X = (__m128i*)x; \
  __m128i *Y = (__m128i*)y; \
  __m128i *K = (__m128i*)k; \
  Y[0] = _mm_aesenc_si128( X[0], K[0] ); \
  Y[1] = _mm_aesenc_si128( X[1], K[1] ); \
} while(0);
//
// Pointer casting
// p = any aligned pointer
// returns p as pointer to vector type, not very useful
#define castp_m256i(p) ((__m256i*)(p))
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m256i(p) (*((__m256i*)(p)))
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
// Gather scatter
#define mm256_gather_64( d, s0, s1, s2, s3 ) \
((uint64_t*)(d))[0] = (uint64_t)(s0); \
((uint64_t*)(d))[1] = (uint64_t)(s1); \
((uint64_t*)(d))[2] = (uint64_t)(s2); \
((uint64_t*)(d))[3] = (uint64_t)(s3);
#define mm256_gather_32( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
((uint32_t*)(d))[0] = (uint32_t)(s0); \
((uint32_t*)(d))[1] = (uint32_t)(s1); \
((uint32_t*)(d))[2] = (uint32_t)(s2); \
((uint32_t*)(d))[3] = (uint32_t)(s3); \
((uint32_t*)(d))[4] = (uint32_t)(s4); \
((uint32_t*)(d))[5] = (uint32_t)(s5); \
((uint32_t*)(d))[6] = (uint32_t)(s6); \
((uint32_t*)(d))[7] = (uint32_t)(s7);
// Scatter data from contiguous memory.
// All arguments are pointers
#define mm256_scatter_64( d0, d1, d2, d3, s ) \
*((uint64_t*)(d0)) = ((uint64_t*)(s))[0]; \
*((uint64_t*)(d1)) = ((uint64_t*)(s))[1]; \
*((uint64_t*)(d2)) = ((uint64_t*)(s))[2]; \
*((uint64_t*)(d3)) = ((uint64_t*)(s))[3];
#define mm256_scatter_32( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
*((uint32_t*)(d0)) = ((uint32_t*)(s))[0]; \
*((uint32_t*)(d1)) = ((uint32_t*)(s))[1]; \
*((uint32_t*)(d2)) = ((uint32_t*)(s))[2]; \
*((uint32_t*)(d3)) = ((uint32_t*)(s))[3]; \
*((uint32_t*)(d4)) = ((uint32_t*)(s))[4]; \
*((uint32_t*)(d5)) = ((uint32_t*)(s))[5]; \
*((uint32_t*)(d6)) = ((uint32_t*)(s))[6]; \
*((uint32_t*)(d7)) = ((uint32_t*)(s))[7];
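// Editorial usage sketch, not part of the original header; names are
// hypothetical. Gather the 64 bit result words of 4 lanes into a contiguous
// buffer, then scatter them back out.
static inline void example_gather_scatter_64( uint64_t *buf, uint64_t *r0,
     uint64_t *r1, uint64_t *r2, uint64_t *r3 )
{
   mm256_gather_64( buf, *r0, *r1, *r2, *r3 );
   mm256_scatter_64( r0, r1, r2, r3, buf );
}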
//
// Memory functions
// n = number of 256 bit (32 byte) vectors
static inline void memset_zero_256( __m256i *dst, int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m256_zero; }
static inline void memset_256( __m256i *dst, const __m256i a, int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
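// Editorial usage sketch, not part of the original header; the function name
// is hypothetical. Zero a 4 way interleaved 512 bit hash buffer:
// 4 lanes * 64 bytes = 256 bytes = 8 vectors of 32 bytes.
static inline void example_clear_vhash512_4way( __m256i *vhash )
{
   memset_zero_256( vhash, 8 );
}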
#endif // __AVX__
#endif // SIMD_AVX_H__

View File

@@ -17,66 +17,6 @@
// Constants of these types reside in memory.
// Compile time vector constants and initializers.
//
// The following macro constants and functions should only be used
// for compile time initialization of constant and variable vector
// arrays. These constants use memory, use _mm256_set at run time to
// avoid using memory.
#define mm256_const_64( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
#define mm256_const1_64( x ) {{ x,x,x,x }}
#define mm256_const_32( x7, x6, x5, x4, x3, x2, x1, x0 ) \
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
#define mm256_const1_32( x ) {{ x,x,x,x, x,x,x,x }}
#define mm256_const_16( x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm256_const1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
#define mm256_const_8( x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm256_const1_8( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
// Predefined compile time constant vectors.
// Use Pseudo constants at run time for all simple constant vectors.
#define c256_zero mm256_const1_64( 0ULL )
#define c256_one_256 mm256_const_64( 0ULL, 0ULL, 0ULL, 1ULL )
#define c256_one_128 mm256_const_64( 0ULL, 1ULL, 0ULL, 1ULL )
#define c256_one_64 mm256_const1_64( 1ULL )
#define c256_one_32 mm256_const1_32( 1UL )
#define c256_one_16 mm256_const1_16( 1U )
#define c256_one_8 mm256_const1_8( 1U )
#define c256_neg1 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c256_neg1_64 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c256_neg1_32 mm256_const1_32( 0xFFFFFFFFUL )
#define c256_neg1_16 mm256_const1_16( 0xFFFFU )
#define c256_neg1_8 mm256_const1_8( 0xFFU )
//
// Pseudo constants.
// These can't be used for compile time initialization but are preferable
// for simple constant vectors at run time.
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 _mm256_set_epi64x( 0ULL, 0ULL, 0ULL, 1ULL )
#define m256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
#define m256_one_64 _mm256_set1_epi64x( 1ULL )
#define m256_one_32 _mm256_set1_epi32( 1UL )
#define m256_one_16 _mm256_set1_epi16( 1U )
#define m256_one_8 _mm256_set1_epi8( 1U )
#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
//
// Basic operations without SIMD equivalent
@@ -88,6 +28,11 @@
#define mm256_negate_32( a ) _mm256_sub_epi32( m256_zero, a )
#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a )
/***************************
*
* extracti128 (AVX2) vs extractf128 (AVX)???
//
// Vector size conversion.
//
@@ -133,7 +78,9 @@ do { \
// Insert b into specified half of a leaving other half of a unchanged.
#define mm256_ins_lo128_256( a, b ) _mm256_inserti128_si256( a, b, 0 )
#define mm256_ins_hi128_256( a, b ) _mm256_inserti128_si256( a, b, 1 )
*/
/*
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi )
@@ -254,6 +201,8 @@ static inline void memset_256( __m256i *dst, const __m256i a, int n )
static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
*************************************/
//
// Bit rotations.
//

View File

@@ -14,11 +14,13 @@
#ifndef WIN32
#define HWMON_PATH \
"/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
"/sys/class/hwmon/hwmon2/temp1_input"
#define HWMON_ALT \
"/sys/class/hwmon/hwmon1/temp1_input"
#define HWMON_ALT2 \
"/sys/class/hwmon/hwmon0/temp1_input"
#define HWMON_ALT1 \
"/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
#define HWMON_ALT2 \
"/sys/class/hwmon/hwmon1/temp1_input"
#define HWMON_ALT3 \
"/sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input"
#define HWMON_ALT4 \

5
util.c
View File

@@ -1631,7 +1631,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
hashrate += thr_hashrates[i];
pthread_mutex_unlock(&stats_lock);
double diff = trunc( ( ((double)0xffffffff) / target ) );
if ( opt_showdiff )
if ( !opt_quiet )
// xmr pool diff can change a lot...
applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
stratum_diff = diff;
@@ -1813,7 +1813,8 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
/* store for api stats */
stratum_diff = diff;
applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
if ( !opt_quiet )
applog(LOG_BLUE, "Stratum difficulty set to %g", diff);
return true;
}