mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.7.6
@@ -108,6 +108,7 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2z.c \
  algo/lyra2/lyra2z-4way.c \
  algo/lyra2/lyra2z330.c \
  algo/lyra2/lyra2h.c \
  algo/m7m.c \
  algo/neoscrypt.c \
  algo/nist5/nist5-gate.c \
@@ -128,6 +129,7 @@ cpuminer_SOURCES = \
  algo/sha/sha256t.c \
  algo/shabal/sph_shabal.c \
  algo/shavite/sph_shavite.c \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite.c \
  algo/simd/sph_simd.c \
  algo/simd/sse2/nist.c \
@@ -155,11 +157,12 @@ cpuminer_SOURCES = \
  algo/whirlpool/whirlpool-4way.c \
  algo/whirlpool/whirlpool.c \
  algo/whirlpool/whirlpoolx.c \
  algo/x11/phi1612.c \
  algo/x11/x11-gate.c \
  algo/x11/x11.c \
  algo/x11/x11evo.c \
  algo/x11/x11gost.c \
  algo/x11/c11.c \
  algo/x11/phi1612.c \
  algo/x13/x13.c \
  algo/x13/x13sm3.c \
  algo/x14/x14.c \
@@ -40,6 +40,7 @@ Supported Algorithms
    keccakc       Creative coin
    lbry          LBC, LBRY Credits
    luffa         Luffa
    lyra2h        Hppcoin
    lyra2re       lyra2
    lyra2rev2     lyra2v2, Vertcoin
    lyra2z        Zcoin (XZC)
@@ -164,6 +164,12 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.

Change Log
----------

v3.7.6

Added lyra2h algo for Hppcoin.
Added support for more than 64 CPUs.
Optimized shavite512 with AES, improves x11 etc.

v3.7.5

New algo keccakc for Creative coin with 4way optimizations
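Note on the shavite512 entry above: the speedup comes from the new AES-NI code path added by this commit (algo/shavite/sph-shavite-aesni.c, shown later in this diff). SHAvite-3's compression is built from keyless AES rounds, which map directly onto the AESENC instruction with an all-zero round key. A minimal sketch of that primitive; the helper name here is ours, not the project's:

#include <immintrin.h>

// One keyless AES round, as used throughout the new sph-shavite-aesni.c
// (there written as _mm_aesenc_si128( x, mm_zero )).
static inline __m128i aes_round_nokey( __m128i x )
{
    return _mm_aesenc_si128( x, _mm_setzero_si128() );
}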
@@ -138,6 +138,10 @@ void init_algo_gate( algo_gate_t* gate )
   gate->work_cmp_size = STD_WORK_CMP_SIZE;
}

// Ignore warnings for not yet defined register functions
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"

// called by each thread that uses the gate
bool register_algo_gate( int algo, algo_gate_t *gate )
{
@@ -151,11 +155,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

  switch (algo)
  {

// Ignore warnings for not yet defined register functions
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"

    case ALGO_ARGON2:      register_argon2_algo      ( gate ); break;
    case ALGO_AXIOM:       register_axiom_algo       ( gate ); break;
    case ALGO_BASTION:     register_bastion_algo     ( gate ); break;
@@ -180,6 +179,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_KECCAKC:     register_keccakc_algo     ( gate ); break;
    case ALGO_LBRY:        register_lbry_algo        ( gate ); break;
    case ALGO_LUFFA:       register_luffa_algo       ( gate ); break;
    case ALGO_LYRA2H:      register_lyra2h_algo      ( gate ); break;
    case ALGO_LYRA2RE:     register_lyra2re_algo     ( gate ); break;
    case ALGO_LYRA2REV2:   register_lyra2rev2_algo   ( gate ); break;
    case ALGO_LYRA2Z:      register_lyra2z_algo      ( gate ); break;
@@ -221,10 +221,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_YESCRYPT:    register_yescrypt_algo    ( gate ); break;
    case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
    case ALGO_ZR5:         register_zr5_algo         ( gate ); break;

// restore warnings
#pragma GCC diagnostic pop

   default:
      applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
      return false;
@@ -239,6 +235,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
  return true;
}

// restore warnings
#pragma GCC diagnostic pop

// override std defaults with jr2 defaults
bool register_json_rpc2( algo_gate_t *gate )
{
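For context, the register_*_algo functions dispatched above all follow the same pattern: init_algo_gate() installs safe defaults, then the per-algorithm function overrides only the gate entries it needs (concrete examples appear later in this diff for lyra2h, x11, whirlpool and veltor). A hedged sketch with a hypothetical algorithm name:

bool register_myalgo_algo( algo_gate_t* gate )
{
   // Override only what differs from the defaults set by init_algo_gate();
   // "myalgo" and its functions are placeholders, not project code.
   gate->optimizations = SSE2_OPT | AES_OPT;
   gate->scanhash      = (void*)&scanhash_myalgo;
   gate->hash          = (void*)&myalgo_hash;
   return true;
}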
@@ -32,12 +32,12 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done )
{
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[4*4] __attribute__ ((aligned (32)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
//   uint32_t HTarget = ptarget[7];
   uint32_t _ALIGN(32) endiandata[20];
   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
@@ -47,18 +47,17 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
//     HTarget = 0x7f;

   // we need big endian data...
   swab32_array( endiandata, pdata, 20 );
   swab32_array( edata, pdata, 20 );

   mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
                       endiandata, 640 );
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   uint32_t *noncep = vdata + 76;   // 19*4
   do {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep,    n   );
      be32enc( noncep +2, n+1 );
      be32enc( noncep +4, n+2 );
      be32enc( noncep +6, n+3 );
      be32enc( noncep +1, n+1 );
      be32enc( noncep +2, n+2 );
      be32enc( noncep +3, n+3 );

      blakehash_4way( hash, vdata );

@@ -74,7 +73,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
      }
      if ( (hash+8)[7] == 0 )
      {
         if ( fulltest( hash, ptarget ) )
         if ( fulltest( hash+8, ptarget ) )
         {
            found[1] = true;
            num_found++;
@@ -83,7 +82,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
      }
      if ( (hash+16)[7] == 0 )
      {
         if ( fulltest( hash, ptarget ) )
         if ( fulltest( hash+8, ptarget ) )
         {
            found[2] = true;
            num_found++;
@@ -92,15 +91,14 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
      }
      if ( (hash+24)[7] == 0 )
      {
         if ( fulltest( hash, ptarget ) )
         if ( fulltest( hash+8, ptarget ) )
         {
            found[3] = true;
            num_found++;
            nonces[3] = n+3;
         }
      }

      n += 4;
      n += 4;
      *hashes_done = n - first_nonce + 1;

   } while ( (num_found == 0) && (n < max_nonce)
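The 4-way scanhash above relies on mm_interleave_4x32 packing four copies of the 80-byte header into one lane-interleaved buffer so the vectorized hash processes four nonces per call. A hedged scalar sketch of the layout we assume that helper produces (it is not the project's actual implementation): word i of source lane j lands at dst[4*i + j], which is why the nonce at word 19 sits at vdata + 76 and the four per-lane nonces are written at noncep .. noncep+3.

static void interleave_4x32_ref( uint32_t *dst, const uint32_t *s0,
                                 const uint32_t *s1, const uint32_t *s2,
                                 const uint32_t *s3, int bit_len )
{
   int nwords = bit_len / 32;          // 640 bits -> 20 words per lane
   for ( int i = 0; i < nwords; i++ )
   {
      dst[ 4*i + 0 ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}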
@@ -17,7 +17,6 @@ bool register_blake_algo( algo_gate_t* gate )
  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash = (void*)&scanhash_blake_4way;
  gate->hash     = (void*)&blakehash_4way;
  four_way_not_tested();
#else
  gate->scanhash = (void*)&scanhash_blake;
  gate->hash     = (void*)&blakehash;
@@ -524,18 +524,18 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( s0, _mmset_epi32( CS0, CS0, CS0, CS0 ) ); \
|
||||
V9 = _mm_xor_si128( s1, _mmset_epi32( CS1, CS1, CS1, CS1 ) ); \
|
||||
VA = _mm_xor_si128( s2, _mmset_epi32( CS2, CS2, CS2, CS2 ) ); \
|
||||
VB = _mm_xor_si128( s3, _mmset_epi32( CS3, CS3, CS3, CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
|
||||
_mmset_epi32( CS4, CS4, CS4, CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
|
||||
_mmset_epi32( CS5, CS5, CS5, CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ) \
|
||||
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
|
||||
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
|
||||
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
|
||||
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
|
||||
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
|
||||
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
|
||||
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
|
||||
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
|
||||
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
|
||||
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
|
||||
@@ -710,18 +710,18 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si128( _mm256_set_epi64( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
|
||||
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
|
||||
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
|
||||
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
|
||||
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
|
||||
@@ -867,7 +867,6 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
|
||||
if ( len < buf_size - ptr )
|
||||
{
|
||||
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
|
||||
@@ -915,9 +914,10 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
unsigned z = 0x80 >> n;
|
||||
unsigned zz = ((ub & -z) | z) & 0xFF;
|
||||
u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
|
||||
// unsigned z = 0x80 >> n;
|
||||
// unsigned zz = ((ub & -z) | z) & 0xFF;
|
||||
// u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
|
||||
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
@@ -934,9 +934,11 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
|
||||
if ( ptr <= 48 )
|
||||
// if ( ptr <= 48 )
|
||||
if ( ptr <= 52 )
|
||||
{
|
||||
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
|
||||
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||
// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
|
||||
if (out_size_w32 == 8)
|
||||
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
|
||||
_mm_set_epi32( 0x010000000, 0x01000000,
|
||||
@@ -962,6 +964,7 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
out = (__m128i*)dst;
|
||||
for ( k = 0; k < out_size_w32; k++ )
|
||||
out[k] = mm_byteswap_32( sc->H[k] );
|
||||
// out[k] = sc->H[k];
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
@@ -13,46 +13,35 @@ static __thread bool ctx_midstate_done = false;
|
||||
|
||||
void decred_hash_4way( void *state, const void *input )
|
||||
{
|
||||
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[4] __attribute__ ((aligned (32)));
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
blake256_4way_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
|
||||
uint32_t hash[16] __attribute__ ((aligned (64)));
|
||||
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
|
||||
|
||||
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
|
||||
|
||||
void *tail = input + DECRED_MIDSTATE_LEN;
|
||||
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
|
||||
int tail_len = 180 - DECRED_MIDSTATE_LEN;
|
||||
// #define MIDSTATE_LEN 128
|
||||
/*
|
||||
uint8_t *ending = (uint8_t*) input;
|
||||
ending += MIDSTATE_LEN;
|
||||
|
||||
if ( !ctx_midstate_done )
|
||||
{
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
|
||||
ctx_midstate_done = true;
|
||||
}
|
||||
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
|
||||
|
||||
blake256_4way( &ctx, tail, tail_len );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
sph_blake256_init( &ctx2 );
|
||||
sph_blake256( &ctx2, sin0, 180 );
|
||||
sph_blake256_close( &ctx2, hash );
|
||||
|
||||
*/
|
||||
/*
|
||||
blake256_4way_init( &ctx );
|
||||
blake256_4way( &ctx, input, 180 );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
|
||||
*/
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
/*
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
@@ -66,22 +55,21 @@ printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
|
||||
printf("\n");
|
||||
*/
|
||||
|
||||
// memcpy( state, hash0, 32 );
|
||||
// memcpy( state+32, hash1, 32 );
|
||||
// memcpy( state+64, hash1, 32 );
|
||||
// memcpy( state+96, hash1, 32 );
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
|
||||
memcpy( state, hash, 32 );
|
||||
// memcpy( state, hash, 32 );
|
||||
|
||||
}
|
||||
|
||||
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
{
|
||||
uint32_t vdata[45*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[4*4] __attribute__ ((aligned (32)));
|
||||
uint32_t _ALIGN(64) endiandata[48];
|
||||
// uint32_t _ALIGN(64) hash32[8];
|
||||
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t _ALIGN(64) edata[48];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
|
||||
@@ -91,28 +79,25 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
|
||||
// #define DCR_NONCE_OFT32 35
|
||||
|
||||
ctx_midstate_done = false;
|
||||
|
||||
// memcpy(endiandata, pdata, 180);
|
||||
memcpy( edata, pdata, 180 );
|
||||
|
||||
// use the old way until new way updated for size.
|
||||
mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
|
||||
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
|
||||
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
* noncep = n;
|
||||
*(noncep+2) = n+1;
|
||||
*(noncep+4) = n+2;
|
||||
*(noncep+6) = n+3;
|
||||
*(noncep+1) = n+1;
|
||||
*(noncep+2) = n+2;
|
||||
*(noncep+3) = n+3;
|
||||
|
||||
decred_hash_4way( hash, vdata );
|
||||
|
||||
// endiandata[DCR_NONCE_OFT32] = n;
|
||||
// decred_hash(hash32, endiandata);
|
||||
|
||||
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
@@ -121,29 +106,47 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
nonces[0] = n;
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
}
|
||||
/* if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
|
||||
/*
|
||||
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
printf("found 1\n");
|
||||
|
||||
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
|
||||
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
|
||||
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
|
||||
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
|
||||
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n;
|
||||
nonces[1] = n+1;
|
||||
}
|
||||
*/
|
||||
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n;
|
||||
nonces[2] = n+2;
|
||||
}
|
||||
/*
|
||||
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
printf("found 3\n");
|
||||
|
||||
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
|
||||
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
|
||||
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
|
||||
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
|
||||
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n;
|
||||
nonces[3] = n+3;
|
||||
}
|
||||
*/
|
||||
n += 4;
|
||||
n += 2;
|
||||
// n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
@@ -872,6 +872,7 @@ blake32_close(sph_blake_small_context *sc,
	} else {
		sc->T0 -= 512 - bit_len;
	}

	if (bit_len <= 446) {
		memset(u.buf + ptr + 1, 0, 55 - ptr);
		if (out_size_w32 == 8)
@@ -25,7 +25,7 @@ void jha_hash_4way( void *out, const void *input )
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
    uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
    uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
    __m256i mask0, mask1;
    __m256i mask, mask0, mask1;
    __m256i* vh  = (__m256i*)vhash;
    __m256i* vh0 = (__m256i*)vhash0;
    __m256i* vh1 = (__m256i*)vhash1;
@@ -47,38 +47,37 @@ void jha_hash_4way( void *out, const void *input )
    // Heavy & Light Pair Loop
    for ( int round = 0; round < 3; round++ )
    {
//       memset_zero_256( vh0, 20 );
//       memset_zero_256( vh1, 20 );

       // positive logic, if mask_i select vh_i
       // going from bit to mask reverses logic such that if the test bit is set
       // zero will be put in mask0, meaning don't take vh0. mask1 is
       // inverted so 1 will be put in mask1 meaning take it.
       mask0 = mm256_negate_64(
       // select next function based on bit 0 of previous hash.
       // Speculatively execute both functions and use mask to
       // select results from correct function for each lane.
       // hash = mask ? vhash1 : vhash0
       mask = mm256_negate_64(
                _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
       mask1 = mm256_not( mask0 );

       // second version
       // mask0 = mask
       // mask1 = mm256_not( mask );

       // first version
       // mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
       //        _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );

       // groestl (serial) v skein
       // groestl (serial) vs skein

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash0,
                                 (char*)hash0, 512 );

                                 (char*)hash0, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash1,
                                 (char*)hash1, 512 );

                                 (char*)hash1, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash2,
                                 (char*)hash2, 512 );
                                 (char*)hash2, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash3,
                                 (char*)hash3, 512 );
                                 (char*)hash3, 512 );

       mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );

@@ -91,14 +90,20 @@ void jha_hash_4way( void *out, const void *input )
       // merge vectored hash
       for ( int i = 0; i < 8; i++ )
       {
          vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
                                   _mm256_and_si256( vh1[i], mask1 ) );
          // blend should be faster
          vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );

          // second version
          // vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
          //                          _mm256_and_si256( vh1[i], mask1 ) );

          // first version
/*
          vha256[i] = _mm256_maskload_epi64(
                         vhasha + i*4, mm256_not( mask ) );
          vhb256[i] = _mm256_maskload_epi64(
                         vhashb + i*4, mask );
          vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
          vh0[i] = _mm256_maskload_epi64(
                      vhash0 + i*4, mm256_not( mask ) );
          vh1[i] = _mm256_maskload_epi64(
                      vhash1 + i*4, mask );
          vh[i] = _mm256_or_si256( vh0[i], vh1[i] );
*/
       }
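The comments above describe the change from an and/or merge with a mask pair to a single _mm256_blendv_epi8 on one mask. Because mm256_negate_64 saturates each 64-bit lane to all-ones or all-zeros, the two forms select the same lanes: blendv picks bytes from its second operand wherever the top bit of the corresponding mask byte is set. A small self-contained check (not project code; compile with -mavx2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   __m256i a    = _mm256_set1_epi64x( 0x1111111111111111ULL );
   __m256i b    = _mm256_set1_epi64x( 0x2222222222222222ULL );
   __m256i mask = _mm256_set_epi64x( 0, -1, 0, -1 );    // lanes 0 and 2 take b

   __m256i blend = _mm256_blendv_epi8( a, b, mask );    // new style
   __m256i andor = _mm256_or_si256(                     // old style
                      _mm256_andnot_si256( mask, a ),
                      _mm256_and_si256( mask, b ) );

   uint64_t x[4], y[4];
   _mm256_storeu_si256( (__m256i*)x, blend );
   _mm256_storeu_si256( (__m256i*)y, andor );
   for ( int i = 0; i < 4; i++ )
      printf( "lane %d: %016llx %016llx\n", i,
              (unsigned long long)x[i], (unsigned long long)y[i] );
   return 0;
}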
algo/lyra2/lyra2h.c (new file, 93 lines)
@@ -0,0 +1,93 @@
#include <memory.h>
#include <mm_malloc.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"

__thread uint64_t* lyra2h_matrix;

bool lyra2h_thread_init()
{
   const int i = 16 * 16 * 96;
   lyra2h_matrix = _mm_malloc( i, 64 );
   return lyra2h_matrix;
}

static __thread sph_blake256_context lyra2h_blake_mid;

void lyra2h_midstate( const void* input )
{
   sph_blake256_init( &lyra2h_blake_mid );
   sph_blake256( &lyra2h_blake_mid, input, 64 );
}

void lyra2h_hash( void *state, const void *input )
{
   uint32_t _ALIGN(64) hash[16];

   sph_blake256_context ctx_blake __attribute__ ((aligned (64)));

   memcpy( &ctx_blake, &lyra2h_blake_mid, sizeof lyra2h_blake_mid );
   sph_blake256( &ctx_blake, input + 64, 16 );
   sph_blake256_close( &ctx_blake, hash );

   LYRA2Z( lyra2h_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);

   memcpy(state, hash, 32);
}

int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done )
{
   uint32_t _ALIGN(64) hash[8];
   uint32_t _ALIGN(64) endiandata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t nonce = first_nonce;

   if (opt_benchmark)
      ptarget[7] = 0x0000ff;

   for (int i=0; i < 19; i++) {
      be32enc(&endiandata[i], pdata[i]);
   }

   lyra2h_midstate( endiandata );

   do {
      be32enc(&endiandata[19], nonce);
      lyra2h_hash( hash, endiandata );

      if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
         work_set_target_ratio(work, hash);
         pdata[19] = nonce;
         *hashes_done = pdata[19] - first_nonce;
         return 1;
      }
      nonce++;

   } while (nonce < max_nonce && !work_restart[thr_id].restart);

   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}

void lyra2h_set_target( struct work* work, double job_diff )
{
   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}

bool register_lyra2h_algo( algo_gate_t* gate )
{
   gate->optimizations = AVX_OPT | AVX2_OPT;
   gate->miner_thread_init = (void*)&lyra2h_thread_init;
   gate->scanhash   = (void*)&scanhash_lyra2h;
   gate->hash       = (void*)&lyra2h_hash;
   gate->get_max64  = (void*)&get_max64_0xffffLL;
   gate->set_target = (void*)&lyra2h_set_target;
   return true;
};
@@ -110,7 +110,8 @@ printf("found 0\n");
         nonces[0] = pdata[19] = n;
         work_set_target_ratio( work, hash );
      }
/*    if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
/*
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
      {
printf("found 1\n");
         found[1] = true;
@@ -65,13 +65,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   G_4X64( s0, s1, s2, s3 ); \
   s1 = mm256_rotl256_1x64( s1); \
   s1 = mm256_rotr256_1x64( s1); \
   s2 = mm256_swap_128( s2 ); \
   s3 = mm256_rotr256_1x64( s3 ); \
   s3 = mm256_rotl256_1x64( s3 ); \
   G_4X64( s0, s1, s2, s3 ); \
   s1 = mm256_rotr256_1x64( s1 ); \
   s1 = mm256_rotl256_1x64( s1 ); \
   s2 = mm256_swap_128( s2 ); \
   s3 = mm256_rotl256_1x64( s3 );
   s3 = mm256_rotr256_1x64( s3 );

#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
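The fix above swaps the rotation direction applied to the s1/s3 rows of the Lyra2 round. Assuming mm256_rotr256_1x64 / mm256_rotl256_1x64 rotate the four 64-bit lanes of a 256-bit register by one position (our reading of the names, not a definition taken from this diff), each is a single AVX2 lane permute:

#include <immintrin.h>

// Sketch only: rotate 64-bit lanes right/left by one lane position.
// 0x39 selects lanes 1,2,3,0; 0x93 selects lanes 3,0,1,2.
#define rotr256_1x64_sketch( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define rotl256_1x64_sketch( v ) _mm256_permute4x64_epi64( v, 0x93 )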
algo/shavite/sph-shavite-aesni.c (new file, 673 lines)
@@ -0,0 +1,673 @@
|
||||
/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */
|
||||
/*
|
||||
* SHAvite-3 implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __AES__
|
||||
|
||||
#include "sph_shavite.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE
|
||||
#define SPH_SMALL_FOOTPRINT_SHAVITE 1
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
#define C32 SPH_C32
|
||||
|
||||
/*
|
||||
* As of round 2 of the SHA-3 competition, the published reference
|
||||
* implementation and test vectors are wrong, because they use
|
||||
* big-endian AES tables while the internal decoding uses little-endian.
|
||||
* The code below follows the specification. To turn it into a code
|
||||
* which follows the reference implementation (the one called "BugFix"
|
||||
* on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
|
||||
* the code below (from the '#define AES_BIG_ENDIAN...' to the definition
|
||||
* of the AES_ROUND_NOKEY macro) and replace it with the version which
|
||||
* is commented out afterwards.
|
||||
*/
|
||||
|
||||
#define AES_BIG_ENDIAN 0
|
||||
#include "algo/sha/aes_helper.c"
|
||||
|
||||
static const sph_u32 IV512[] = {
|
||||
C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
|
||||
C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
|
||||
C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
|
||||
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
|
||||
};
|
||||
|
||||
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
|
||||
sph_u32 t0 = (x0); \
|
||||
sph_u32 t1 = (x1); \
|
||||
sph_u32 t2 = (x2); \
|
||||
sph_u32 t3 = (x3); \
|
||||
AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \
|
||||
sph_u32 kt; \
|
||||
AES_ROUND_NOKEY(k1, k2, k3, k0); \
|
||||
kt = (k0); \
|
||||
(k0) = (k1); \
|
||||
(k1) = (k2); \
|
||||
(k2) = (k3); \
|
||||
(k3) = kt; \
|
||||
} while (0)
|
||||
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_SHAVITE
|
||||
|
||||
/*
|
||||
* This function assumes that "msg" is aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
c512(sph_shavite_big_context *sc, const void *msg)
|
||||
{
|
||||
sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
|
||||
sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
|
||||
sph_u32 rk[448];
|
||||
size_t u;
|
||||
int r, s;
|
||||
|
||||
#if SPH_LITTLE_ENDIAN
|
||||
memcpy(rk, msg, 128);
|
||||
#else
|
||||
for (u = 0; u < 32; u += 4) {
|
||||
rk[u + 0] = sph_dec32le_aligned(
|
||||
(const unsigned char *)msg + (u << 2) + 0);
|
||||
rk[u + 1] = sph_dec32le_aligned(
|
||||
(const unsigned char *)msg + (u << 2) + 4);
|
||||
rk[u + 2] = sph_dec32le_aligned(
|
||||
(const unsigned char *)msg + (u << 2) + 8);
|
||||
rk[u + 3] = sph_dec32le_aligned(
|
||||
(const unsigned char *)msg + (u << 2) + 12);
|
||||
}
|
||||
#endif
|
||||
u = 32;
|
||||
for (;;) {
|
||||
for (s = 0; s < 4; s ++) {
|
||||
sph_u32 x0, x1, x2, x3;
|
||||
|
||||
x0 = rk[u - 31];
|
||||
x1 = rk[u - 30];
|
||||
x2 = rk[u - 29];
|
||||
x3 = rk[u - 32];
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3);
|
||||
rk[u + 0] = x0 ^ rk[u - 4];
|
||||
rk[u + 1] = x1 ^ rk[u - 3];
|
||||
rk[u + 2] = x2 ^ rk[u - 2];
|
||||
rk[u + 3] = x3 ^ rk[u - 1];
|
||||
if (u == 32) {
|
||||
rk[ 32] ^= sc->count0;
|
||||
rk[ 33] ^= sc->count1;
|
||||
rk[ 34] ^= sc->count2;
|
||||
rk[ 35] ^= SPH_T32(~sc->count3);
|
||||
} else if (u == 440) {
|
||||
rk[440] ^= sc->count1;
|
||||
rk[441] ^= sc->count0;
|
||||
rk[442] ^= sc->count3;
|
||||
rk[443] ^= SPH_T32(~sc->count2);
|
||||
}
|
||||
u += 4;
|
||||
|
||||
x0 = rk[u - 31];
|
||||
x1 = rk[u - 30];
|
||||
x2 = rk[u - 29];
|
||||
x3 = rk[u - 32];
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3);
|
||||
rk[u + 0] = x0 ^ rk[u - 4];
|
||||
rk[u + 1] = x1 ^ rk[u - 3];
|
||||
rk[u + 2] = x2 ^ rk[u - 2];
|
||||
rk[u + 3] = x3 ^ rk[u - 1];
|
||||
if (u == 164) {
|
||||
rk[164] ^= sc->count3;
|
||||
rk[165] ^= sc->count2;
|
||||
rk[166] ^= sc->count1;
|
||||
rk[167] ^= SPH_T32(~sc->count0);
|
||||
} else if (u == 316) {
|
||||
rk[316] ^= sc->count2;
|
||||
rk[317] ^= sc->count3;
|
||||
rk[318] ^= sc->count0;
|
||||
rk[319] ^= SPH_T32(~sc->count1);
|
||||
}
|
||||
u += 4;
|
||||
}
|
||||
if (u == 448)
|
||||
break;
|
||||
for (s = 0; s < 8; s ++) {
|
||||
rk[u + 0] = rk[u - 32] ^ rk[u - 7];
|
||||
rk[u + 1] = rk[u - 31] ^ rk[u - 6];
|
||||
rk[u + 2] = rk[u - 30] ^ rk[u - 5];
|
||||
rk[u + 3] = rk[u - 29] ^ rk[u - 4];
|
||||
u += 4;
|
||||
}
|
||||
}
|
||||
|
||||
p0 = sc->h[0x0];
|
||||
p1 = sc->h[0x1];
|
||||
p2 = sc->h[0x2];
|
||||
p3 = sc->h[0x3];
|
||||
p4 = sc->h[0x4];
|
||||
p5 = sc->h[0x5];
|
||||
p6 = sc->h[0x6];
|
||||
p7 = sc->h[0x7];
|
||||
p8 = sc->h[0x8];
|
||||
p9 = sc->h[0x9];
|
||||
pA = sc->h[0xA];
|
||||
pB = sc->h[0xB];
|
||||
pC = sc->h[0xC];
|
||||
pD = sc->h[0xD];
|
||||
pE = sc->h[0xE];
|
||||
pF = sc->h[0xF];
|
||||
u = 0;
|
||||
for (r = 0; r < 14; r ++) {
|
||||
#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \
|
||||
sph_u32 x0, x1, x2, x3; \
|
||||
x0 = r0 ^ rk[u ++]; \
|
||||
x1 = r1 ^ rk[u ++]; \
|
||||
x2 = r2 ^ rk[u ++]; \
|
||||
x3 = r3 ^ rk[u ++]; \
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3); \
|
||||
x0 ^= rk[u ++]; \
|
||||
x1 ^= rk[u ++]; \
|
||||
x2 ^= rk[u ++]; \
|
||||
x3 ^= rk[u ++]; \
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3); \
|
||||
x0 ^= rk[u ++]; \
|
||||
x1 ^= rk[u ++]; \
|
||||
x2 ^= rk[u ++]; \
|
||||
x3 ^= rk[u ++]; \
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3); \
|
||||
x0 ^= rk[u ++]; \
|
||||
x1 ^= rk[u ++]; \
|
||||
x2 ^= rk[u ++]; \
|
||||
x3 ^= rk[u ++]; \
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3); \
|
||||
l0 ^= x0; \
|
||||
l1 ^= x1; \
|
||||
l2 ^= x2; \
|
||||
l3 ^= x3; \
|
||||
} while (0)
|
||||
|
||||
#define WROT(a, b, c, d) do { \
|
||||
sph_u32 t = d; \
|
||||
d = c; \
|
||||
c = b; \
|
||||
b = a; \
|
||||
a = t; \
|
||||
} while (0)
|
||||
|
||||
C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
|
||||
C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
|
||||
|
||||
WROT(p0, p4, p8, pC);
|
||||
WROT(p1, p5, p9, pD);
|
||||
WROT(p2, p6, pA, pE);
|
||||
WROT(p3, p7, pB, pF);
|
||||
|
||||
#undef C512_ELT
|
||||
#undef WROT
|
||||
}
|
||||
sc->h[0x0] ^= p0;
|
||||
sc->h[0x1] ^= p1;
|
||||
sc->h[0x2] ^= p2;
|
||||
sc->h[0x3] ^= p3;
|
||||
sc->h[0x4] ^= p4;
|
||||
sc->h[0x5] ^= p5;
|
||||
sc->h[0x6] ^= p6;
|
||||
sc->h[0x7] ^= p7;
|
||||
sc->h[0x8] ^= p8;
|
||||
sc->h[0x9] ^= p9;
|
||||
sc->h[0xA] ^= pA;
|
||||
sc->h[0xB] ^= pB;
|
||||
sc->h[0xC] ^= pC;
|
||||
sc->h[0xD] ^= pD;
|
||||
sc->h[0xE] ^= pE;
|
||||
sc->h[0xF] ^= pF;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
* This function assumes that "msg" is aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
c512( sph_shavite_big_context *sc, const void *msg )
|
||||
{
|
||||
__m128i p0, p1, p2, p3, x;
|
||||
__m128i k00, k01, k02, k03, k10, k11, k12, k13;
|
||||
__m128i *m = (__m128i*)msg;
|
||||
__m128i *h = (__m128i*)sc->h;
|
||||
int r;
|
||||
|
||||
p0 = h[0];
|
||||
p1 = h[1];
|
||||
p2 = h[2];
|
||||
p3 = h[3];
|
||||
|
||||
// round
|
||||
k00 = m[0];
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k01 = m[1];
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k02 = m[2];
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k03 = m[3];
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
|
||||
k10 = m[4];
|
||||
x = _mm_xor_si128( p3, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k11 = m[5];
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k12 = m[6];
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k13 = m[7];
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
|
||||
for ( r = 0; r < 3; r ++ )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
if ( r == 0 )
|
||||
k00 = _mm_xor_si128( k00, _mm_set_epi32(
|
||||
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
|
||||
|
||||
x = _mm_xor_si128( p0, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
if ( r == 1 )
|
||||
k01 = _mm_xor_si128( k01, _mm_set_epi32(
|
||||
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p2, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
if ( r == 2 )
|
||||
k13 = _mm_xor_si128( k13, _mm_set_epi32(
|
||||
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
|
||||
// round 2, 6, 10
|
||||
|
||||
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
|
||||
x = _mm_xor_si128( p3, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
|
||||
x = _mm_xor_si128( p1, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
x = _mm_xor_si128( p2, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p0, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
|
||||
// round 4, 8, 12
|
||||
|
||||
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( p3, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
}
|
||||
|
||||
// round 13
|
||||
|
||||
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
x = _mm_xor_si128( p0, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p2, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
|
||||
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
|
||||
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
|
||||
h[0] = _mm_xor_si128( h[0], p2 );
|
||||
h[1] = _mm_xor_si128( h[1], p3 );
|
||||
h[2] = _mm_xor_si128( h[2], p0 );
|
||||
h[3] = _mm_xor_si128( h[3], p1 );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static void
|
||||
shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )
|
||||
{
|
||||
memcpy( sc->h, iv, sizeof sc->h );
|
||||
sc->ptr = 0;
|
||||
sc->count0 = 0;
|
||||
sc->count1 = 0;
|
||||
sc->count2 = 0;
|
||||
sc->count3 = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
shavite_big_aesni_core( sph_shavite_big_context *sc, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
unsigned char *buf;
|
||||
size_t ptr;
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
while (len > 0) {
|
||||
size_t clen;
|
||||
|
||||
clen = (sizeof sc->buf) - ptr;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
memcpy(buf + ptr, data, clen);
|
||||
data = (const unsigned char *)data + clen;
|
||||
ptr += clen;
|
||||
len -= clen;
|
||||
if (ptr == sizeof sc->buf) {
|
||||
if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) {
|
||||
sc->count1 = SPH_T32(sc->count1 + 1);
|
||||
if (sc->count1 == 0) {
|
||||
sc->count2 = SPH_T32(sc->count2 + 1);
|
||||
if (sc->count2 == 0) {
|
||||
sc->count3 = SPH_T32(
|
||||
sc->count3 + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
c512(sc, buf);
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
unsigned char *buf;
|
||||
size_t ptr, u;
|
||||
unsigned z;
|
||||
sph_u32 count0, count1, count2, count3;
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
count0 = (sc->count0 += SPH_T32(ptr << 3) + n);
|
||||
count1 = sc->count1;
|
||||
count2 = sc->count2;
|
||||
count3 = sc->count3;
|
||||
z = 0x80 >> n;
|
||||
z = ((ub & -z) | z) & 0xFF;
|
||||
if (ptr == 0 && n == 0) {
|
||||
buf[0] = 0x80;
|
||||
memset(buf + 1, 0, 109);
|
||||
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
|
||||
} else if (ptr < 110) {
|
||||
buf[ptr ++] = z;
|
||||
memset(buf + ptr, 0, 110 - ptr);
|
||||
} else {
|
||||
buf[ptr ++] = z;
|
||||
memset(buf + ptr, 0, 128 - ptr);
|
||||
c512(sc, buf);
|
||||
memset(buf, 0, 110);
|
||||
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
|
||||
}
|
||||
sph_enc32le(buf + 110, count0);
|
||||
sph_enc32le(buf + 114, count1);
|
||||
sph_enc32le(buf + 118, count2);
|
||||
sph_enc32le(buf + 122, count3);
|
||||
buf[126] = (unsigned char) (out_size_w32 << 5);
|
||||
buf[127] = (unsigned char) (out_size_w32 >> 3);
|
||||
c512(sc, buf);
|
||||
for (u = 0; u < out_size_w32; u ++)
|
||||
sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]);
|
||||
}
|
||||
|
||||
void
|
||||
sph_shavite512_aesni_init(void *cc)
|
||||
{
|
||||
shavite_big_aesni_init(cc, IV512);
|
||||
}
|
||||
|
||||
void
|
||||
sph_shavite512_aesni(void *cc, const void *data, size_t len)
|
||||
{
|
||||
shavite_big_aesni_core(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
sph_shavite512_aesni_close(void *cc, void *dst)
|
||||
{
|
||||
shavite_big_aesni_close(cc, 0, 0, dst, 16);
|
||||
}
|
||||
|
||||
void
|
||||
sph_shavite512_aesni_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst)
|
||||
{
|
||||
shavite_big_aesni_close(cc, ub, n, dst, 16);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -1731,21 +1731,21 @@ sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

/* see sph_shavite.h */
void
sph_shavite512_init(void *cc)
sph_shavite512_sw_init(void *cc)
{
	shavite_big_init(cc, IV512);
}

/* see sph_shavite.h */
void
sph_shavite512(void *cc, const void *data, size_t len)
sph_shavite512_sw(void *cc, const void *data, size_t len)
{
	shavite_big_core(cc, data, len);
}

/* see sph_shavite.h */
void
sph_shavite512_close(void *cc, void *dst)
sph_shavite512_sw_close(void *cc, void *dst)
{
	shavite_big_close(cc, 0, 0, dst, 16);
//	shavite_big_init(cc, IV512);
@@ -1753,7 +1753,7 @@ sph_shavite512_close(void *cc, void *dst)

/* see sph_shavite.h */
void
sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	shavite_big_close(cc, ub, n, dst, 16);
//	shavite_big_init(cc, IV512);
@@ -77,9 +77,9 @@ extern "C"{
 */
typedef struct {
#ifndef DOXYGEN_IGNORE
	unsigned char buf[64];    /* first field, for alignment */
	unsigned char buf[64] __attribute__ ((aligned (64)));
	sph_u32 h[8] __attribute__ ((aligned (32)));
	size_t ptr;
	sph_u32 h[8];
	sph_u32 count0, count1;
#endif
} sph_shavite_small_context;
@@ -108,9 +108,9 @@ typedef sph_shavite_small_context sph_shavite256_context;
 */
typedef struct {
#ifndef DOXYGEN_IGNORE
	unsigned char buf[128];   /* first field, for alignment */
	unsigned char buf[128] __attribute__ ((aligned (64)));
	sph_u32 h[16] __attribute__ ((aligned (32)));
	size_t ptr;
	sph_u32 h[16];
	sph_u32 count0, count1, count2, count3;
#endif
} sph_shavite_big_context;
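The explicit alignment attributes added above matter because the new AES-NI compression function reinterprets the context fields as 128-bit vectors (c512() in sph-shavite-aesni.c does __m128i *m = (__m128i*)msg and __m128i *h = (__m128i*)sc->h). A hedged illustration of why declaration-level alignment is wanted; the struct and helper below are stand-ins, not project code:

#include <immintrin.h>

typedef struct {
   unsigned char buf[128] __attribute__ ((aligned (64)));
   unsigned int  h[16]    __attribute__ ((aligned (32)));
} ctx_sketch;

static inline __m128i first_block( const ctx_sketch *sc )
{
   // buf is 64-byte aligned by declaration, so this aligned 128-bit load
   // is safe wherever the context itself is placed.
   return _mm_load_si128( (const __m128i*)sc->buf );
}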
@@ -262,51 +262,37 @@ void sph_shavite384_close(void *cc, void *dst);
void sph_shavite384_addbits_and_close(
	void *cc, unsigned ub, unsigned n, void *dst);

/**
 * Initialize a SHAvite-512 context. This process performs no memory allocation.
 *
 * @param cc   the SHAvite-512 context (pointer to a
 *             <code>sph_shavite512_context</code>)
 */
void sph_shavite512_init(void *cc);

/**
 * Process some data bytes. It is acceptable that <code>len</code> is zero
 * (in which case this function does nothing).
 *
 * @param cc     the SHAvite-512 context
 * @param data   the input data
 * @param len    the input data length (in bytes)
 */
void sph_shavite512(void *cc, const void *data, size_t len);

/**
 * Terminate the current SHAvite-512 computation and output the result into
 * the provided buffer. The destination buffer must be wide enough to
 * accommodate the result (64 bytes). The context is automatically
 * reinitialized.
 *
 * @param cc    the SHAvite-512 context
 * @param dst   the destination buffer
 */
void sph_shavite512_close(void *cc, void *dst);

/**
 * Add a few additional bits (0 to 7) to the current computation, then
 * terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (64 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 downto 8-n (this is the big-endian convention at the byte
 * level). The context is automatically reinitialized.
 *
 * @param cc    the SHAvite-512 context
 * @param ub    the extra bits
 * @param n     the number of extra bits (0 to 7)
 * @param dst   the destination buffer
 */
void sph_shavite512_addbits_and_close(
// Always define sw but only define aesni when available
// Define fptrs for aesni or sw, not both.
void sph_shavite512_sw_init(void *cc);
void sph_shavite512_sw(void *cc, const void *data, size_t len);
void sph_shavite512_sw_close(void *cc, void *dst);
void sph_shavite512_sw_addbits_and_close(
	void *cc, unsigned ub, unsigned n, void *dst);

#ifdef __AES__
void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);
void sph_shavite512_aesni_close(void *cc, void *dst);
void sph_shavite512_aesni_addbits_and_close(
	void *cc, unsigned ub, unsigned n, void *dst);

#define sph_shavite512_init    sph_shavite512_aesni_init
#define sph_shavite512         sph_shavite512_aesni
#define sph_shavite512_close   sph_shavite512_aesni_close
#define sph_shavite512_addbits_and_close \
                               sph_shavite512_aesni_addbits_and_close

#else

#define sph_shavite512_init    sph_shavite512_sw_init
#define sph_shavite512         sph_shavite512_sw
#define sph_shavite512_close   sph_shavite512_sw_close
#define sph_shavite512_addbits_and_close \
                               sph_shavite512_sw_addbits_and_close

#endif

#ifdef __cplusplus
}
#endif
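Whichever backend the macros above select (AES-NI or the software path), callers use the same init / update / close sequence described in the documentation comments. A short usage sketch following that documentation; the wrapper function is ours, not part of the project:

#include <stddef.h>
#include "sph_shavite.h"

void shavite512_digest( const void *data, size_t len, unsigned char out[64] )
{
   sph_shavite512_context ctx;
   sph_shavite512_init( &ctx );         // no allocation, just state setup
   sph_shavite512( &ctx, data, len );   // may be called repeatedly
   sph_shavite512_close( &ctx, out );   // writes 64 bytes, resets the context
}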
@@ -104,7 +104,7 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t

bool register_veltor_algo( algo_gate_t* gate )
{
  gate->optimizations = SSE2_OPT;
  gate->optimizations = SSE2_OPT | AES_OPT;
  init_veltor_ctx();
  gate->scanhash = (void*)&scanhash_veltor;
  gate->hash     = (void*)&veltorhash;
@@ -252,8 +252,8 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc,
	current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
#endif

	uint64_t *b= (uint64_t*)sc->buf;
	uint64_t *s= (uint64_t*)sc->state;
//	uint64_t *b= (uint64_t*)sc->buf;
//	uint64_t *s= (uint64_t*)sc->state;
// printf("Sptr 1= %u\n",current);
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );
@@ -1,4 +1,7 @@
|
||||
#include "whirlpool-gate.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -6,8 +9,6 @@
|
||||
#include "sph_whirlpool.h"
|
||||
#include "whirlpool-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static __thread whirlpool_4way_context whirl_mid;
|
||||
|
||||
void whirlpool_hash_4way( void *state, const void *input )
|
||||
@@ -50,7 +51,7 @@ void whirlpool_hash_4way( void *state, const void *input )
|
||||
}
|
||||
|
||||
int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
unsigned long *hashes_done )
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
@@ -67,8 +68,8 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
|
||||
// if (opt_benchmark)
|
||||
// ((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
|
||||
for (int i=0; i < 19; i++)
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
|
@@ -2,14 +2,15 @@

bool register_whirlpool_algo( algo_gate_t* gate )
{
//#if defined (WHIRLPOOL_4WAY)
//  gate->scanhash = (void*)&scanhash_whirlpool_4way;
//  gate->hash     = (void*)&whirlpool_hash_4way;
//#else
#if defined (WHIRLPOOL_4WAY)
  four_way_not_tested();
  gate->scanhash = (void*)&scanhash_whirlpool_4way;
  gate->hash     = (void*)&whirlpool_hash_4way;
#else
  gate->scanhash = (void*)&scanhash_whirlpool;
  gate->hash     = (void*)&whirlpool_hash;
  init_whirlpool_ctx();
//#endif
#endif
  return true;
};
@@ -8,13 +8,13 @@
  #define WHIRLPOOL_4WAY
#endif

//#if defined (WHIRLPOOL_4WAY)
#if defined (WHIRLPOOL_4WAY)

//void whirlpool_hash_4way(void *state, const void *input);
void whirlpool_hash_4way(void *state, const void *input);

//int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
//                             uint64_t *hashes_done );
//#endif
int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done );
#else

void whirlpool_hash( void *state, const void *input );

@@ -22,3 +22,4 @@ int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done );
#endif

#endif
@@ -3365,6 +3365,8 @@ do { \
// scalar array of constants "table" and return referenced 64 bit entry.
#define t_lane( table, inv, row, lane ) \
   table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
//   table[ t_row( inv, row )[ lane ] ];

// Build a vector from elements of non-contiguous 64 bit data extracted from
// scalar "table".
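The t_lane macro above reads one 64-bit table entry per vector lane via _mm256_extract_epi64, and the following comment describes rebuilding a vector from such non-contiguous entries. A hedged sketch of that gather pattern (the helper name and shape are ours, not the project's):

#include <immintrin.h>
#include <stdint.h>

// Gather four 64-bit entries from a scalar table, one per AVX2 lane.
static inline __m256i table_gather_64( const uint64_t *table, __m256i idx )
{
   return _mm256_set_epi64x( table[ _mm256_extract_epi64( idx, 3 ) ],
                             table[ _mm256_extract_epi64( idx, 2 ) ],
                             table[ _mm256_extract_epi64( idx, 1 ) ],
                             table[ _mm256_extract_epi64( idx, 0 ) ] );
}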
algo/x11/x11-gate.c (new file, 18 lines)
@@ -0,0 +1,18 @@
#include "x11-gate.h"

bool register_x11_algo( algo_gate_t* gate )
{
#if defined (X11_4WAY)
  init_x11_4way_ctx();
  gate->scanhash = (void*)&scanhash_x11_4way;
  gate->hash     = (void*)&x11_hash_4way;
#else
  init_x11_ctx();
  gate->scanhash = (void*)&scanhash_x11;
  gate->hash     = (void*)&x11_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
};

algo/x11/x11-gate.h (new file)

@@ -0,0 +1,30 @@
#ifndef X11_GATE_H__
#define X11_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

//#if defined(HASH_4WAY) && !defined(NO_AES_NI)
// #define X11_4WAY
//#endif

bool register_x11_algo( algo_gate_t* gate );

#if defined(X11_4WAY)

void x11_hash_4way( void *state, const void *input );

int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );

#endif

void x11_hash( void *state, const void *input );

int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );

void init_x11_ctx();

#endif
@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "x11-gate.h"

#include <string.h>
#include <stdint.h>

@@ -61,7 +61,7 @@ void init_x11_ctx()
#endif
}

static void x11_hash( void *state, const void *input )
void x11_hash( void *state, const void *input )
{
unsigned char hash[128] __attribute__ ((aligned (32)));
unsigned char hashbuf[128] __attribute__ ((aligned (16)));

@@ -189,7 +189,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}

/*
bool register_x11_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;

@@ -199,4 +199,4 @@ bool register_x11_algo( algo_gate_t* gate )
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

*/
@@ -440,7 +440,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )

bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;

avxdefs.h

@@ -26,14 +26,6 @@

#define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a )

// Blend 128 bit vectors based on vector mask. Bits are copied from arg a0
// if corresponding mask bits are clear and from arg a1 if set.
// Should be faster than maskload.
// isn't working.
#define mm_merge( a0, a1, mask ) \
_mm_or_si128( _mm_and_si128( a0, mm_not( mask ) ), \
_mm_and_si128( a1, mask ) )

// Memory functions
// n = number of __m128i, bytes/16

@@ -59,7 +51,6 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
dst[i] = src[i];
}


// Pointer cast

// p = any aligned pointer

@@ -80,47 +71,56 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
#define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \
_mm_slli_epi64( w, 64-c ) )

#define mm_rotl_64( w, c ) _mm_or_si128( _mm_slli_epi64( w, c ), \
_mm_srli_epi64( w, 64-c ) )

#define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \
_mm_slli_epi32( w, 32-c ) )

// Rotate vector elements
#define mm_rotl_32( w, c ) _mm_or_si128( _mm_slli_epi32( w, c ), \
_mm_srli_epi32( w, 32-c ) )
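
Per lane these macros are the usual shift-or rotate; a scalar model of mm_rotr_64 for one 64 bit element (an illustrative sketch only, valid for 0 < c < 64, not part of avxdefs.h):

// Hypothetical single-lane equivalent of mm_rotr_64.
static inline uint64_t rotr64_model( uint64_t w, int c )
{
   return ( w >> c ) | ( w << ( 64 - c ) );
}
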
// Rotate elements in vector

// Swap upper and lower 64 bits of 128 bit source vector
// __m128i mm128_swap64( __m128 )
#define mm_swap_64(s) _mm_shuffle_epi32( s, 0x4e )

// Rotate 128 vector by 1 32 bit element.
#define mm_rotr_1x32( w ) _mm_shuffle_epi32( w, 0x39 )

#define mm_rotl_1x32( w ) _mm_shuffle_epi32( w, 0x93 )

// Rotate 256 bits through two 128 bit vectors

// Swap 128 bit source vectors
// Swap 128 bit source vectors in place.
// void mm128_swap128( __m128i, __m128i )
// macro is better to update two args
#define mm_swap_128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
s1 = _mm_xor_si128(s0, s1); \
s0 = _mm_xor_si128(s0, s1);

// Rotate two 128 bit vectors as one 256 vector by 1 element
#define mm_rotl256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = mm_merge( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull ) );\
s1 = mm_merge( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull ) ); \
s0 = t; \
} while(0)

#define mm_rotr256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = mm_merge( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull ) );\
s1 = mm_merge( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull ) ); \
s0 = t; \
} while(0)


// Rotate two 128 bit vectors in place as one 256 vector by 1 element
#define mm_rotl256_1x64( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
s0 = t; \
} while(0)

#define mm_rotr256_1x64( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
s0 = t; \
} while(0)

// Older slower
#define mm_rotl256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \

@@ -134,10 +134,10 @@ do { \
s0 = t; \
} while(0)

#define mm_rotr256_1x64( s0, s1 ) \
#define mm_rotr256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0) ; \
s0 = mm_swap_64( s0 ) ; \
s1 = mm_swap_64( s1 ); \
t = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \

@@ -148,6 +148,21 @@ do { \
s0 = t; \
} while(0)

// Rotate 256 bits through two 128 bit vectors by n*32 bits and return
// the rotated s0.
// Similar to mm_rotr256_1x32 but only a partial rotation as s1 is not
// completed. It's faster than a full rotation.
inline __m128i mm_rotr256_32( __m128i s0, __m128i s1, int n )
{
return _mm_or_si128( _mm_srli_si128( s0, n<<2 ),
_mm_slli_si128( s1, 16 - (n<<2) ) );
}

inline __m128i mm_rotl256_32( __m128i s0, __m128i s1, int n )
{
return _mm_or_si128( _mm_slli_si128( s0, n<<2 ),
_mm_srli_si128( s1, 16 - (n<<2) ) );
}

// Swap bytes in vector elements
inline __m128i mm_byteswap_32( __m128i x )

@@ -161,6 +176,21 @@ inline __m128i mm_byteswap_32( __m128i x )
return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) );
}

inline __m128i mm_byteswap_64( __m128i x )
{
x = _mm_or_si128( _mm_srli_epi64( x, 32 ), _mm_slli_epi64( x, 32 ));

x = _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
_mm_slli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));

return _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
_mm_slli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
}

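The 64 bit byte swap above works in three halving steps (32, then 16, then 8 bit groups); a scalar model of the same sequence (illustrative sketch, not part of avxdefs.h):

// Hypothetical single-lane equivalent of mm_byteswap_64.
static inline uint64_t byteswap64_model( uint64_t x )
{
   x = ( x >> 32 ) | ( x << 32 );                               // swap 32 bit halves
   x = ( ( x & 0xFFFF0000FFFF0000ULL ) >> 16 ) |
       ( ( x & 0x0000FFFF0000FFFFULL ) << 16 );                 // swap 16 bit pairs
   return ( ( x & 0xFF00FF00FF00FF00ULL ) >> 8 ) |
          ( ( x & 0x00FF00FF00FF00FFULL ) << 8 );               // swap adjacent bytes
}
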
#if defined (__AVX2__)

@@ -180,13 +210,6 @@ inline __m128i mm_byteswap_32( __m128i x )

#define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a )

// Blend 256 bit vectors based on vector mask. Bits are copied from arg a0
// if corresponding mask bits are clear and from arg a1 if set.
// Should be faster than maskload.
#define mm256_merge( a0, a1, mask ) \
_mm256_or_si256( _mm256_and_si256( a0, mm256_not( mask ) ), \
_mm256_and_si256( a1, mask )

// Pack/Unpack two 128 bit vectors into/from one 256 bit vector
// usefulness tbd
#define mm256_pack_2x128( hi, lo ) \

@@ -198,7 +221,7 @@ inline __m128i mm_byteswap_32( __m128i x )

// Memory functions
// n = number of __m256i (32 bytes)
// n = number of 256 bit (32 byte) vectors

inline void memset_zero_256( __m256i *dst, int n )
{

@@ -231,7 +254,7 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )

// Rotate bits in vector elements

// Rotate bits in 4 uint64 (3 instructions)
// Rotate bits in 64 bit elements
// w = packed 64 bit data, n= number of bits to rotate
#define mm256_rotr_64( w, c ) \
_mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )

@@ -239,19 +262,30 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
#define mm256_rotl_64( w, c ) \
_mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )

// Rotate vector elements
// Rotate bits in 32 bit elements
#define mm256_rotr_32( w, c ) \
_mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32 - c) )

// Rotate 256 bits by 64 bits (4 uint64 by one uint64)
#define mm256_rotl_32( w, c ) \
_mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32 - c) )

// Rotate elements in vector

// Rotate vector by one 64 bit element (aka two 32 bit elements)
//__m256i mm256_rotl256_1x64( _mm256i, int )
#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 )
#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 )

#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 )
#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 )

// Same as 2x64 rotate in either direction
// Swap 128 bit elements (aka rotate by two 64 bit, four 32 bit elements))
#define mm256_swap_128( w ) _mm256_permute2f128_si256( w, w, 1 )

// Swap bytes in vector elements
// Rotate by one 32 bit element (aka two 16 bit elements)
#define mm256_rotl256_1x32( w ) _mm256_shuffle_epi32( w, 0x93 )

#define mm256_rotr256_1x32( w ) _mm256_shuffle_epi32( w, 0x39 )

// Swap bytes in vector elements
inline __m256i mm256_byteswap_32( __m256i x )
{
__m256i x1 = _mm256_and_si256( x, _mm256_set1_epi32( 0x0000ff00 ) );

@@ -269,14 +303,14 @@ inline __m256i mm256_byteswap_64( __m256i x )
x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 ));

x = _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
_mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
_mm256_slli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));
_mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));

return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
_mm256_slli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
_mm256_slli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
}

// Pseudo parallel aes

@@ -287,7 +321,6 @@ inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )

mm256_unpack_2x128( hi, lo, x );
mm256_unpack_2x128( khi, klo, k );

lo = _mm_aesenc_si128( lo, klo );
hi = _mm_aesenc_si128( hi, khi );

@@ -299,7 +332,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
__m128i hi, lo;

mm256_unpack_2x128( hi, lo, x );

lo = _mm_aesenc_si128( lo, mm_zero );
hi = _mm_aesenc_si128( hi, mm_zero );

@@ -308,32 +340,37 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )

#endif // AVX2

// AVX

// Paired functions for interleaving and deinterleaving data for vector
// processing.
// Size is specfied in bits regardless of vector size to avoid pointer
// arithmetic confusion with different size vectors and be consistent with
// the function's name.

// Only 256, 512 and 640 bit length, (32, 64 & 80 bytes respectively)
// are supported.
// Buffer length is specified in bits to match the function naming format.
//
// Each function has 2 implementations, an optimized version that uses
// vector indexing and a slower version that uses pointers. The optimized
// version can only be used with 64 bit elements and only supports sizes
// of 256, 512 or 640 bits, 32, 64, and 80 bytes respectively.
//
// NOTE: Contrary to GCC documentation accessing vector elements using array
// indexes only works with 64 bit elements.
// Interleaving and deinterleaving of vectors of 32 bit elements
// must use the slower implementations that don't use vector indexing.
//
// All data must be aligned to 256 bits for AVX2, or 128 bits for AVX.
// Interleave source args and deinterleave destination args are not required
// to be contiguous but it's more efficient if they are.
// to be contiguous in memory but it's more efficient if they are.
// Interleave source agrs may be the same actual arg repeated.
// 640 bit deinterleaving 4x64 or 8x32 using 256 bit AVX2 requires the
// 640 bit deinterleaving 4x64 using 256 bit AVX2 requires the
// destination buffers be defined with padding up to 768 bits for overrun
// space.
// Overrrun space is not needed when interleaving or when deinterleaving
// 4x32 using 128 bit AVX.
// Overrun space use is non destructive and should be ignored by the
// caller.
// space. Although overrun space use is non destructive it should not overlay
// useful data and should be ignored by the caller.

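As a concrete picture of the layout these helpers produce, a scalar model of 4x64 interleaving (an illustrative sketch that mirrors the pointer based version further below, not part of avxdefs.h):

// Hypothetical scalar equivalent: lane j of 64 bit word i lands at d[ 4*i + j ].
static inline void interleave_4x64_model( uint64_t *d, const uint64_t *s0,
      const uint64_t *s1, const uint64_t *s2, const uint64_t *s3, int bit_len )
{
   for ( int i = 0; i < bit_len >> 6; i++ )
   {
      d[ 4*i     ] = s0[i];
      d[ 4*i + 1 ] = s1[i];
      d[ 4*i + 2 ] = s2[i];
      d[ 4*i + 3 ] = s3[i];
   }
}
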
// interleave 4 arrays of 32 bit elements for AVX processing
// SSE2 AVX

// interleave 4 arrays of 32 bit elements for 128 bit processing
// bit_len must be 256, 512 or 640 bits.
inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
// Vector indexing doesn't work with 32 bit data.
inline void mm_interleave_4x32x( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, int bit_len )
{
uint32_t *s0 = (uint32_t*)src0;

@@ -346,7 +383,6 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
d[1] = _mm_set_epi32( s3[ 1], s2[ 1], s1[ 1], s0[ 1] );
d[2] = _mm_set_epi32( s3[ 2], s2[ 2], s1[ 2], s0[ 2] );
d[3] = _mm_set_epi32( s3[ 3], s2[ 3], s1[ 3], s0[ 3] );

d[4] = _mm_set_epi32( s3[ 4], s2[ 4], s1[ 4], s0[ 4] );
d[5] = _mm_set_epi32( s3[ 5], s2[ 5], s1[ 5], s0[ 5] );
d[6] = _mm_set_epi32( s3[ 6], s2[ 6], s1[ 6], s0[ 6] );

@@ -371,22 +407,27 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
d[19] = _mm_set_epi32( s3[19], s2[19], s1[19], s0[19] );
}

// interleave 4 arrays of 32 bit elements for AVX processing
// bit_len must be multiple of 32
inline void mm_interleave_4x32x( uint32_t *dst, uint32_t *src0,
uint32_t *src1, uint32_t *src2, uint32_t *src3, int bit_len )
inline void mm_interleave_4x32( void *dst, void *src0, void *src1,
void *src2, void *src3, int bit_len )
{
uint32_t *d = dst;;
uint32_t *d = (uint32_t*)dst;
uint32_t *s0 = (uint32_t*)src0;
uint32_t *s1 = (uint32_t*)src1;
uint32_t *s2 = (uint32_t*)src2;
uint32_t *s3 = (uint32_t*)src3;

for ( int i = 0; i < bit_len >> 5; i++, d += 4 )
{
*d = *(src0+i);
*(d+1) = *(src1+i);
*(d+2) = *(src2+i);
*(d+3) = *(src3+i);
*d = *(s0+i);
*(d+1) = *(s1+i);
*(d+2) = *(s2+i);
*(d+3) = *(s3+i);
}
}

inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
// doesn't work with 32 bit elements
inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
uint32_t *s = (uint32_t*)src;

@@ -428,17 +469,21 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,

// deinterleave 4 arrays into individual buffers for scalarm processing
// bit_len must be multiple of 32
inline void mm_deinterleave_4x32x( uint32_t *dst0, uint32_t *dst1,
uint32_t *dst2,uint32_t *dst3, uint32_t *src,
int bit_len )
inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
uint32_t *s = src;
uint32_t *s = (uint32_t*)src;
uint32_t *d0 = (uint32_t*)dst0;
uint32_t *d1 = (uint32_t*)dst1;
uint32_t *d2 = (uint32_t*)dst2;
uint32_t *d3 = (uint32_t*)dst3;

for ( int i = 0; i < bit_len >> 5; i++, s += 4 )
{
*(dst0+i) = *s;
*(dst1+i) = *(s+1);
*(dst2+i) = *(s+2);
*(dst3+i) = *(s+3);
*(d0+i) = *s;
*(d1+i) = *(s+1);
*(d2+i) = *(s+2);
*(d3+i) = *(s+3);
}
}

@@ -473,23 +518,27 @@ inline void mm256_interleave_4x64( void *dst, const void *src0,
d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] );
}


// interleave 4 arrays of 64 bit elements for AVX2 processing
// Slower version
// bit_len must be multiple of 64
inline void mm256_interleave_4x64x( uint64_t *dst, uint64_t *src0,
uint64_t *src1, uint64_t *src2, uint64_t *src3, int bit_len )
inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1,
void *src2, void *src3, int bit_len )
{
uint64_t *d = dst;
uint64_t *d = (uint64_t*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
uint64_t *s2 = (uint64_t*)src2;
uint64_t *s3 = (uint64_t*)src3;

for ( int i = 0; i < bit_len>>6; i++, d += 4 )
{
*d = *(src0+i);
*(d+1) = *(src1+i);
*(d+2) = *(src2+i);
*(d+3) = *(src3+i);
*d = *(s0+i);
*(d+1) = *(s1+i);
*(d+2) = *(s2+i);
*(d+3) = *(s3+i);
}
}

// Deinterleave 4 buffers of 32 bit data from the source buffer.
// Deinterleave 4 buffers of 64 bit data from the source buffer.
inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{

@@ -520,25 +569,30 @@ inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
}


// Deinterleave 4 arrays into indivudual 64 bit arrays for scalar processing
// Slower version
// bit_len must be multiple 0f 64
inline void mm256_deinterleave_4x64x( uint64_t *dst0, uint64_t *dst1,
uint64_t *dst2,uint64_t *dst3, uint64_t *src, int bit_len )
inline void mm256_deinterleave_4x64x( void *dst0, void *dst1, void *dst2,
void *dst3, void *src, int bit_len )
{
uint64_t *s = src;
for ( int i = 0; i < bit_len>>6; i++, s += 4 )
uint64_t *s = (uint64_t*)src;
uint64_t *d0 = (uint64_t*)dst0;
uint64_t *d1 = (uint64_t*)dst1;
uint64_t *d2 = (uint64_t*)dst2;
uint64_t *d3 = (uint64_t*)dst3;

for ( int i = 0; i < bit_len>>6; i++, s += 4 )
{
*(dst0+i) = *s;
*(dst1+i) = *(s+1);
*(dst2+i) = *(s+2);
*(dst3+i) = *(s+3);
*(d0+i) = *s;
*(d1+i) = *(s+1);
*(d2+i) = *(s+2);
*(d3+i) = *(s+3);
}
}

// Interleave 8 source buffers containing 32 bit data into the destination
// buffer
inline void mm256_interleave_8x32( void *dst, const void *src0,
// vector
// Doesn't work, vecror indexing doesn't work for 32 bit elements
inline void mm256_interleave_8x32x( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3, const void *src4,
const void *src5, const void *src6, const void *src7, int bit_len )
{

@@ -600,10 +654,9 @@ inline void mm256_interleave_8x32( void *dst, const void *src0,
s3[19], s2[19], s1[19], s0[19] );
}


// interleave 8 arrays of 32 bit elements for AVX2 processing
// Slower but it works with 32 bit data
// bit_len must be multiple of 32
inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0,
inline void mm256_interleave_8x32( uint32_t *dst, uint32_t *src0,
uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4,
uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len )
{

@@ -622,7 +675,7 @@ inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0,
}

// Deinterleave 8 buffers of 32 bit data from the source buffer.
inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
inline void mm256_deinterleave_8x32x( void *dst0, void *dst1, void *dst2,
void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
const void *src, int bit_len )
{

@@ -703,10 +756,9 @@ inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
s[159], s[151], s[143], s[135] );
}


// Deinterleave 8 arrays into indivdual buffers for scalar processing
// bit_len must be multiple of 32
inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
inline void mm256_deinterleave_8x32( uint32_t *dst0, uint32_t *dst1,
uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5,
uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len )
{

@@ -763,4 +815,4 @@ inline void mm_reinterleave_4x32( uint32_t *dst, uint64_t *src,
}

#endif // __AVX2__
#endif // AVX_DEF_H__
#endif // AVXDEFS_H__

configure (vendored)

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.5.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.6.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.

@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.5'
PACKAGE_STRING='cpuminer-opt 3.7.5'
PACKAGE_VERSION='3.7.6'
PACKAGE_STRING='cpuminer-opt 3.7.6'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.7.6 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1392,7 +1392,7 @@ fi

if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.7.6:";;
esac
cat <<\_ACEOF

@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.7.5
cpuminer-opt configure 3.7.6
generated by GNU Autoconf 2.69

Copyright (C) 2012 Free Software Foundation, Inc.

@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by cpuminer-opt $as_me 3.7.5, which was
It was created by cpuminer-opt $as_me 3.7.6, which was
generated by GNU Autoconf 2.69. Invocation command line was

$ $0 $@

@@ -2981,7 +2981,7 @@ fi

# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.7.5'
VERSION='3.7.6'


cat >>confdefs.h <<_ACEOF

@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.7.5, which was
This file was extended by cpuminer-opt $as_me 3.7.6, which was
generated by GNU Autoconf 2.69. Invocation command line was

CONFIG_FILES = $CONFIG_FILES

@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.7.5
cpuminer-opt config.status 3.7.6
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

configure.ac

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.5])
AC_INIT([cpuminer-opt], [3.7.6])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

cpu-miner.c

@@ -103,7 +103,7 @@ enum algos opt_algo = ALGO_NULL;
int opt_scrypt_n = 0;
int opt_pluck_n = 128;
int opt_n_threads = 0;
int64_t opt_affinity = -1L;
int64_t opt_affinity = -1;
int opt_priority = 0;
int num_cpus;
char *rpc_url = NULL;;

@@ -195,20 +195,27 @@ static inline void drop_policy(void)
#define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
#endif

static void affine_to_cpu_mask(int id, unsigned long mask) {
cpu_set_t set;
CPU_ZERO(&set);
for (uint8_t i = 0; i < num_cpus; i++) {
// cpu mask
if (mask & (1UL<<i)) { CPU_SET(i, &set); }
}
if (id == -1) {
// process affinity
sched_setaffinity(0, sizeof(&set), &set);
} else {
// thread only
pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set);
}
static void affine_to_cpu_mask( int id, unsigned long long mask )
{
cpu_set_t set;
CPU_ZERO(&set);
uint8_t ncpus = (num_cpus > 256) ? 256 : num_cpus;

for ( uint8_t i = 0; i < ncpus; i++ )
{
// cpu mask
if( (ncpus > 64) || ( mask & (1UL << i) ) ) CPU_SET( i, &set );
}
if ( id == -1 )
{
// process affinity
sched_setaffinity(0, sizeof(&set), &set);
}
else
{
// thread only
pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set);
}
}

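The selection test in the new loop collapses to "take every CPU" once there are more than 64 of them, since a 64 bit mask can no longer address each one; a minimal scalar sketch of that predicate (illustrative only, not part of cpu-miner.c):

// Hypothetical helper mirroring the loop condition in affine_to_cpu_mask().
static inline int cpu_selected( int i, int ncpus, unsigned long long mask )
{
   return ( ncpus > 64 ) || ( ( mask >> i ) & 1ULL );
}
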
#elif defined(WIN32) /* Windows */

@@ -1669,21 +1676,30 @@ static void *miner_thread( void *userdata )
drop_policy();
}
// CPU thread affinity
if (num_cpus > 1)
if ( num_cpus > 64 )
{
if (opt_affinity == -1 && opt_n_threads > 1)
// opt_affinity ignored with more than 64 cpus.
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu %d",
thr_id, thr_id % num_cpus );
affine_to_cpu_mask( thr_id, -1 );
}
else if ( num_cpus > 1 )
{
if ( (opt_affinity == -1) && (opt_n_threads) > 1 )
{
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
thr_id, thr_id % num_cpus, ( 1 << (thr_id % num_cpus) ) );
affine_to_cpu_mask(thr_id, 1UL << (thr_id % num_cpus));

affine_to_cpu_mask( thr_id, 1 << (thr_id % num_cpus) );
}
else if (opt_affinity != -1L)
else if (opt_affinity != -1)
{
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu mask %x",
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu mask %x",
thr_id, opt_affinity);
affine_to_cpu_mask(thr_id, (unsigned long)opt_affinity);
affine_to_cpu_mask( thr_id, opt_affinity );
}
}

@@ -1822,10 +1838,10 @@ static void *miner_thread( void *userdata )
num_submitted++;
}
#if FOUR_WAY
if (num_submitted>1)
applog(LOG_NOTICE, "4 WAY hash, %u nonces submitted," CL_MAG " BONUS!" CL_WHT, num_submitted);
if (num_submitted > 1)
applog(LOG_NOTICE, "4 WAY hash nonces submitted: %u" CL_MAG " BONUS!" CL_N, num_submitted);
else
applog(LOG_NOTICE, "4 WAY hash %u nonce submitted", num_submitted);
applog(LOG_NOTICE, "4 WAY hash nonces submitted: %u", num_submitted);
#endif
// must be a one way algo, nonce is already in work data
if ( !num_submitted )

@@ -1836,7 +1852,7 @@else
break;
}
#if FOUR_WAY
applog(LOG_NOTICE, "1 WAY hash 1 nonce submitted");
applog(LOG_NOTICE, "1 WAY hash nonce submitted");
#endif

}

@@ -2948,12 +2964,16 @@ bool check_cpu_capability ()


printf(".\nAlgo features:");
if ( algo_has_sse2 ) printf( " SSE2" );
if ( algo_has_aes ) printf( " AES" );
if ( algo_has_avx ) printf( " AVX" );
if ( algo_has_avx2 ) printf( " AVX2" );
if ( algo_has_4way ) printf( " 4WAY" );
if ( algo_has_sha ) printf( " SHA" );
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
if ( algo_has_sse2 ) printf( " SSE2" );
if ( algo_has_aes ) printf( " AES" );
if ( algo_has_avx ) printf( " AVX" );
if ( algo_has_avx2 ) printf( " AVX2" );
if ( algo_has_4way ) printf( " 4WAY" );
if ( algo_has_sha ) printf( " SHA" );
}
printf(".\n");

// Check for CPU and build incompatibilities

@@ -3166,13 +3186,22 @@ int main(int argc, char *argv[])
SetPriorityClass(GetCurrentProcess(), prio);
}
#endif
if (opt_affinity != -1)
{
if (!opt_quiet)
applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity);
affine_to_cpu_mask(-1, (unsigned long)opt_affinity);
}

if ( opt_affinity != -1 )
{
if ( num_cpus > 64 )
{
applog(LOG_WARNING,"--cpu-affinity argument is not supported with more");
applog(LOG_WARNING," than 64 CPUs, using default affinity.");
opt_affinity = -1;
}
else
{
if (!opt_quiet)
applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity);
affine_to_cpu_mask( -1, (unsigned long)opt_affinity );
}
}

//#ifdef HAVE_SYSLOG_H
// if (use_syslog)

miner.h

@@ -506,6 +506,7 @@ enum algos {
ALGO_KECCAKC,
ALGO_LBRY,
ALGO_LUFFA,
ALGO_LYRA2H,
ALGO_LYRA2RE,
ALGO_LYRA2REV2,
ALGO_LYRA2Z,

@@ -576,6 +577,7 @@ static const char* const algo_names[] = {
"keccakc",
"lbry",
"luffa",
"lyra2h",
"lyra2re",
"lyra2rev2",
"lyra2z",

@@ -700,6 +702,7 @@ Options:\n\
keccakc Creative Coin\n\
lbry LBC, LBRY Credits\n\
luffa Luffa\n\
lyra2h Hppcoin\n\
lyra2re lyra2\n\
lyra2rev2 lyrav2, Vertcoin\n\
lyra2z Zcoin (XZC)\n\
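
The enum, the algo_names table, and the help text above are positional, so a new algo such as lyra2h must be added to all three in the same place for the name lookup to stay aligned with the enum; a minimal sketch of that lookup (illustrative only, not part of miner.h):

// The string registered for lyra2h is found by indexing algo_names with the
// matching enum value.
const char *lyra2h_name = algo_names[ ALGO_LYRA2H ];   // "lyra2h"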