This commit is contained in:
Jay D Dee
2017-12-14 18:28:51 -05:00
parent af1c940919
commit 7a1389998b
31 changed files with 1285 additions and 377 deletions


@@ -108,6 +108,7 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2z.c \
  algo/lyra2/lyra2z-4way.c \
  algo/lyra2/lyra2z330.c \
+ algo/lyra2/lyra2h.c \
  algo/m7m.c \
  algo/neoscrypt.c \
  algo/nist5/nist5-gate.c \
@@ -128,6 +129,7 @@ cpuminer_SOURCES = \
  algo/sha/sha256t.c \
  algo/shabal/sph_shabal.c \
  algo/shavite/sph_shavite.c \
+ algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite.c \
  algo/simd/sph_simd.c \
  algo/simd/sse2/nist.c \
@@ -155,11 +157,12 @@ cpuminer_SOURCES = \
  algo/whirlpool/whirlpool-4way.c \
  algo/whirlpool/whirlpool.c \
  algo/whirlpool/whirlpoolx.c \
- algo/x11/phi1612.c \
+ algo/x11/x11-gate.c \
  algo/x11/x11.c \
  algo/x11/x11evo.c \
  algo/x11/x11gost.c \
  algo/x11/c11.c \
+ algo/x11/phi1612.c \
  algo/x13/x13.c \
  algo/x13/x13sm3.c \
  algo/x14/x14.c \


@@ -40,6 +40,7 @@ Supported Algorithms
  keccakc       Creative coin
  lbry          LBC, LBRY Credits
  luffa         Luffa
+ lyra2h        Hppcoin
  lyra2re       lyra2
  lyra2rev2     lyra2v2, Vertcoin
  lyra2z        Zcoin (XZC)


@@ -164,6 +164,12 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.7.6
+
+Added lyra2h algo for Hppcoin.
+Added support for more than 64 CPUs.
+Optimized shavite512 with AES, improves x11 etc.
+
 v3.7.5

 New algo keccakc for Creative coin with 4way optimizations
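A note on the "more than 64 CPUs" item: on Windows a single thread-affinity mask covers at most 64 logical processors, so supporting larger machines means using processor groups. A minimal sketch of that pattern (an illustration of the technique only, not this commit's code; bind_thread_to_cpu is a hypothetical helper):

#include <windows.h>
#include <string.h>

// Pin the calling thread to one logical CPU on a >64-CPU Windows system.
// Assumes uniform 64-CPU groups for simplicity; real group sizes can vary.
static void bind_thread_to_cpu( int cpu )
{
   GROUP_AFFINITY ga;
   memset( &ga, 0, sizeof(ga) );
   ga.Group = (WORD)( cpu / 64 );             // which 64-CPU processor group
   ga.Mask  = (KAFFINITY)1 << ( cpu % 64 );   // bit within that group
   SetThreadGroupAffinity( GetCurrentThread(), &ga, NULL );
}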


@@ -138,6 +138,10 @@ void init_algo_gate( algo_gate_t* gate )
    gate->work_cmp_size = STD_WORK_CMP_SIZE;
 }

+// Ignore warnings for not yet defined register functions
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
+
 // called by each thread that uses the gate
 bool register_algo_gate( int algo, algo_gate_t *gate )
 {
@@ -151,11 +155,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    switch (algo)
    {
-
-// Ignore warnings for not yet defined register fucntions
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
-
      case ALGO_ARGON2:      register_argon2_algo      ( gate ); break;
      case ALGO_AXIOM:       register_axiom_algo       ( gate ); break;
      case ALGO_BASTION:     register_bastion_algo     ( gate ); break;
@@ -180,6 +179,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
      case ALGO_KECCAKC:     register_keccakc_algo     ( gate ); break;
      case ALGO_LBRY:        register_lbry_algo        ( gate ); break;
      case ALGO_LUFFA:       register_luffa_algo       ( gate ); break;
+     case ALGO_LYRA2H:      register_lyra2h_algo      ( gate ); break;
      case ALGO_LYRA2RE:     register_lyra2re_algo     ( gate ); break;
      case ALGO_LYRA2REV2:   register_lyra2rev2_algo   ( gate ); break;
      case ALGO_LYRA2Z:      register_lyra2z_algo      ( gate ); break;
@@ -221,10 +221,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
      case ALGO_YESCRYPT:    register_yescrypt_algo    ( gate ); break;
      case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
      case ALGO_ZR5:         register_zr5_algo         ( gate ); break;
-
-// restore warnings
-#pragma GCC diagnostic pop
-
      default:
         applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
         return false;
@@ -239,6 +235,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    return true;
 }

+// restore warnings
+#pragma GCC diagnostic pop
+
 // override std defaults with jr2 defaults
 bool register_json_rpc2( algo_gate_t *gate )
 {
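The pragmas move because GCC's diagnostic regions are lexically scoped: the push/ignored pair must already be in effect at every call site that can trigger the warning, and the pop belongs after the whole function rather than inside the switch body. A standalone illustration of the pattern (register_example_algo is a hypothetical name):

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"

// No prototype for register_example_algo() is visible here; without the
// pragma above this call would warn under -Wimplicit-function-declaration.
static void register_one( void )
{
   register_example_algo();   // hypothetical, defined in another file
}

#pragma GCC diagnostic pop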


@@ -32,12 +32,12 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done )
 {
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t hash[4*4] __attribute__ ((aligned (32)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];
 // uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) endiandata[20];
+   uint32_t _ALIGN(32) edata[20];
    uint32_t n = first_nonce;
    uint32_t *nonces = work->nonces;
    bool *found = work->nfound;
@@ -47,18 +47,17 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
 // HTarget = 0x7f;

 // we need big endian data...
-   swab32_array( endiandata, pdata, 20 );
-   mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
-                       endiandata, 640 );
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

    uint32_t *noncep = vdata + 76;   // 19*4

    do {
       found[0] = found[1] = found[2] = found[3] = false;
       be32enc( noncep, n );
-      be32enc( noncep +2, n+1 );
-      be32enc( noncep +4, n+2 );
-      be32enc( noncep +6, n+3 );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );

       blakehash_4way( hash, vdata );
@@ -74,7 +73,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
       }
       if ( (hash+8)[7] == 0 )
       {
-         if ( fulltest( hash, ptarget ) )
+         if ( fulltest( hash+8, ptarget ) )
          {
             found[1] = true;
             num_found++;
@@ -83,7 +82,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
       }
       if ( (hash+16)[7] == 0 )
       {
-         if ( fulltest( hash, ptarget ) )
+         if ( fulltest( hash+16, ptarget ) )
          {
             found[2] = true;
             num_found++;
@@ -92,15 +91,14 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
       }
       if ( (hash+24)[7] == 0 )
       {
-         if ( fulltest( hash, ptarget ) )
+         if ( fulltest( hash+24, ptarget ) )
          {
             found[3] = true;
             num_found++;
             nonces[3] = n+3;
          }
       }
       n += 4;
       *hashes_done = n - first_nonce + 1;
    } while ( (num_found == 0) && (n < max_nonce)
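The new nonce indexing follows from the 4x32 interleave: word w of lane l sits at vdata[w*4 + l], so the four lane nonces (word 19) are four consecutive words starting at vdata + 76, and the old +2/+4/+6 strides were skipping into neighbouring words. A sketch of the assumed layout (set_lane_nonces is a hypothetical helper):

#include <stdint.h>

// Assumed mm_interleave_4x32 layout: word w of lane l at vdata[ w*4 + l ].
static void set_lane_nonces( uint32_t *vdata, uint32_t n )
{
   uint32_t *noncep = vdata + 19*4;     // word 19, lane 0
   for ( int lane = 0; lane < 4; lane++ )
      noncep[lane] = n + lane;          // big-endian encoding omitted here
}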


@@ -17,7 +17,6 @@ bool register_blake_algo( algo_gate_t* gate )
    gate->optimizations = FOUR_WAY_OPT;
    gate->scanhash = (void*)&scanhash_blake_4way;
    gate->hash     = (void*)&blakehash_4way;
-   four_way_not_tested();
 #else
    gate->scanhash = (void*)&scanhash_blake;
    gate->hash     = (void*)&blakehash;


@@ -524,18 +524,18 @@ do { \
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm_xor_si128( s0, _mmset_epi32( CS0, CS0, CS0, CS0 ) ); \
-   V9 = _mm_xor_si128( s1, _mmset_epi32( CS1, CS1, CS1, CS1 ) ); \
-   VA = _mm_xor_si128( s2, _mmset_epi32( CS2, CS2, CS2, CS2 ) ); \
-   VB = _mm_xor_si128( s3, _mmset_epi32( CS3, CS3, CS3, CS3 ) ); \
-   VC = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
-                       _mmset_epi32( CS4, CS4, CS4, CS4 ) ); \
-   VD = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
-                       _mmset_epi32( CS5, CS5, CS5, CS5 ) ); \
-   VE = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ) \
-                     , _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
-   VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
-                       _mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
+   V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
+   V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
+   VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
+   VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
+   VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
+                       _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
+   VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
+                       _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
+   VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
+                     , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
+   VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
+                       _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
    M[0x0] = mm_byteswap_32( *(buf + 0) ); \
    M[0x1] = mm_byteswap_32( *(buf + 1) ); \
    M[0x2] = mm_byteswap_32( *(buf + 2) ); \
@@ -710,18 +710,18 @@ do { \
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm256_xor_si256( S0, _mm256_set_epi64( CB0, CB0, CB0, CB0 ) ); \
-   V9 = _mm256_xor_si256( S1, _mm256_set_epi64( CB1, CB1, CB1, CB1 ) ); \
-   VA = _mm256_xor_si256( S2, _mm256_set_epi64( CB2, CB2, CB2, CB2 ) ); \
-   VB = _mm256_xor_si256( S3, _mm256_set_epi64( CB3, CB3, CB3, CB3 ) ); \
-   VC = _mm256_xor_si128( _mm256_set_epi64( T0, T0, T0, T0 ), \
-                          _mm256_set_epi64( CB4, CB4, CB4, CB4 ) ); \
-   VD = _mm256_xor_si256( _mm256_set_epi64( T0, T0, T0, T0 ), \
-                          _mm256_set_epi64( CB5, CB5, CB5, CB5 ) ); \
-   VE = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
-                          _mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
-   VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
-                          _mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
+   V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
+   V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
+   VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
+   VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
+   VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
+                          _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
+   VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
+                          _mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
+   VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
+                          _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
+   VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
+                          _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
    M[0x0] = mm256_byteswap_64( *(buf+0) ); \
    M[0x1] = mm256_byteswap_64( *(buf+1) ); \
    M[0x2] = mm256_byteswap_64( *(buf+2) ); \
@@ -867,7 +867,6 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
    buf = sc->buf;
    ptr = sc->ptr;
-
    if ( len < buf_size - ptr )
    {
       memcpy_128( buf + (ptr>>2), vdata, len>>2 );
@@ -915,9 +914,10 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
    ptr = sc->ptr;
    bit_len = ((unsigned)ptr << 3);
-   unsigned z = 0x80 >> n;
-   unsigned zz = ((ub & -z) | z) & 0xFF;
-   u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
+// unsigned z = 0x80 >> n;
+// unsigned zz = ((ub & -z) | z) & 0xFF;
+// u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
+   u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
    tl = sc->T0 + bit_len;
    th = sc->T1;
@@ -934,9 +934,11 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
    else
       sc->T0 -= 512 - bit_len;

-   if ( ptr <= 48 )
+// if ( ptr <= 48 )
+   if ( ptr <= 52 )
    {
-      memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
+      memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+//    memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
       if (out_size_w32 == 8)
          u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
                                       _mm_set_epi32( 0x010000000, 0x01000000,
@@ -962,6 +964,7 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
    out = (__m128i*)dst;
    for ( k = 0; k < out_size_w32; k++ )
       out[k] = mm_byteswap_32( sc->H[k] );
+//    out[k] = sc->H[k];
 }

 #if defined (__AVX2__)
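For the 80-byte inputs this close path sees, the final block carries 16 message bytes, so the padding layout the new code assumes is roughly the following (a sketch, byte offsets within the 64-byte block before the final word byteswap):

// bytes  0..15 : last 16 message bytes (ptr == 16)
// byte   16    : 0x80 end-of-message marker  -> u.buf[ptr>>2] = _mm_set1_epi32( 0x80 )
// bytes 17..51 : zero padding                -> memset_zero_128 through word 12
// bytes 52..55 : word OR-ed with the 0x01 marker for 256-bit output
// bytes 56..63 : 64-bit big-endian message bit count (tl / th)

Zeroing only through byte 48 left word 13 uninitialized before the OR, which is what the 48 -> 52 change corrects.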


@@ -13,46 +13,35 @@ static __thread bool ctx_midstate_done = false;
 void decred_hash_4way( void *state, const void *input )
 {
-   uint32_t vhash[4*4] __attribute__ ((aligned (64)));
-   uint32_t hash0[4] __attribute__ ((aligned (32)));
-   uint32_t hash1[4] __attribute__ ((aligned (32)));
-   uint32_t hash2[4] __attribute__ ((aligned (32)));
-   uint32_t hash3[4] __attribute__ ((aligned (32)));
+   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (32)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
    blake256_4way_context ctx __attribute__ ((aligned (64)));
    sph_blake256_context ctx2 __attribute__ ((aligned (64)));
    uint32_t hash[16] __attribute__ ((aligned (64)));
    uint32_t sin0[45], sin1[45], sin2[45], sin3[45];

    mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );

-   void *tail = input + DECRED_MIDSTATE_LEN;
+   void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
    int tail_len = 180 - DECRED_MIDSTATE_LEN;

-// #define MIDSTATE_LEN 128
-/*
-   uint8_t *ending = (uint8_t*) input;
-   ending += MIDSTATE_LEN;
-
-   if ( !ctx_midstate_done )
-   {
-      blake256_4way_init( &blake_mid );
-      blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
-      ctx_midstate_done = true;
-   }
    memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
    blake256_4way( &ctx, tail, tail_len );
    blake256_4way_close( &ctx, vhash );
-*/
+/*
    sph_blake256_init( &ctx2 );
    sph_blake256( &ctx2, sin0, 180 );
    sph_blake256_close( &ctx2, hash );
+*/
 /*
    blake256_4way_init( &ctx );
    blake256_4way( &ctx, input, 180 );
    blake256_4way_close( &ctx, vhash );
 */

    mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

 /*
 for ( int i = 0; i < 8; i++ )
@@ -66,22 +55,21 @@ printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
printf("\n"); printf("\n");
*/ */
// memcpy( state, hash0, 32 ); memcpy( state, hash0, 32 );
// memcpy( state+32, hash1, 32 ); memcpy( state+32, hash1, 32 );
// memcpy( state+64, hash1, 32 ); memcpy( state+64, hash2, 32 );
// memcpy( state+96, hash1, 32 ); memcpy( state+96, hash3, 32 );
memcpy( state, hash, 32 ); // memcpy( state, hash, 32 );
} }
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done) uint64_t *hashes_done)
{ {
uint32_t vdata[45*4] __attribute__ ((aligned (64))); uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32))); uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) endiandata[48]; uint32_t _ALIGN(64) edata[48];
// uint32_t _ALIGN(64) hash32[8];
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX]; const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
@@ -91,28 +79,25 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
    bool *found = work->nfound;
    int num_found = 0;

-// #define DCR_NONCE_OFT32 35

    ctx_midstate_done = false;

-// memcpy(endiandata, pdata, 180);
+   memcpy( edata, pdata, 180 );

 // use the old way until new way updated for size.
-   mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
+
+   blake256_4way_init( &blake_mid );
+   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );

    uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;

    do {
       found[0] = found[1] = found[2] = found[3] = false;
       * noncep = n;
-      *(noncep+2) = n+1;
-      *(noncep+4) = n+2;
-      *(noncep+6) = n+3;
+      *(noncep+1) = n+1;
+      *(noncep+2) = n+2;
+      *(noncep+3) = n+3;

       decred_hash_4way( hash, vdata );

-// endiandata[DCR_NONCE_OFT32] = n;
-// decred_hash(hash32, endiandata);

       if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
       {
          work_set_target_ratio( work, hash );
@@ -121,29 +106,47 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
          nonces[0] = n;
          pdata[DECRED_NONCE_INDEX] = n;
       }
-/*    if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
+/*
+      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
       {
+printf("found 1\n");
+printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
+printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
+printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
+printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
          work_set_target_ratio( work, hash+8 );
          found[1] = true;
          num_found++;
-         nonces[1] = n;
+         nonces[1] = n+1;
       }
+*/
       if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
       {
          work_set_target_ratio( work, hash+16 );
          found[2] = true;
          num_found++;
-         nonces[2] = n;
+         nonces[2] = n+2;
       }
+/*
       if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
       {
+printf("found 3\n");
+printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
+printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
+printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
+printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
          work_set_target_ratio( work, hash+24 );
          found[3] = true;
          num_found++;
-         nonces[3] = n;
+         nonces[3] = n+3;
       }
 */
-      n += 4;
+      n += 2;
+//    n += 4;
    } while ( (num_found == 0) && (n < max_nonce)
              && !work_restart[thr_id].restart );
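The scanhash change is the midstate optimization the old commented-out code was working toward: the first DECRED_MIDSTATE_LEN bytes of the 180-byte header are nonce-independent, so they are hashed into blake_mid once per scan and each call resumes from a copy; in the interleaved buffer, byte offsets scale by the 4 lanes, hence the << 2 on the tail pointer. A condensed sketch of the same pattern (hypothetical helper names, assuming DECRED_MIDSTATE_LEN is a multiple of the 64-byte BLAKE-256 block):

#include <string.h>

static __thread blake256_4way_context blake_mid;   // saved per-scan midstate

void scan_prepare( const uint32_t *vdata )
{
   blake256_4way_init( &blake_mid );
   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
}

void hash_with_midstate( uint32_t *vhash, const void *vdata )
{
   blake256_4way_context ctx;
   memcpy( &ctx, &blake_mid, sizeof blake_mid );            // cheap resume
   blake256_4way( &ctx, (const char*)vdata + ( DECRED_MIDSTATE_LEN << 2 ),
                  180 - DECRED_MIDSTATE_LEN );              // nonce-bearing tail
   blake256_4way_close( &ctx, vhash );
}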


@@ -872,6 +872,7 @@ blake32_close(sph_blake_small_context *sc,
    } else {
       sc->T0 -= 512 - bit_len;
    }
+
    if (bit_len <= 446) {
       memset(u.buf + ptr + 1, 0, 55 - ptr);
       if (out_size_w32 == 8)


@@ -25,7 +25,7 @@ void jha_hash_4way( void *out, const void *input )
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
    uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
    uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
-   __m256i mask0, mask1;
+   __m256i mask, mask0, mask1;
    __m256i* vh  = (__m256i*)vhash;
    __m256i* vh0 = (__m256i*)vhash0;
    __m256i* vh1 = (__m256i*)vhash1;
@@ -47,38 +47,37 @@ void jha_hash_4way( void *out, const void *input )
 // Heavy & Light Pair Loop
    for ( int round = 0; round < 3; round++ )
    {
-//    memset_zero_256( vh0, 20 );
-//    memset_zero_256( vh1, 20 );
-
-// positive logic, if maski select vhi
-// going from bit to mask reverses logic such that if the test bit is set
-// zero will be put in mask0, meaning don't take vh0. mask1 is
-// inverted so 1 will be put in mask1 meaning take it.
-      mask0 = mm256_negate_64(
+// Select the next function based on bit 0 of the previous hash.
+// Speculatively execute both functions, then use a mask to
+// select the result from the correct function for each lane.
+// hash = mask ? vhash1 : vhash0
+      mask = mm256_negate_64(
          _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
-      mask1 = mm256_not( mask0 );
+
+// second version
+//    mask0 = mask
+//    mask1 = mm256_not( mask );
+
+// first version
 //  mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
 //                 _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );

-// groestl (serial) v skein
+// groestl (serial) vs skein
      mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

      init_groestl( &ctx_groestl, 64 );
      update_and_final_groestl( &ctx_groestl, (char*)hash0,
                                (char*)hash0, 512 );
      init_groestl( &ctx_groestl, 64 );
      update_and_final_groestl( &ctx_groestl, (char*)hash1,
                                (char*)hash1, 512 );
      init_groestl( &ctx_groestl, 64 );
      update_and_final_groestl( &ctx_groestl, (char*)hash2,
                                (char*)hash2, 512 );
      init_groestl( &ctx_groestl, 64 );
      update_and_final_groestl( &ctx_groestl, (char*)hash3,
                                (char*)hash3, 512 );

      mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
@@ -91,14 +90,20 @@ void jha_hash_4way( void *out, const void *input )
 // merge vectored hash
      for ( int i = 0; i < 8; i++ )
      {
-        vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-                                 _mm256_and_si256( vh1[i], mask1 ) );
+// blend should be faster
+        vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );
+
+// second version
+//      vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
+//                               _mm256_and_si256( vh1[i], mask1 ) );
+
+// first version
 /*
-        vha256[i] = _mm256_maskload_epi64(
-                     vhasha + i*4, mm256_not( mask ) );
-        vhb256[i] = _mm256_maskload_epi64(
-                     vhashb + i*4, mask );
-        vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
+        vh0[i] = _mm256_maskload_epi64(
+                     vhash0 + i*4, mm256_not( mask ) );
+        vh1[i] = _mm256_maskload_epi64(
+                     vhash1 + i*4, mask );
+        vh[i] = _mm256_or_si256( vh0[i], vh1[i] );
 */
      }
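The retired AND/OR merge and the new blend are equivalent for the all-ones/all-zero lane masks produced by mm256_negate_64; _mm256_blendv_epi8 does the select in one instruction, taking each byte from its second source where the mask byte's high bit is set. A standalone comparison of the two strategies (a sketch, not part of the diff):

#include <immintrin.h>

// mask lanes are all-ones (take b) or all-zero (take a)
static __m256i merge_andor( __m256i a, __m256i b, __m256i mask )
{
   return _mm256_or_si256( _mm256_andnot_si256( mask, a ),
                           _mm256_and_si256( mask, b ) );
}

static __m256i merge_blendv( __m256i a, __m256i b, __m256i mask )
{
   return _mm256_blendv_epi8( a, b, mask );   // per-byte select on mask MSB
}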

algo/lyra2/lyra2h.c (new file, 93 lines)

@@ -0,0 +1,93 @@
#include <memory.h>
#include <mm_malloc.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
__thread uint64_t* lyra2h_matrix;
bool lyra2h_thread_init()
{
const int i = 16 * 16 * 96;
lyra2h_matrix = _mm_malloc( i, 64 );
return lyra2h_matrix;
}
static __thread sph_blake256_context lyra2h_blake_mid;
void lyra2h_midstate( const void* input )
{
sph_blake256_init( &lyra2h_blake_mid );
sph_blake256( &lyra2h_blake_mid, input, 64 );
}
void lyra2h_hash( void *state, const void *input )
{
uint32_t _ALIGN(64) hash[16];
sph_blake256_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &lyra2h_blake_mid, sizeof lyra2h_blake_mid );
sph_blake256( &ctx_blake, input + 64, 16 );
sph_blake256_close( &ctx_blake, hash );
LYRA2Z( lyra2h_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
memcpy(state, hash, 32);
}
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if (opt_benchmark)
ptarget[7] = 0x0000ff;
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
lyra2h_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2h_hash( hash, endiandata );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};
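lyra2h_thread_init runs once in each miner thread through the gate's miner_thread_init hook, so every thread owns a private, 64-byte-aligned Lyra2 matrix and the hashing hot path needs no locking; lyra2h_midstate likewise hashes the nonce-free first 64 bytes of the 80-byte header only once per scan. The per-thread allocation pattern in isolation (a sketch with hypothetical names):

#include <mm_malloc.h>
#include <stdbool.h>
#include <stdint.h>

static __thread uint64_t *scratchpad = NULL;   // one matrix per thread

bool example_thread_init( void )
{
   if ( !scratchpad )
      scratchpad = (uint64_t*)_mm_malloc( 16 * 16 * 96, 64 );  // 64-byte aligned
   return scratchpad != NULL;   // the miner thread aborts on failure
}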


@@ -110,7 +110,8 @@ printf("found 0\n");
          nonces[0] = pdata[19] = n;
          work_set_target_ratio( work, hash );
       }
-/*    if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+/*
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
       {
 printf("found 1\n");
          found[1] = true;


@@ -65,13 +65,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
    G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotl256_1x64( s1); \
+   s1 = mm256_rotr256_1x64( s1); \
    s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rotr256_1x64( s3 ); \
+   s3 = mm256_rotl256_1x64( s3 ); \
    G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotr256_1x64( s1 ); \
+   s1 = mm256_rotl256_1x64( s1 ); \
    s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rotl256_1x64( s3 );
+   s3 = mm256_rotr256_1x64( s3 );

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
    LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
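These macros rotate the four 64-bit lanes of a __m256i by one position to diagonalize and undiagonalize the sponge state between the two G steps. On AVX2 the usual implementation is a single lane permute; plausible definitions are sketched below (an assumption about avxdefs.h, not taken from this diff):

#include <immintrin.h>

// lanes {a0,a1,a2,a3} -> {a1,a2,a3,a0} : rotate right by one 64-bit lane
#define rotr256_1x64_sketch( v ) _mm256_permute4x64_epi64( v, 0x39 )
// lanes {a0,a1,a2,a3} -> {a3,a0,a1,a2} : rotate left by one 64-bit lane
#define rotl256_1x64_sketch( v ) _mm256_permute4x64_epi64( v, 0x93 )
// swap the 128-bit halves
#define swap_128_sketch( v )     _mm256_permute4x64_epi64( v, 0x4e )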


@@ -0,0 +1,673 @@
/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */
/*
* SHAvite-3 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stdio.h>
#include <stddef.h>
#include <string.h>
#ifdef __AES__
#include "sph_shavite.h"
#include "avxdefs.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE
#define SPH_SMALL_FOOTPRINT_SHAVITE 1
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#define C32 SPH_C32
/*
* As of round 2 of the SHA-3 competition, the published reference
* implementation and test vectors are wrong, because they use
* big-endian AES tables while the internal decoding uses little-endian.
* The code below follows the specification. To turn it into a code
* which follows the reference implementation (the one called "BugFix"
* on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
* the code below (from the '#define AES_BIG_ENDIAN...' to the definition
* of the AES_ROUND_NOKEY macro) and replace it with the version which
* is commented out afterwards.
*/
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
static const sph_u32 IV512[] = {
C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
sph_u32 t0 = (x0); \
sph_u32 t1 = (x1); \
sph_u32 t2 = (x2); \
sph_u32 t3 = (x3); \
AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
} while (0)
#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \
sph_u32 kt; \
AES_ROUND_NOKEY(k1, k2, k3, k0); \
kt = (k0); \
(k0) = (k1); \
(k1) = (k2); \
(k2) = (k3); \
(k3) = kt; \
} while (0)
#if SPH_SMALL_FOOTPRINT_SHAVITE
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512(sph_shavite_big_context *sc, const void *msg)
{
sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
sph_u32 rk[448];
size_t u;
int r, s;
#if SPH_LITTLE_ENDIAN
memcpy(rk, msg, 128);
#else
for (u = 0; u < 32; u += 4) {
rk[u + 0] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 0);
rk[u + 1] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 4);
rk[u + 2] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 8);
rk[u + 3] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 12);
}
#endif
u = 32;
for (;;) {
for (s = 0; s < 4; s ++) {
sph_u32 x0, x1, x2, x3;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 32) {
rk[ 32] ^= sc->count0;
rk[ 33] ^= sc->count1;
rk[ 34] ^= sc->count2;
rk[ 35] ^= SPH_T32(~sc->count3);
} else if (u == 440) {
rk[440] ^= sc->count1;
rk[441] ^= sc->count0;
rk[442] ^= sc->count3;
rk[443] ^= SPH_T32(~sc->count2);
}
u += 4;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 164) {
rk[164] ^= sc->count3;
rk[165] ^= sc->count2;
rk[166] ^= sc->count1;
rk[167] ^= SPH_T32(~sc->count0);
} else if (u == 316) {
rk[316] ^= sc->count2;
rk[317] ^= sc->count3;
rk[318] ^= sc->count0;
rk[319] ^= SPH_T32(~sc->count1);
}
u += 4;
}
if (u == 448)
break;
for (s = 0; s < 8; s ++) {
rk[u + 0] = rk[u - 32] ^ rk[u - 7];
rk[u + 1] = rk[u - 31] ^ rk[u - 6];
rk[u + 2] = rk[u - 30] ^ rk[u - 5];
rk[u + 3] = rk[u - 29] ^ rk[u - 4];
u += 4;
}
}
p0 = sc->h[0x0];
p1 = sc->h[0x1];
p2 = sc->h[0x2];
p3 = sc->h[0x3];
p4 = sc->h[0x4];
p5 = sc->h[0x5];
p6 = sc->h[0x6];
p7 = sc->h[0x7];
p8 = sc->h[0x8];
p9 = sc->h[0x9];
pA = sc->h[0xA];
pB = sc->h[0xB];
pC = sc->h[0xC];
pD = sc->h[0xD];
pE = sc->h[0xE];
pF = sc->h[0xF];
u = 0;
for (r = 0; r < 14; r ++) {
#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \
sph_u32 x0, x1, x2, x3; \
x0 = r0 ^ rk[u ++]; \
x1 = r1 ^ rk[u ++]; \
x2 = r2 ^ rk[u ++]; \
x3 = r3 ^ rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
l0 ^= x0; \
l1 ^= x1; \
l2 ^= x2; \
l3 ^= x3; \
} while (0)
#define WROT(a, b, c, d) do { \
sph_u32 t = d; \
d = c; \
c = b; \
b = a; \
a = t; \
} while (0)
C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
WROT(p0, p4, p8, pC);
WROT(p1, p5, p9, pD);
WROT(p2, p6, pA, pE);
WROT(p3, p7, pB, pF);
#undef C512_ELT
#undef WROT
}
sc->h[0x0] ^= p0;
sc->h[0x1] ^= p1;
sc->h[0x2] ^= p2;
sc->h[0x3] ^= p3;
sc->h[0x4] ^= p4;
sc->h[0x5] ^= p5;
sc->h[0x6] ^= p6;
sc->h[0x7] ^= p7;
sc->h[0x8] ^= p8;
sc->h[0x9] ^= p9;
sc->h[0xA] ^= pA;
sc->h[0xB] ^= pB;
sc->h[0xC] ^= pC;
sc->h[0xD] ^= pD;
sc->h[0xE] ^= pE;
sc->h[0xF] ^= pF;
}
#else
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
__m128i p0, p1, p2, p3, x;
__m128i k00, k01, k02, k03, k10, k11, k12, k13;
__m128i *m = (__m128i*)msg;
__m128i *h = (__m128i*)sc->h;
int r;
p0 = h[0];
p1 = h[1];
p2 = h[2];
p3 = h[3];
// round
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
k00 = _mm_xor_si128( k00, _mm_set_epi32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
k01 = _mm_xor_si128( k01, _mm_set_epi32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
k13 = _mm_xor_si128( k13, _mm_set_epi32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p1 = _mm_xor_si128( p1, x );
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p3 = _mm_xor_si128( p3, x );
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
}
// round 13
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p1 = _mm_xor_si128( p1, x );
h[0] = _mm_xor_si128( h[0], p2 );
h[1] = _mm_xor_si128( h[1], p3 );
h[2] = _mm_xor_si128( h[2], p0 );
h[3] = _mm_xor_si128( h[3], p1 );
}
#endif
static void
shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )
{
memcpy( sc->h, iv, sizeof sc->h );
sc->ptr = 0;
sc->count0 = 0;
sc->count1 = 0;
sc->count2 = 0;
sc->count3 = 0;
}
static void
shavite_big_aesni_core( sph_shavite_big_context *sc, const void *data,
size_t len )
{
unsigned char *buf;
size_t ptr;
buf = sc->buf;
ptr = sc->ptr;
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen;
if (ptr == sizeof sc->buf) {
if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) {
sc->count1 = SPH_T32(sc->count1 + 1);
if (sc->count1 == 0) {
sc->count2 = SPH_T32(sc->count2 + 1);
if (sc->count2 == 0) {
sc->count3 = SPH_T32(
sc->count3 + 1);
}
}
}
c512(sc, buf);
ptr = 0;
}
}
sc->ptr = ptr;
}
static void
shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
unsigned char *buf;
size_t ptr, u;
unsigned z;
sph_u32 count0, count1, count2, count3;
buf = sc->buf;
ptr = sc->ptr;
count0 = (sc->count0 += SPH_T32(ptr << 3) + n);
count1 = sc->count1;
count2 = sc->count2;
count3 = sc->count3;
z = 0x80 >> n;
z = ((ub & -z) | z) & 0xFF;
if (ptr == 0 && n == 0) {
buf[0] = 0x80;
memset(buf + 1, 0, 109);
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
} else if (ptr < 110) {
buf[ptr ++] = z;
memset(buf + ptr, 0, 110 - ptr);
} else {
buf[ptr ++] = z;
memset(buf + ptr, 0, 128 - ptr);
c512(sc, buf);
memset(buf, 0, 110);
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
}
sph_enc32le(buf + 110, count0);
sph_enc32le(buf + 114, count1);
sph_enc32le(buf + 118, count2);
sph_enc32le(buf + 122, count3);
buf[126] = (unsigned char) (out_size_w32 << 5);
buf[127] = (unsigned char) (out_size_w32 >> 3);
c512(sc, buf);
for (u = 0; u < out_size_w32; u ++)
sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]);
}
void
sph_shavite512_aesni_init(void *cc)
{
shavite_big_aesni_init(cc, IV512);
}
void
sph_shavite512_aesni(void *cc, const void *data, size_t len)
{
shavite_big_aesni_core(cc, data, len);
}
void
sph_shavite512_aesni_close(void *cc, void *dst)
{
shavite_big_aesni_close(cc, 0, 0, dst, 16);
}
void
sph_shavite512_aesni_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst)
{
shavite_big_aesni_close(cc, ub, n, dst, 16);
}
#ifdef __cplusplus
}
#endif
#endif
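The whole port rests on one identity: SHAvite-3's unkeyed AES round is SubBytes, ShiftRows, MixColumns, and _mm_aesenc_si128 computes exactly that sequence followed by an XOR with its round-key operand. Passing an all-zero key therefore yields the unkeyed round, collapsing the scalar AES_ROUND_NOKEY table lookups into a single instruction:

#include <immintrin.h>   // compile with -maes

static inline __m128i aes_round_nokey( __m128i x )
{
   // aesenc: MixColumns( ShiftRows( SubBytes( x ) ) ) ^ key, with key = 0
   return _mm_aesenc_si128( x, _mm_setzero_si128() );
}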


@@ -1731,21 +1731,21 @@ sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 /* see sph_shavite.h */
 void
-sph_shavite512_init(void *cc)
+sph_shavite512_sw_init(void *cc)
 {
    shavite_big_init(cc, IV512);
 }

 /* see sph_shavite.h */
 void
-sph_shavite512(void *cc, const void *data, size_t len)
+sph_shavite512_sw(void *cc, const void *data, size_t len)
 {
    shavite_big_core(cc, data, len);
 }

 /* see sph_shavite.h */
 void
-sph_shavite512_close(void *cc, void *dst)
+sph_shavite512_sw_close(void *cc, void *dst)
 {
    shavite_big_close(cc, 0, 0, dst, 16);
 // shavite_big_init(cc, IV512);
@@ -1753,7 +1753,7 @@ sph_shavite512_close(void *cc, void *dst)
 /* see sph_shavite.h */
 void
-sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
    shavite_big_close(cc, ub, n, dst, 16);
 // shavite_big_init(cc, IV512);


@@ -77,9 +77,9 @@ extern "C"{
  */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
-   unsigned char buf[64];    /* first field, for alignment */
+   unsigned char buf[64] __attribute__ ((aligned (64)));
+   sph_u32 h[8] __attribute__ ((aligned (32)));
    size_t ptr;
-   sph_u32 h[8];
    sph_u32 count0, count1;
 #endif
 } sph_shavite_small_context;
@@ -108,9 +108,9 @@ typedef sph_shavite_small_context sph_shavite256_context;
  */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
-   unsigned char buf[128];    /* first field, for alignment */
+   unsigned char buf[128] __attribute__ ((aligned (64)));
+   sph_u32 h[16] __attribute__ ((aligned (32)));
    size_t ptr;
-   sph_u32 h[16];
    sph_u32 count0, count1, count2, count3;
 #endif
 } sph_shavite_big_context;
@@ -262,51 +262,37 @@ void sph_shavite384_close(void *cc, void *dst);
 void sph_shavite384_addbits_and_close(
    void *cc, unsigned ub, unsigned n, void *dst);

-/**
- * Initialize a SHAvite-512 context. This process performs no memory allocation.
- *
- * @param cc the SHAvite-512 context (pointer to a
- * <code>sph_shavite512_context</code>)
- */
-void sph_shavite512_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc the SHAvite-512 context
- * @param data the input data
- * @param len the input data length (in bytes)
- */
-void sph_shavite512(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current SHAvite-512 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (64 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc the SHAvite-512 context
- * @param dst the destination buffer
- */
-void sph_shavite512_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (64 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc the SHAvite-512 context
- * @param ub the extra bits
- * @param n the number of extra bits (0 to 7)
- * @param dst the destination buffer
- */
-void sph_shavite512_addbits_and_close(
+// Always define sw but only define aesni when available.
+// Define fptrs for aesni or sw, not both.
+void sph_shavite512_sw_init(void *cc);
+void sph_shavite512_sw(void *cc, const void *data, size_t len);
+void sph_shavite512_sw_close(void *cc, void *dst);
+void sph_shavite512_sw_addbits_and_close(
    void *cc, unsigned ub, unsigned n, void *dst);

+#ifdef __AES__
+
+void sph_shavite512_aesni_init(void *cc);
+void sph_shavite512_aesni(void *cc, const void *data, size_t len);
+void sph_shavite512_aesni_close(void *cc, void *dst);
+void sph_shavite512_aesni_addbits_and_close(
+   void *cc, unsigned ub, unsigned n, void *dst);
+
+#define sph_shavite512_init sph_shavite512_aesni_init
+#define sph_shavite512 sph_shavite512_aesni
+#define sph_shavite512_close sph_shavite512_aesni_close
+#define sph_shavite512_addbits_and_close \
+   sph_shavite512_aesni_addbits_and_close
+
+#else
+
+#define sph_shavite512_init sph_shavite512_sw_init
+#define sph_shavite512 sph_shavite512_sw
+#define sph_shavite512_close sph_shavite512_sw_close
+#define sph_shavite512_addbits_and_close \
+   sph_shavite512_sw_addbits_and_close
+
+#endif

 #ifdef __cplusplus
 }
 #endif
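Callers are unaffected by the split: both implementations are always declared, and the generic sph_shavite512* names are mapped at compile time to the AES-NI or software variant. Usage stays exactly as before (a sketch; sph_shavite512_context is the header's usual typedef for the big context):

#include "sph_shavite.h"

void shavite512_digest( const void *data, size_t len, void *dst64 )
{
   sph_shavite512_context ctx;
   sph_shavite512_init( &ctx );           // aesni or sw, chosen by __AES__
   sph_shavite512( &ctx, data, len );
   sph_shavite512_close( &ctx, dst64 );   // writes the 64-byte digest
}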


@@ -104,7 +104,7 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
 bool register_veltor_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT;
+   gate->optimizations = SSE2_OPT | AES_OPT;
    init_veltor_ctx();
    gate->scanhash = (void*)&scanhash_veltor;
    gate->hash = (void*)&veltorhash;


@@ -252,8 +252,8 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc,
    current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
 #endif

-   uint64_t *b= (uint64_t*)sc->buf;
-   uint64_t *s= (uint64_t*)sc->state;
+//uint64_t *b= (uint64_t*)sc->buf;
+//uint64_t *s= (uint64_t*)sc->state;

 // printf("Sptr 1= %u\n",current);
 // printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
 // printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );


@@ -1,4 +1,7 @@
#include "whirlpool-gate.h" #include "whirlpool-gate.h"
#if defined(__AVX2__)
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
@@ -6,8 +9,6 @@
#include "sph_whirlpool.h" #include "sph_whirlpool.h"
#include "whirlpool-hash-4way.h" #include "whirlpool-hash-4way.h"
#if defined(__AVX2__)
static __thread whirlpool_4way_context whirl_mid; static __thread whirlpool_4way_context whirl_mid;
void whirlpool_hash_4way( void *state, const void *input ) void whirlpool_hash_4way( void *state, const void *input )
@@ -50,7 +51,7 @@ void whirlpool_hash_4way( void *state, const void *input )
 }

 int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
-                             unsigned long *hashes_done )
+                             uint64_t *hashes_done )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -67,8 +68,8 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
    uint32_t *noncep2 = vdata + 77;
    uint32_t *noncep3 = vdata + 79;

-// if (opt_benchmark)
-//    ((uint32_t*)ptarget)[7] = 0x0000ff;
+   if (opt_benchmark)
+      ((uint32_t*)ptarget)[7] = 0x0000ff;

    for (int i=0; i < 19; i++)
       be32enc(&endiandata[i], pdata[i]);


@@ -2,14 +2,15 @@
 bool register_whirlpool_algo( algo_gate_t* gate )
 {
-//#if defined (WHIRLPOOL_4WAY)
-//  gate->scanhash = (void*)&scanhash_whirlpool_4way;
-//  gate->hash = (void*)&whirlpool_hash_4way;
-//#else
+#if defined (WHIRLPOOL_4WAY)
+  four_way_not_tested();
+  gate->scanhash = (void*)&scanhash_whirlpool_4way;
+  gate->hash = (void*)&whirlpool_hash_4way;
+#else
   gate->scanhash = (void*)&scanhash_whirlpool;
   gate->hash = (void*)&whirlpool_hash;
   init_whirlpool_ctx();
-//#endif
+#endif
   return true;
 };


@@ -8,13 +8,13 @@
 #define WHIRLPOOL_4WAY
 #endif

-//#if defined (WHIRLPOOL_4WAY)
-//void whirlpool_hash_4way(void *state, const void *input);
-//int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
-//                             uint64_t *hashes_done );
-//#endif
+#if defined (WHIRLPOOL_4WAY)
+
+void whirlpool_hash_4way(void *state, const void *input);
+int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done );
+
+#else

 void whirlpool_hash( void *state, const void *input );
@@ -22,3 +22,4 @@ int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done );

 #endif
+
+#endif


@@ -3365,6 +3365,8 @@ do { \
 // scalar array of constants "table" and return referenced 64 bit entry.
 #define t_lane( table, inv, row, lane ) \
    table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
+//    table[ t_row( inv, row )[ lane ] ];

 // Build a vector from elements of non-contiguous 64 bit data extracted from
 // scalar "table".

algo/x11/x11-gate.c (new file, 18 lines)

@@ -0,0 +1,18 @@
#include "x11-gate.h"
bool register_x11_algo( algo_gate_t* gate )
{
#if defined (X11_4WAY)
init_x11_4way_ctx();
gate->scanhash = (void*)&scanhash_x11_4way;
gate->hash = (void*)&x11_hash_4way;
#else
init_x11_ctx();
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/x11-gate.h (new file, 30 lines)

@@ -0,0 +1,30 @@
#ifndef X11_GATE_H__
#define X11_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
//#if defined(HASH_4WAY) && !defined(NO_AES_NI)
// #define X11_4WAY
//#endif
bool register_x11_algo( algo_gate_t* gate );
#if defined(X11_4WAY)
void x11_hash_4way( void *state, const void *input );
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void x11_hash( void *state, const void *input );
int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11_ctx();
#endif


@@ -1,5 +1,5 @@
#include "cpuminer-config.h" #include "cpuminer-config.h"
#include "algo-gate-api.h" #include "x11-gate.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
@@ -61,7 +61,7 @@ void init_x11_ctx()
 #endif
 }

-static void x11_hash( void *state, const void *input )
+void x11_hash( void *state, const void *input )
 {
     unsigned char hash[128] __attribute__ ((aligned (32)));
     unsigned char hashbuf[128] __attribute__ ((aligned (16)));
@@ -189,7 +189,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
     pdata[19] = n;
     return 0;
 }

+/*
 bool register_x11_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
@@ -199,4 +199,4 @@ bool register_x11_algo( algo_gate_t* gate )
gate->get_max64 = (void*)&get_max64_0x3ffff; gate->get_max64 = (void*)&get_max64_0x3ffff;
return true; return true;
}; };
*/

View File

@@ -440,7 +440,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate ) bool register_yescryptr16_algo( algo_gate_t* gate )
{ {
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT; gate->optimizations = SSE2_OPT | AVX_OPT;
gate->scanhash = (void*)&scanhash_yescrypt; gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash; gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target; gate->set_target = (void*)&scrypt_set_target;

296
avxdefs.h
View File

@@ -26,14 +26,6 @@
#define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a ) #define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a )
// Blend 128 bit vectors based on vector mask. Bits are copied from arg a0
// if corresponding mask bits are clear and from arg a1 if set.
// Should be faster than maskload.
// isn't working.
#define mm_merge( a0, a1, mask ) \
_mm_or_si128( _mm_and_si128( a0, mm_not( mask ) ), \
_mm_and_si128( a1, mask ) )
// Memory functions // Memory functions
// n = number of __m128i, bytes/16 // n = number of __m128i, bytes/16
@@ -59,7 +51,6 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
dst[i] = src[i]; dst[i] = src[i];
} }
// Pointer cast // Pointer cast
// p = any aligned pointer // p = any aligned pointer
@@ -80,47 +71,56 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
#define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \ #define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \
_mm_slli_epi64( w, 64-c ) ) _mm_slli_epi64( w, 64-c ) )
#define mm_rotl_64( w, c ) _mm_or_si128( _mm_slli_epi64( w, c ), \
_mm_srli_epi64( w, 64-c ) )
#define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \ #define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \
_mm_slli_epi32( w, 32-c ) ) _mm_slli_epi32( w, 32-c ) )
// Rotate vector elements #define mm_rotl_32( w, c ) _mm_or_si128( _mm_slli_epi32( w, c ), \
_mm_srli_epi32( w, 32-c ) )
// Rotate elements in vector
// Swap upper and lower 64 bits of 128 bit source vector // Swap upper and lower 64 bits of 128 bit source vector
// __m128i mm128_swap64( __m128i )
#define mm_swap_64(s) _mm_shuffle_epi32( s, 0x4e ) #define mm_swap_64(s) _mm_shuffle_epi32( s, 0x4e )
// Rotate 128 bit vector by one 32 bit element.
#define mm_rotr_1x32( w ) _mm_shuffle_epi32( w, 0x39 )
#define mm_rotl_1x32( w ) _mm_shuffle_epi32( w, 0x93 )
// Rotate 256 bits through two 128 bit vectors // Rotate 256 bits through two 128 bit vectors
// Swap 128 bit source vectors // Swap 128 bit source vectors in place.
// void mm128_swap128( __m128i, __m128i ) // void mm128_swap128( __m128i, __m128i )
// macro is better to update two args
#define mm_swap_128(s0, s1) s0 = _mm_xor_si128(s0, s1); \ #define mm_swap_128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
s1 = _mm_xor_si128(s0, s1); \ s1 = _mm_xor_si128(s0, s1); \
s0 = _mm_xor_si128(s0, s1); s0 = _mm_xor_si128(s0, s1);
// Rotate two 128 bit vectors as one 256 vector by 1 element // Rotate two 128 bit vectors in place as one 256 vector by 1 element
#define mm_rotl256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = mm_merge( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull ) );\
s1 = mm_merge( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull ) ); \
s0 = t; \
} while(0)
#define mm_rotr256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = mm_merge( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull ) );\
s1 = mm_merge( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull ) ); \
s0 = t; \
} while(0)
#define mm_rotl256_1x64( s0, s1 ) \ #define mm_rotl256_1x64( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
s0 = t; \
} while(0)
#define mm_rotr256_1x64( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
s0 = t; \
} while(0)
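// A scalar model for checking the two-vector rotates above: treat a[0..3]
// as the four 64 bit lanes of one 256 bit value. One step moves every
// lane to the adjacent index with wrap-around; which direction is named
// "left" depends on the lane-order convention. (Reference sketch under
// that assumption, not project code.)
#include <stdint.h>
static inline void rot256_1x64_ref( uint64_t a[4] )
{
   uint64_t t = a[3];
   a[3] = a[2]; a[2] = a[1]; a[1] = a[0]; a[0] = t;
}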
// Older, slower versions
#define mm_rotl256_1x64x( s0, s1 ) \
do { \ do { \
__m128i t; \ __m128i t; \
s0 = mm_swap_64( s0 ); \ s0 = mm_swap_64( s0 ); \
@@ -134,10 +134,10 @@ do { \
s0 = t; \ s0 = t; \
} while(0) } while(0)
#define mm_rotr256_1x64( s0, s1 ) \ #define mm_rotr256_1x64x( s0, s1 ) \
do { \ do { \
__m128i t; \ __m128i t; \
s0 = mm_swap_64( s0) ; \ s0 = mm_swap_64( s0 ) ; \
s1 = mm_swap_64( s1 ); \ s1 = mm_swap_64( s1 ); \
t = _mm_or_si128( \ t = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
@@ -148,6 +148,21 @@ do { \
s0 = t; \ s0 = t; \
} while(0) } while(0)
// Rotate 256 bits through two 128 bit vectors by n*32 bits and return
// the rotated s0.
// Similar to mm_rotr256_1x32 but only a partial rotation, as the rotated
// s1 is never produced. This makes it faster than a full rotation.
inline __m128i mm_rotr256_32( __m128i s0, __m128i s1, int n )
{
return _mm_or_si128( _mm_srli_si128( s0, n<<2 ),
_mm_slli_si128( s1, 16 - (n<<2) ) );
}
inline __m128i mm_rotl256_32( __m128i s0, __m128i s1, int n )
{
return _mm_or_si128( _mm_slli_si128( s0, n<<2 ),
_mm_srli_si128( s1, 16 - (n<<2) ) );
}
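// Note: _mm_srli_si128/_mm_slli_si128 take immediate byte counts, so the
// two functions above only compile where n folds to a compile time
// constant. A scalar sketch of the same partial rotate for reference
// (assumed byte-order model matching the intrinsics, not project code):
#include <stdint.h>
#include <string.h>
static inline void rotr256_32_ref( uint8_t d[16], const uint8_t s0[16],
                                   const uint8_t s1[16], int n )
{
   int b = n << 2;                  // n 32 bit words -> bytes
   memcpy( d, s0 + b, 16 - b );     // low bytes: s0 shifted down
   memcpy( d + 16 - b, s1, b );     // high bytes: filled from s1
}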
// Swap bytes in vector elements // Swap bytes in vector elements
inline __m128i mm_byteswap_32( __m128i x ) inline __m128i mm_byteswap_32( __m128i x )
@@ -161,6 +176,21 @@ inline __m128i mm_byteswap_32( __m128i x )
return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) ); return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) );
} }
inline __m128i mm_byteswap_64( __m128i x )
{
x = _mm_or_si128( _mm_srli_epi64( x, 32 ), _mm_slli_epi64( x, 32 ));
x = _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
_mm_slli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));
return _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
_mm_slli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
}
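// The three shift/mask steps above amount to a full 64 bit byte reversal;
// the scalar equivalent on GCC is __builtin_bswap64. A self-checking
// sketch (assumed test snippet, not project code):
#include <stdint.h>
static inline int byteswap64_ref_ok( void )
{
   uint64_t x = 0x0102030405060708ULL;
   x = ( x >> 32 ) | ( x << 32 );
   x = ( ( x & 0xFFFF0000FFFF0000ULL ) >> 16 )
     | ( ( x & 0x0000FFFF0000FFFFULL ) << 16 );
   x = ( ( x & 0xFF00FF00FF00FF00ULL ) >>  8 )
     | ( ( x & 0x00FF00FF00FF00FFULL ) <<  8 );
   return x == __builtin_bswap64( 0x0102030405060708ULL );  // 1 == match
}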
#if defined (__AVX2__) #if defined (__AVX2__)
@@ -180,13 +210,6 @@ inline __m128i mm_byteswap_32( __m128i x )
#define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a ) #define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a )
// Blend 256 bit vectors based on vector mask. Bits are copied from arg a0
// if corresponding mask bits are clear and from arg a1 if set.
// Should be faster than maskload.
#define mm256_merge( a0, a1, mask ) \
_mm256_or_si256( _mm256_and_si256( a0, mm256_not( mask ) ), \
_mm256_and_si256( a1, mask )
// Pack/Unpack two 128 bit vectors into/from one 256 bit vector // Pack/Unpack two 128 bit vectors into/from one 256 bit vector
// usefulness tbd // usefulness tbd
#define mm256_pack_2x128( hi, lo ) \ #define mm256_pack_2x128( hi, lo ) \
@@ -198,7 +221,7 @@ inline __m128i mm_byteswap_32( __m128i x )
// Memory functions // Memory functions
// n = number of __m256i (32 bytes) // n = number of 256 bit (32 byte) vectors
inline void memset_zero_256( __m256i *dst, int n ) inline void memset_zero_256( __m256i *dst, int n )
{ {
@@ -231,7 +254,7 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
// Rotate bits in vector elements // Rotate bits in vector elements
// Rotate bits in 4 uint64 (3 instructions) // Rotate bits in 64 bit elements
// w = packed 64 bit data, n= number of bits to rotate // w = packed 64 bit data, n= number of bits to rotate
#define mm256_rotr_64( w, c ) \ #define mm256_rotr_64( w, c ) \
_mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) ) _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )
@@ -239,19 +262,30 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
#define mm256_rotl_64( w, c ) \ #define mm256_rotl_64( w, c ) \
_mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) ) _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )
// Rotate vector elements // Rotate bits in 32 bit elements
#define mm256_rotr_32( w, c ) \
_mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32 - c) )
// Rotate 256 bits by 64 bits (4 uint64 by one uint64) #define mm256_rotl_32( w, c ) \
_mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32 - c) )
// Rotate elements in vector
// Rotate vector by one 64 bit element (aka two 32 bit elements)
//__m256i mm256_rotl256_1x64( _mm256i, int ) //__m256i mm256_rotl256_1x64( _mm256i, int )
#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 ) #define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 )
#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 ) #define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 )
// Same as 2x64 rotate in either direction // Swap 128 bit elements (aka rotate by two 64 bit, four 32 bit elements)
#define mm256_swap_128( w ) _mm256_permute2f128_si256( w, w, 1 ) #define mm256_swap_128( w ) _mm256_permute2f128_si256( w, w, 1 )
// Swap bytes in vector elements // Rotate by one 32 bit element (aka two 16 bit elements)
#define mm256_rotl256_1x32( w ) _mm256_shuffle_epi32( w, 0x93 )
#define mm256_rotr256_1x32( w ) _mm256_shuffle_epi32( w, 0x39 )
// Swap bytes in vector elements
inline __m256i mm256_byteswap_32( __m256i x ) inline __m256i mm256_byteswap_32( __m256i x )
{ {
__m256i x1 = _mm256_and_si256( x, _mm256_set1_epi32( 0x0000ff00 ) ); __m256i x1 = _mm256_and_si256( x, _mm256_set1_epi32( 0x0000ff00 ) );
@@ -269,14 +303,14 @@ inline __m256i mm256_byteswap_64( __m256i x )
x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 )); x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 ));
x = _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x, x = _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ), _mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
_mm256_slli_epi64( _mm256_and_si256( x, _mm256_slli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 )); _mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));
return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x, return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ), _mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
_mm256_slli_epi64( _mm256_and_si256( x, _mm256_slli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 )); _mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
} }
// Pseudo parallel aes // Pseudo parallel aes
@@ -287,7 +321,6 @@ inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )
mm256_unpack_2x128( hi, lo, x ); mm256_unpack_2x128( hi, lo, x );
mm256_unpack_2x128( khi, klo, k ); mm256_unpack_2x128( khi, klo, k );
lo = _mm_aesenc_si128( lo, klo ); lo = _mm_aesenc_si128( lo, klo );
hi = _mm_aesenc_si128( hi, khi ); hi = _mm_aesenc_si128( hi, khi );
@@ -299,7 +332,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
__m128i hi, lo; __m128i hi, lo;
mm256_unpack_2x128( hi, lo, x ); mm256_unpack_2x128( hi, lo, x );
lo = _mm_aesenc_si128( lo, mm_zero ); lo = _mm_aesenc_si128( lo, mm_zero );
hi = _mm_aesenc_si128( hi, mm_zero ); hi = _mm_aesenc_si128( hi, mm_zero );
@@ -308,32 +340,37 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
#endif // AVX2 #endif // AVX2
// AVX
// Paired functions for interleaving and deinterleaving data for vector // Paired functions for interleaving and deinterleaving data for vector
// processing. // processing.
// Size is specified in bits regardless of vector size to avoid pointer // Size is specified in bits regardless of vector size to avoid pointer
// arithmetic confusion with different size vectors and be consistent with // arithmetic confusion with different size vectors and be consistent with
// the function's name. // the function's name.
//
// Only 256, 512 and 640 bit length, (32, 64 & 80 bytes respectively) // Each function has 2 implementations, an optimized version that uses
// are supported. // vector indexing and a slower version that uses pointers. The optimized
// Buffer length is specified in bits to match the function naming format. // version can only be used with 64 bit elements and only supports sizes
// of 256, 512 or 640 bits (32, 64 and 80 bytes respectively).
//
// NOTE: Contrary to GCC documentation accessing vector elements using array
// indexes only works with 64 bit elements.
// Interleaving and deinterleaving of vectors of 32 bit elements
// must use the slower implementations that don't use vector indexing.
//
// All data must be aligned to 256 bits for AVX2, or 128 bits for AVX. // All data must be aligned to 256 bits for AVX2, or 128 bits for AVX.
// Interleave source args and deinterleave destination args are not required // Interleave source args and deinterleave destination args are not required
// to be contiguous but it's more efficient if they are. // to be contiguous in memory but it's more efficient if they are.
// Interleave source args may be the same actual arg repeated. // Interleave source args may be the same actual arg repeated.
// 640 bit deinterleaving 4x64 or 8x32 using 256 bit AVX2 requires the // 640 bit deinterleaving 4x64 using 256 bit AVX2 requires the
// destination buffers be defined with padding up to 768 bits for overrun // destination buffers be defined with padding up to 768 bits for overrun
// space. // space. Although overrun space use is non-destructive, it must not
// Overrrun space is not needed when interleaving or when deinterleaving // overlay useful data and its contents should be ignored by the caller.
// 4x32 using 128 bit AVX.
// Overrun space use is non destructive and should be ignored by the
// caller.
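// The layout contract described above, restated as scalar code for the
// 4x64 case (mirrors the slower pointer based implementations below;
// reference only):
#include <stdint.h>
static inline void interleave_4x64_ref( uint64_t *d, const uint64_t *s0,
      const uint64_t *s1, const uint64_t *s2, const uint64_t *s3,
      int bit_len )
{
   for ( int i = 0; i < bit_len >> 6; i++ )
   {
      d[ 4*i     ] = s0[i];      // lane k of word i comes from source k
      d[ 4*i + 1 ] = s1[i];
      d[ 4*i + 2 ] = s2[i];
      d[ 4*i + 3 ] = s3[i];
   }
}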
// interleave 4 arrays of 32 bit elements for AVX processing // SSE2 AVX
// interleave 4 arrays of 32 bit elements for 128 bit processing
// bit_len must be 256, 512 or 640 bits. // bit_len must be 256, 512 or 640 bits.
inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1, // Vector indexing doesn't work with 32 bit data.
inline void mm_interleave_4x32x( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, int bit_len ) const void *src2, const void *src3, int bit_len )
{ {
uint32_t *s0 = (uint32_t*)src0; uint32_t *s0 = (uint32_t*)src0;
@@ -346,7 +383,6 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
d[1] = _mm_set_epi32( s3[ 1], s2[ 1], s1[ 1], s0[ 1] ); d[1] = _mm_set_epi32( s3[ 1], s2[ 1], s1[ 1], s0[ 1] );
d[2] = _mm_set_epi32( s3[ 2], s2[ 2], s1[ 2], s0[ 2] ); d[2] = _mm_set_epi32( s3[ 2], s2[ 2], s1[ 2], s0[ 2] );
d[3] = _mm_set_epi32( s3[ 3], s2[ 3], s1[ 3], s0[ 3] ); d[3] = _mm_set_epi32( s3[ 3], s2[ 3], s1[ 3], s0[ 3] );
d[4] = _mm_set_epi32( s3[ 4], s2[ 4], s1[ 4], s0[ 4] ); d[4] = _mm_set_epi32( s3[ 4], s2[ 4], s1[ 4], s0[ 4] );
d[5] = _mm_set_epi32( s3[ 5], s2[ 5], s1[ 5], s0[ 5] ); d[5] = _mm_set_epi32( s3[ 5], s2[ 5], s1[ 5], s0[ 5] );
d[6] = _mm_set_epi32( s3[ 6], s2[ 6], s1[ 6], s0[ 6] ); d[6] = _mm_set_epi32( s3[ 6], s2[ 6], s1[ 6], s0[ 6] );
@@ -371,22 +407,27 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
d[19] = _mm_set_epi32( s3[19], s2[19], s1[19], s0[19] ); d[19] = _mm_set_epi32( s3[19], s2[19], s1[19], s0[19] );
} }
// interleave 4 arrays of 32 bit elements for AVX processing
// bit_len must be multiple of 32 // bit_len must be multiple of 32
inline void mm_interleave_4x32x( uint32_t *dst, uint32_t *src0, inline void mm_interleave_4x32( void *dst, void *src0, void *src1,
uint32_t *src1, uint32_t *src2, uint32_t *src3, int bit_len ) void *src2, void *src3, int bit_len )
{ {
uint32_t *d = dst;; uint32_t *d = (uint32_t*)dst;
uint32_t *s0 = (uint32_t*)src0;
uint32_t *s1 = (uint32_t*)src1;
uint32_t *s2 = (uint32_t*)src2;
uint32_t *s3 = (uint32_t*)src3;
for ( int i = 0; i < bit_len >> 5; i++, d += 4 ) for ( int i = 0; i < bit_len >> 5; i++, d += 4 )
{ {
*d = *(src0+i); *d = *(s0+i);
*(d+1) = *(src1+i); *(d+1) = *(s1+i);
*(d+2) = *(src2+i); *(d+2) = *(s2+i);
*(d+3) = *(src3+i); *(d+3) = *(s3+i);
} }
} }
inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, // doesn't work with 32 bit elements
inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len ) void *dst3, const void *src, int bit_len )
{ {
uint32_t *s = (uint32_t*)src; uint32_t *s = (uint32_t*)src;
@@ -428,17 +469,21 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
// deinterleave 4 arrays into individual buffers for scalar processing // deinterleave 4 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32 // bit_len must be multiple of 32
inline void mm_deinterleave_4x32x( uint32_t *dst0, uint32_t *dst1, inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
uint32_t *dst2,uint32_t *dst3, uint32_t *src, void *dst3, const void *src, int bit_len )
int bit_len )
{ {
uint32_t *s = src; uint32_t *s = (uint32_t*)src;
uint32_t *d0 = (uint32_t*)dst0;
uint32_t *d1 = (uint32_t*)dst1;
uint32_t *d2 = (uint32_t*)dst2;
uint32_t *d3 = (uint32_t*)dst3;
for ( int i = 0; i < bit_len >> 5; i++, s += 4 ) for ( int i = 0; i < bit_len >> 5; i++, s += 4 )
{ {
*(dst0+i) = *s; *(d0+i) = *s;
*(dst1+i) = *(s+1); *(d1+i) = *(s+1);
*(dst2+i) = *(s+2); *(d2+i) = *(s+2);
*(dst3+i) = *(s+3); *(d3+i) = *(s+3);
} }
} }
@@ -473,23 +518,27 @@ inline void mm256_interleave_4x64( void *dst, const void *src0,
d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] ); d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] );
} }
// Slower version
// interleave 4 arrays of 64 bit elements for AVX2 processing
// bit_len must be multiple of 64 // bit_len must be multiple of 64
inline void mm256_interleave_4x64x( uint64_t *dst, uint64_t *src0, inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1,
uint64_t *src1, uint64_t *src2, uint64_t *src3, int bit_len ) void *src2, void *src3, int bit_len )
{ {
uint64_t *d = dst; uint64_t *d = (uint64_t*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
uint64_t *s2 = (uint64_t*)src2;
uint64_t *s3 = (uint64_t*)src3;
for ( int i = 0; i < bit_len>>6; i++, d += 4 ) for ( int i = 0; i < bit_len>>6; i++, d += 4 )
{ {
*d = *(src0+i); *d = *(s0+i);
*(d+1) = *(src1+i); *(d+1) = *(s1+i);
*(d+2) = *(src2+i); *(d+2) = *(s2+i);
*(d+3) = *(src3+i); *(d+3) = *(s3+i);
} }
} }
// Deinterleave 4 buffers of 32 bit data from the source buffer. // Deinterleave 4 buffers of 64 bit data from the source buffer.
inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len ) void *dst3, const void *src, int bit_len )
{ {
@@ -520,25 +569,30 @@ inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] ); d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
} }
// Slower version
// Deinterleave 4 arrays into individual 64 bit arrays for scalar processing
// bit_len must be multiple of 64 // bit_len must be multiple of 64
inline void mm256_deinterleave_4x64x( uint64_t *dst0, uint64_t *dst1, inline void mm256_deinterleave_4x64x( void *dst0, void *dst1, void *dst2,
uint64_t *dst2,uint64_t *dst3, uint64_t *src, int bit_len ) void *dst3, void *src, int bit_len )
{ {
uint64_t *s = src; uint64_t *s = (uint64_t*)src;
for ( int i = 0; i < bit_len>>6; i++, s += 4 ) uint64_t *d0 = (uint64_t*)dst0;
uint64_t *d1 = (uint64_t*)dst1;
uint64_t *d2 = (uint64_t*)dst2;
uint64_t *d3 = (uint64_t*)dst3;
for ( int i = 0; i < bit_len>>6; i++, s += 4 )
{ {
*(dst0+i) = *s; *(d0+i) = *s;
*(dst1+i) = *(s+1); *(d1+i) = *(s+1);
*(dst2+i) = *(s+2); *(d2+i) = *(s+2);
*(dst3+i) = *(s+3); *(d3+i) = *(s+3);
} }
} }
// Interleave 8 source buffers containing 32 bit data into the destination // Interleave 8 source buffers containing 32 bit data into the destination
// buffer // vector
inline void mm256_interleave_8x32( void *dst, const void *src0, // Doesn't work: vector indexing fails for 32 bit elements
inline void mm256_interleave_8x32x( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3, const void *src4, const void *src1, const void *src2, const void *src3, const void *src4,
const void *src5, const void *src6, const void *src7, int bit_len ) const void *src5, const void *src6, const void *src7, int bit_len )
{ {
@@ -600,10 +654,9 @@ inline void mm256_interleave_8x32( void *dst, const void *src0,
s3[19], s2[19], s1[19], s0[19] ); s3[19], s2[19], s1[19], s0[19] );
} }
// Slower but it works with 32 bit data
// interleave 8 arrays of 32 bit elements for AVX2 processing
// bit_len must be multiple of 32 // bit_len must be multiple of 32
inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0, inline void mm256_interleave_8x32( uint32_t *dst, uint32_t *src0,
uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4, uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4,
uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len ) uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len )
{ {
@@ -622,7 +675,7 @@ inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0,
} }
// Deinterleave 8 buffers of 32 bit data from the source buffer. // Deinterleave 8 buffers of 32 bit data from the source buffer.
inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, inline void mm256_deinterleave_8x32x( void *dst0, void *dst1, void *dst2,
void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
const void *src, int bit_len ) const void *src, int bit_len )
{ {
@@ -703,10 +756,9 @@ inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
s[159], s[151], s[143], s[135] ); s[159], s[151], s[143], s[135] );
} }
// Deinterleave 8 arrays into individual buffers for scalar processing // Deinterleave 8 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32 // bit_len must be multiple of 32
inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1, inline void mm256_deinterleave_8x32( uint32_t *dst0, uint32_t *dst1,
uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5, uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5,
uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len ) uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len )
{ {
@@ -763,4 +815,4 @@ inline void mm_reinterleave_4x32( uint32_t *dst, uint64_t *src,
} }
#endif // __AVX2__ #endif // __AVX2__
#endif // AVX_DEF_H__ #endif // AVXDEFS_H__

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.5. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.6.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.5' PACKAGE_VERSION='3.7.6'
PACKAGE_STRING='cpuminer-opt 3.7.5' PACKAGE_STRING='cpuminer-opt 3.7.6'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.5 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.7.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.5:";; short | recursive ) echo "Configuration of cpuminer-opt 3.7.6:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.7.5 cpuminer-opt configure 3.7.6
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.5, which was It was created by cpuminer-opt $as_me 3.7.6, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.7.5' VERSION='3.7.6'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.7.5, which was This file was extended by cpuminer-opt $as_me 3.7.6, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.7.5 cpuminer-opt config.status 3.7.6
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.5]) AC_INIT([cpuminer-opt], [3.7.6])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -103,7 +103,7 @@ enum algos opt_algo = ALGO_NULL;
int opt_scrypt_n = 0; int opt_scrypt_n = 0;
int opt_pluck_n = 128; int opt_pluck_n = 128;
int opt_n_threads = 0; int opt_n_threads = 0;
int64_t opt_affinity = -1L; int64_t opt_affinity = -1;
int opt_priority = 0; int opt_priority = 0;
int num_cpus; int num_cpus;
char *rpc_url = NULL; char *rpc_url = NULL;
@@ -195,20 +195,27 @@ static inline void drop_policy(void)
#define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */ #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
#endif #endif
static void affine_to_cpu_mask(int id, unsigned long mask) { static void affine_to_cpu_mask( int id, unsigned long long mask )
cpu_set_t set; {
CPU_ZERO(&set); cpu_set_t set;
for (uint8_t i = 0; i < num_cpus; i++) { CPU_ZERO(&set);
// cpu mask int ncpus = (num_cpus > 256) ? 256 : num_cpus;
if (mask & (1UL<<i)) { CPU_SET(i, &set); }
} for ( int i = 0; i < ncpus; i++ )
if (id == -1) { {
// process affinity // cpu mask
sched_setaffinity(0, sizeof(&set), &set); if( (ncpus > 64) || ( mask & (1ULL << i) ) ) CPU_SET( i, &set );
} else { }
// thread only if ( id == -1 )
pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set); {
} // process affinity
sched_setaffinity(0, sizeof(set), &set);
}
else
{
// thread only
pthread_setaffinity_np(thr_info[id].pth, sizeof(set), &set);
}
} }
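// A note on the mask walk above: the shift literal must be 64 bits wide,
// since "1UL << i" truncates where long is 32 bits (e.g. 64 bit Windows),
// hence 1ULL. A standalone width-safe sketch of the walk (hypothetical
// helper, assuming the glibc cpu_set_t API with _GNU_SOURCE defined):
#include <sched.h>
static void mask_to_cpuset( unsigned long long mask, cpu_set_t *set )
{
   CPU_ZERO( set );
   for ( int i = 0; i < 64; i++ )       // at most 64 mask bits
      if ( mask & ( 1ULL << i ) )       // 1ULL keeps the shift 64 bit wide
         CPU_SET( i, set );
}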
#elif defined(WIN32) /* Windows */ #elif defined(WIN32) /* Windows */
@@ -1669,21 +1676,30 @@ static void *miner_thread( void *userdata )
drop_policy(); drop_policy();
} }
// CPU thread affinity // CPU thread affinity
if (num_cpus > 1) if ( num_cpus > 64 )
{ {
if (opt_affinity == -1 && opt_n_threads > 1) // opt_affinity ignored with more than 64 cpus.
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu %d",
thr_id, thr_id % num_cpus );
affine_to_cpu_mask( thr_id, -1 );
}
else if ( num_cpus > 1 )
{
if ( (opt_affinity == -1) && (opt_n_threads > 1) )
{ {
if (opt_debug) if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)", applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
thr_id, thr_id % num_cpus, ( 1 << (thr_id % num_cpus) ) ); thr_id, thr_id % num_cpus, ( 1 << (thr_id % num_cpus) ) );
affine_to_cpu_mask(thr_id, 1UL << (thr_id % num_cpus));
affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
} }
else if (opt_affinity != -1L) else if (opt_affinity != -1)
{ {
if (opt_debug) if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu mask %x", applog( LOG_DEBUG, "Binding thread %d to cpu mask %x",
thr_id, opt_affinity); thr_id, opt_affinity);
affine_to_cpu_mask(thr_id, (unsigned long)opt_affinity); affine_to_cpu_mask( thr_id, opt_affinity );
} }
} }
@@ -1822,10 +1838,10 @@ static void *miner_thread( void *userdata )
num_submitted++; num_submitted++;
} }
#if FOUR_WAY #if FOUR_WAY
if (num_submitted>1) if (num_submitted > 1)
applog(LOG_NOTICE, "4 WAY hash, %u nonces submitted," CL_MAG " BONUS!" CL_WHT, num_submitted); applog(LOG_NOTICE, "4 WAY hash nonces submitted: %u" CL_MAG " BONUS!" CL_N, num_submitted);
else else
applog(LOG_NOTICE, "4 WAY hash %u nonce submitted", num_submitted); applog(LOG_NOTICE, "4 WAY hash nonces submitted: %u", num_submitted);
#endif #endif
// must be a one way algo, nonce is already in work data // must be a one way algo, nonce is already in work data
if ( !num_submitted ) if ( !num_submitted )
@@ -1836,7 +1852,7 @@ else
break; break;
} }
#if FOUR_WAY #if FOUR_WAY
applog(LOG_NOTICE, "1 WAY hash 1 nonce submitted"); applog(LOG_NOTICE, "1 WAY hash nonce submitted");
#endif #endif
} }
@@ -2948,12 +2964,16 @@ bool check_cpu_capability ()
printf(".\nAlgo features:"); printf(".\nAlgo features:");
if ( algo_has_sse2 ) printf( " SSE2" ); if ( algo_features == EMPTY_SET ) printf( " None" );
if ( algo_has_aes ) printf( " AES" ); else
if ( algo_has_avx ) printf( " AVX" ); {
if ( algo_has_avx2 ) printf( " AVX2" ); if ( algo_has_sse2 ) printf( " SSE2" );
if ( algo_has_4way ) printf( " 4WAY" ); if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" ); if ( algo_has_avx ) printf( " AVX" );
if ( algo_has_avx2 ) printf( " AVX2" );
if ( algo_has_4way ) printf( " 4WAY" );
if ( algo_has_sha ) printf( " SHA" );
}
printf(".\n"); printf(".\n");
// Check for CPU and build incompatibilities // Check for CPU and build incompatibilities
@@ -3166,13 +3186,22 @@ int main(int argc, char *argv[])
SetPriorityClass(GetCurrentProcess(), prio); SetPriorityClass(GetCurrentProcess(), prio);
} }
#endif #endif
if (opt_affinity != -1)
{
if (!opt_quiet)
applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity);
affine_to_cpu_mask(-1, (unsigned long)opt_affinity);
}
if ( opt_affinity != -1 )
{
if ( num_cpus > 64 )
{
applog(LOG_WARNING,"--cpu-affinity argument is not supported with more");
applog(LOG_WARNING," than 64 CPUs, using default affinity.");
opt_affinity = -1;
}
else
{
if (!opt_quiet)
applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity);
affine_to_cpu_mask( -1, (unsigned long long)opt_affinity );
}
}
//#ifdef HAVE_SYSLOG_H //#ifdef HAVE_SYSLOG_H
// if (use_syslog) // if (use_syslog)

View File

@@ -506,6 +506,7 @@ enum algos {
ALGO_KECCAKC, ALGO_KECCAKC,
ALGO_LBRY, ALGO_LBRY,
ALGO_LUFFA, ALGO_LUFFA,
ALGO_LYRA2H,
ALGO_LYRA2RE, ALGO_LYRA2RE,
ALGO_LYRA2REV2, ALGO_LYRA2REV2,
ALGO_LYRA2Z, ALGO_LYRA2Z,
@@ -576,6 +577,7 @@ static const char* const algo_names[] = {
"keccakc", "keccakc",
"lbry", "lbry",
"luffa", "luffa",
"lyra2h",
"lyra2re", "lyra2re",
"lyra2rev2", "lyra2rev2",
"lyra2z", "lyra2z",
@@ -700,6 +702,7 @@ Options:\n\
keccakc Creative Coin\n\ keccakc Creative Coin\n\
lbry LBC, LBRY Credits\n\ lbry LBC, LBRY Credits\n\
luffa Luffa\n\ luffa Luffa\n\
lyra2h Hppcoin\n\
lyra2re lyra2\n\ lyra2re lyra2\n\
lyra2rev2 lyra2v2, Vertcoin\n\ lyra2rev2 lyra2v2, Vertcoin\n\
lyra2z Zcoin (XZC)\n\ lyra2z Zcoin (XZC)\n\