This commit is contained in:
Jay D Dee
2017-12-08 15:39:28 -05:00
parent 4b57ac0eb9
commit af1c940919
53 changed files with 1324 additions and 4790 deletions

12
AUTHORS
View File

@@ -16,4 +16,16 @@ LucasJones
tpruvot@github
elmad
djm34
palmd
ig0tik3d
Wolf0
Optiminer
Jay D Dee

View File

@@ -104,7 +104,9 @@ cpuminer_SOURCES = \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2re.c \
algo/lyra2/zcoin.c \
algo/lyra2/lyra2z-gate.c \
algo/lyra2/lyra2z.c \
algo/lyra2/lyra2z-4way.c \
algo/lyra2/lyra2z330.c \
algo/m7m.c \
algo/neoscrypt.c \

View File

@@ -35,8 +35,9 @@ Supported Algorithms
heavy Heavy
hmq1725 Espers
hodl Hodlcoin
jha jackpotcoin
keccak Keccak
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
lbry LBC, LBRY Credits
luffa Luffa
lyra2re lyra2
@@ -50,7 +51,7 @@ Supported Algorithms
pentablake Pentablake
phi1612 phi, LUX coin
pluck Pluck:128 (Supcoin)
polytimos
polytimos Ninja
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)

View File

@@ -164,6 +164,17 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.7.5
New algo keccakc for Creative coin with 4way optimizations
Rewrote some AVX/AVX2 code for more consistent implementation and some
optimizing.
Enhanced capabilities check to support 4way, more precise reporting of
features (not all algos use SSE2), and better error messages when using
an incompatible pre-built version (Windows users).
v3.7.4
Removed unnecessary build options.

View File

@@ -77,6 +77,12 @@ void algo_not_tested()
applog(LOG_WARNING,"and bad things may happen. Use at your own risk.");
}
// Emit a one-time warning at algo registration time that the 4way
// (vectorized) code path for the currently selected algo has not been
// validated; the user may see incorrect results or lower hash rate.
void four_way_not_tested()
{
applog( LOG_WARNING,"Algo %s has not been tested using 4way. It may not", algo_names[opt_algo] );
applog( LOG_WARNING,"work or may be slower. Please report your results.");
}
void algo_not_implemented()
{
applog(LOG_ERR,"Algo %s has not been Implemented.",algo_names[opt_algo]);
@@ -124,7 +130,7 @@ void init_algo_gate( algo_gate_t* gate )
gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
gate->stratum_handle_response = (void*)&std_stratum_handle_response;
gate->optimizations = SSE2_OPT;
gate->optimizations = EMPTY_SET;
gate->ntime_index = STD_NTIME_INDEX;
gate->nbits_index = STD_NBITS_INDEX;
gate->nonce_index = STD_NONCE_INDEX;
@@ -171,11 +177,12 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_HODL: register_hodl_algo ( gate ); break;
case ALGO_JHA: register_jha_algo ( gate ); break;
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
case ALGO_LBRY: register_lbry_algo ( gate ); break;
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
case ALGO_LYRA2Z: register_zcoin_algo ( gate ); break;
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: register_m7m_algo ( gate ); break;
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;

View File

@@ -90,7 +90,8 @@ typedef uint32_t set_t;
#define AES_OPT 2
#define AVX_OPT 4
#define AVX2_OPT 8
#define SHA_OPT 16
#define SHA_OPT 0x10
#define FOUR_WAY_OPT 0x20
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
@@ -156,7 +157,7 @@ bool return_false();
void *return_null();
void algo_not_tested();
void algo_not_implemented();
void four_way_not_tested();
// Warning: algo_gate.nonce_index should only be used in targetted code
// due to different behaviours by different targets. The JR2 index uses an

View File

@@ -9,18 +9,18 @@
void blakehash_4way(void *state, const void *input)
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
@@ -32,7 +32,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -49,7 +49,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
// we need big endian data...
swab32_array( endiandata, pdata, 20 );
m128_interleave_4x32( vdata, endiandata, endiandata, endiandata,
mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
endiandata, 640 );
uint32_t *noncep = vdata + 76; // 19*4

View File

@@ -13,11 +13,12 @@ bool register_blake_algo( algo_gate_t* gate )
// gate->scanhash = (void*)&scanhash_blake_8way;
// gate->hash = (void*)&blakehash_8way;
#if defined(BLAKE_4WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;
four_way_not_tested();
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_blake;
gate->hash = (void*)&blakehash;
#endif

View File

@@ -536,22 +536,22 @@ do { \
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_epi32( *(buf + 0) ); \
M[0x1] = mm_byteswap_epi32( *(buf + 1) ); \
M[0x2] = mm_byteswap_epi32( *(buf + 2) ); \
M[0x3] = mm_byteswap_epi32( *(buf + 3) ); \
M[0x4] = mm_byteswap_epi32( *(buf + 4) ); \
M[0x5] = mm_byteswap_epi32( *(buf + 5) ); \
M[0x6] = mm_byteswap_epi32( *(buf + 6) ); \
M[0x7] = mm_byteswap_epi32( *(buf + 7) ); \
M[0x8] = mm_byteswap_epi32( *(buf + 8) ); \
M[0x9] = mm_byteswap_epi32( *(buf + 9) ); \
M[0xA] = mm_byteswap_epi32( *(buf + 10) ); \
M[0xB] = mm_byteswap_epi32( *(buf + 11) ); \
M[0xC] = mm_byteswap_epi32( *(buf + 12) ); \
M[0xD] = mm_byteswap_epi32( *(buf + 13) ); \
M[0xE] = mm_byteswap_epi32( *(buf + 14) ); \
M[0xF] = mm_byteswap_epi32( *(buf + 15) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
M[0x3] = mm_byteswap_32( *(buf + 3) ); \
M[0x4] = mm_byteswap_32( *(buf + 4) ); \
M[0x5] = mm_byteswap_32( *(buf + 5) ); \
M[0x6] = mm_byteswap_32( *(buf + 6) ); \
M[0x7] = mm_byteswap_32( *(buf + 7) ); \
M[0x8] = mm_byteswap_32( *(buf + 8) ); \
M[0x9] = mm_byteswap_32( *(buf + 9) ); \
M[0xA] = mm_byteswap_32( *(buf + 10) ); \
M[0xB] = mm_byteswap_32( *(buf + 11) ); \
M[0xC] = mm_byteswap_32( *(buf + 12) ); \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -601,22 +601,22 @@ do { \
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M0 = mm_byteswap_epi32( * buf ); \
M1 = mm_byteswap_epi32( *(buf+1) ); \
M2 = mm_byteswap_epi32( *(buf+2) ); \
M3 = mm_byteswap_epi32( *(buf+3) ); \
M4 = mm_byteswap_epi32( *(buf+4) ); \
M5 = mm_byteswap_epi32( *(buf+5) ); \
M6 = mm_byteswap_epi32( *(buf+6) ); \
M7 = mm_byteswap_epi32( *(buf+7) ); \
M8 = mm_byteswap_epi32( *(buf+8) ); \
M9 = mm_byteswap_epi32( *(buf+9) ); \
MA = mm_byteswap_epi32( *(buf+10) ); \
MB = mm_byteswap_epi32( *(buf+11) ); \
MC = mm_byteswap_epi32( *(buf+12) ); \
MD = mm_byteswap_epi32( *(buf+13) ); \
ME = mm_byteswap_epi32( *(buf+14) ); \
MF = mm_byteswap_epi32( *(buf+15) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -722,22 +722,22 @@ do { \
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_epi64( *(buf+0) ); \
M[0x1] = mm256_byteswap_epi64( *(buf+1) ); \
M[0x2] = mm256_byteswap_epi64( *(buf+2) ); \
M[0x3] = mm256_byteswap_epi64( *(buf+3) ); \
M[0x4] = mm256_byteswap_epi64( *(buf+4) ); \
M[0x5] = mm256_byteswap_epi64( *(buf+5) ); \
M[0x6] = mm256_byteswap_epi64( *(buf+6) ); \
M[0x7] = mm256_byteswap_epi64( *(buf+7) ); \
M[0x8] = mm256_byteswap_epi64( *(buf+8) ); \
M[0x9] = mm256_byteswap_epi64( *(buf+9) ); \
M[0xA] = mm256_byteswap_epi64( *(buf+10) ); \
M[0xB] = mm256_byteswap_epi64( *(buf+11) ); \
M[0xC] = mm256_byteswap_epi64( *(buf+12) ); \
M[0xD] = mm256_byteswap_epi64( *(buf+13) ); \
M[0xE] = mm256_byteswap_epi64( *(buf+14) ); \
M[0xF] = mm256_byteswap_epi64( *(buf+15) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
M[0x3] = mm256_byteswap_64( *(buf+3) ); \
M[0x4] = mm256_byteswap_64( *(buf+4) ); \
M[0x5] = mm256_byteswap_64( *(buf+5) ); \
M[0x6] = mm256_byteswap_64( *(buf+6) ); \
M[0x7] = mm256_byteswap_64( *(buf+7) ); \
M[0x8] = mm256_byteswap_64( *(buf+8) ); \
M[0x9] = mm256_byteswap_64( *(buf+9) ); \
M[0xA] = mm256_byteswap_64( *(buf+10) ); \
M[0xB] = mm256_byteswap_64( *(buf+11) ); \
M[0xC] = mm256_byteswap_64( *(buf+12) ); \
M[0xD] = mm256_byteswap_64( *(buf+13) ); \
M[0xE] = mm256_byteswap_64( *(buf+14) ); \
M[0xF] = mm256_byteswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -787,22 +787,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_byteswap_epi64( *(buf + 0) ); \
M1 = mm256_byteswap_epi64( *(buf + 1) ); \
M2 = mm256_byteswap_epi64( *(buf + 2) ); \
M3 = mm256_byteswap_epi64( *(buf + 3) ); \
M4 = mm256_byteswap_epi64( *(buf + 4) ); \
M5 = mm256_byteswap_epi64( *(buf + 5) ); \
M6 = mm256_byteswap_epi64( *(buf + 6) ); \
M7 = mm256_byteswap_epi64( *(buf + 7) ); \
M8 = mm256_byteswap_epi64( *(buf + 8) ); \
M9 = mm256_byteswap_epi64( *(buf + 9) ); \
MA = mm256_byteswap_epi64( *(buf + 10) ); \
MB = mm256_byteswap_epi64( *(buf + 11) ); \
MC = mm256_byteswap_epi64( *(buf + 12) ); \
MD = mm256_byteswap_epi64( *(buf + 13) ); \
ME = mm256_byteswap_epi64( *(buf + 14) ); \
MF = mm256_byteswap_epi64( *(buf + 15) ); \
M0 = mm256_byteswap_64( *(buf + 0) ); \
M1 = mm256_byteswap_64( *(buf + 1) ); \
M2 = mm256_byteswap_64( *(buf + 2) ); \
M3 = mm256_byteswap_64( *(buf + 3) ); \
M4 = mm256_byteswap_64( *(buf + 4) ); \
M5 = mm256_byteswap_64( *(buf + 5) ); \
M6 = mm256_byteswap_64( *(buf + 6) ); \
M7 = mm256_byteswap_64( *(buf + 7) ); \
M8 = mm256_byteswap_64( *(buf + 8) ); \
M9 = mm256_byteswap_64( *(buf + 9) ); \
MA = mm256_byteswap_64( *(buf + 10) ); \
MB = mm256_byteswap_64( *(buf + 11) ); \
MC = mm256_byteswap_64( *(buf + 12) ); \
MD = mm256_byteswap_64( *(buf + 13) ); \
ME = mm256_byteswap_64( *(buf + 14) ); \
MF = mm256_byteswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -870,7 +870,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
if ( len < buf_size - ptr )
{
memcpy_m128i( buf + (ptr>>2), vdata, len>>2 );
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
@@ -884,7 +884,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_m128i( buf + (ptr>>2), vdata, clen>>2 );
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
@@ -936,32 +936,32 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if ( ptr <= 48 )
{
memset_zero_m128i( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_m128i( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
memset_zero_m128i( u.buf, 56>>2 );
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_epi32( sc->H[k] );
out[k] = mm_byteswap_32( sc->H[k] );
}
#if defined (__AVX2__)
@@ -995,7 +995,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
@@ -1009,7 +1009,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
@@ -1062,44 +1062,44 @@ blake64_4way_close( blake_4way_big_context *sc,
}
if ( ptr <= 104 )
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 ) );
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
*(u.buf+(120>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
memset_zero_m256i( u.buf, 112>>3 );
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 );
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
*(u.buf+(120>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_byteswap_epi64( sc->H[k] );
out[k] = mm256_byteswap_64( sc->H[k] );
}
#endif

View File

@@ -13,17 +13,17 @@ static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
m128_deinterleave_4x32( sin0, sin1, sin2, sin3, (uint32_t*)input, 180*8 );
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
void *tail = input + DECRED_MIDSTATE_LEN;
int tail_len = 180 - DECRED_MIDSTATE_LEN;
@@ -53,7 +53,7 @@ void decred_hash_4way( void *state, const void *input )
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
/*
for ( int i = 0; i < 8; i++ )
if ( hash[i] != hash0[i] )
@@ -79,7 +79,7 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t vdata[45*4] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) endiandata[48];
// uint32_t _ALIGN(64) hash32[8];
uint32_t *pdata = work->data;
@@ -97,7 +97,8 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
// memcpy(endiandata, pdata, 180);
m128_interleave_4x32( vdata, pdata, pdata, pdata, pdata, 180*8 );
// use the old way until new way updated for size.
mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {

View File

@@ -144,7 +144,8 @@ bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
bool register_decred_algo( algo_gate_t* gate )
{
#if defined(DECRED_4WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_decred_4way;
gate->hash = (void*)&decred_hash_4way;
#else
@@ -153,9 +154,6 @@ bool register_decred_algo( algo_gate_t* gate )
gate->hash = (void*)&decred_hash;
#endif
// gate->optimizations = SSE2_OPT;
// gate->scanhash = (void*)&scanhash_decred;
// gate->hash = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->display_extra_data = (void*)&decred_decode_extradata;

View File

@@ -30,13 +30,13 @@ extern void pentablakehash_4way( void *output, const void *input )
blake512_4way_close( &ctx, vhash );
uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
m256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
sph_blake512_context ctx2_blake;
sph_blake512_init(&ctx2_blake);
sph_blake512(&ctx2_blake, sin0, 80);
sph_blake512_close(&ctx2_blake, (void*) hash);
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
uint64_t* hash64 = (uint64_t*)hash;
for( int i = 0; i < 8; i++ )
{
@@ -60,7 +60,7 @@ for( int i = 0; i < 8; i++ )
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
@@ -141,7 +141,7 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
{

View File

@@ -3,13 +3,13 @@
// Install the pentablake scanhash/hash entry points into the algo gate.
// The 4way implementation is selected at compile time via PENTABLAKE_4WAY.
bool register_pentablake_algo( algo_gate_t* gate )
{
#if defined (PENTABLAKE_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_pentablake_4way;
gate->hash = (void*)&pentablakehash_4way;
#else
gate->scanhash = (void*)&scanhash_pentablake;
gate->hash = (void*)&pentablakehash;
#endif
// NOTE(review): this unconditionally overwrites the optimizations value
// assigned in the 4way branch above — confirm that is intended.
gate->optimizations = FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -95,13 +95,13 @@ extern "C"{
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
x3 = mm256_bitnot( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_bitnot( x2 ) ) ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_bitnot( x1 ), x2 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_bitnot( x3 ) ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
@@ -532,7 +532,7 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
if ( len < (buf_size - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
@@ -546,7 +546,7 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
len -= clen;
@@ -579,7 +579,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
else
numz = 112 - sc->ptr;
memset_zero_m256i( buf+1, (numz>>3) - 1 );
memset_zero_256( buf+1, (numz>>3) - 1 );
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
l1 = SPH_T64(sc->block_count >> 55);
@@ -593,7 +593,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
for ( u=0; u < 8; u++ )
buf[u] = sc->H[u+8];
memcpy_m256i( dst256, buf, 8 );
memcpy_256( dst256, buf, 8 );
}
void

View File

@@ -1,11 +1,12 @@
#if defined(JHA_4WAY)
#include "jha-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
//#include "avxdefs.h"
#if defined(JHA_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
@@ -15,19 +16,19 @@
//static __thread keccak512_4way_context jha_kec_mid
// __attribute__ ((aligned (64)));
void jha_hash_4way( void *output, const void *input )
void jha_hash_4way( void *out, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhasha[8*4] __attribute__ ((aligned (64)));
uint64_t vhashb[8*4] __attribute__ ((aligned (64)));
__m256i mask;
__m256i* vh256 = (__m256i*)vhash;
__m256i* vha256 = (__m256i*)vhasha;
__m256i* vhb256 = (__m256i*)vhashb;
uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
__m256i mask0, mask1;
__m256i* vh = (__m256i*)vhash;
__m256i* vh0 = (__m256i*)vhash0;
__m256i* vh1 = (__m256i*)vhash1;
blake512_4way_context ctx_blake;
hashState_groestl ctx_groestl;
@@ -40,21 +41,29 @@ void jha_hash_4way( void *output, const void *input )
keccak512_4way_close( &ctx_keccak, vhash );
// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
// keccak512_4way( &ctx_keccak, input+64, 16 );
// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
// keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
for ( int round = 0; round < 3; round++ )
{
memset_zero_m256i( vha256, 20 );
memset_zero_m256i( vhb256, 20 );
// memset_zero_256( vh0, 20 );
// memset_zero_256( vh1, 20 );
mask = _mm256_sub_epi64( _mm256_and_si256( vh256[0],
mm256_vec_epi64( 0x1 ) ), mm256_vec_epi64( 0x1 ) );
// positive logic, if maski select vhi
// going from bit to mask reverses logic such that if the test bit is set
// zero will be put in mask0, meaning don't take vh0. mask1 is
// inverted so 1 will be put in mask1 meaning take it.
mask0 = mm256_negate_64(
_mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
mask1 = mm256_not( mask0 );
// mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
// _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
// groestl (serial) v skein
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
@@ -71,58 +80,66 @@ void jha_hash_4way( void *output, const void *input )
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(char*)hash3, 512 );
m256_interleave_4x64( vhasha, hash0, hash1, hash2, hash3, 512 );
mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
// skein
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhashb );
skein512_4way_close( &ctx_skein, vhash1 );
// merge vectored hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_bitnot(mask ) );
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
*/
}
// blake v jh
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, vhash, 64 );
blake512_4way_close( &ctx_blake, vhasha );
blake512_4way_close( &ctx_blake, vhash0 );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhashb );
jh512_4way_close( &ctx_jh, vhash1 );
// merge vectored hash
// merge hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_bitnot(mask ) );
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
*/
}
}
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
// memcpy( output, hash0, 32 );
// memcpy( output+32, hash1, 32 );
// memcpy( output+64, hash2, 32 );
// memcpy( output+96, hash3, 32 );
}
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
@@ -160,7 +177,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate for keccak
// keccak512_4way_init( &jha_kec_mid );

View File

@@ -3,15 +3,16 @@
bool register_jha_algo( algo_gate_t* gate )
{
//#if defined (JHA_4WAY)
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
// gate->scanhash = (void*)&scanhash_jha_4way;
// gate->hash = (void*)&jha_hash_4way;
//#else
gate->optimizations = SSE2_OPT | AES_OPT;
#if defined (JHA_4WAY)
four_way_not_tested();
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_jha_4way;
gate->hash = (void*)&jha_hash_4way;
#else
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_jha;
gate->hash = (void*)&jha_hash;
//#endif
#endif
gate->set_target = (void*)&scrypt_set_target;
return true;
};

View File

@@ -9,19 +9,17 @@
#define JHA_4WAY
#endif
//#if defined JHA_4WAY
//void jha_hash_4way( void *state, const void *input );
#if defined JHA_4WAY
void jha_hash_4way( void *state, const void *input );
//int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
//#else
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void jha_hash( void *state, const void *input );
int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
//#endif
#endif

View File

@@ -1,39 +1,30 @@
#include "keccak-gate.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#ifdef KECCAK_4WAY
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
void keccakhash_4way(void *state, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash[4*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way( &ctx, input, 80 );
keccak256_4way_close( &ctx, vhash );
m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -52,7 +43,7 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64x( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;

View File

@@ -9,19 +9,38 @@ int64_t keccak_get_max64() { return 0x7ffffLL; }
bool register_keccak_algo( algo_gate_t* gate )
{
gate->optimizations = FOUR_WAY_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->set_target = (void*)&keccak_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
#if defined (KECCAK_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
#endif
return true;
};
void keccakc_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_keccakc_algo( algo_gate_t* gate )
{
gate->optimizations = FOUR_WAY_OPT;
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
gate->set_target = (void*)&keccakc_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
#if defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
#endif
return true;
};

View File

@@ -1,5 +1,5 @@
#ifndef __KECCAK_GATE_H__
#define __KECCAK_GATE_H__
#ifndef KECCAK_GATE_H__
#define KECCAK_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>

View File

@@ -54,10 +54,6 @@ static const sph_u64 RC[] = {
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
#define mm256_neg1 \
(_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
0xffffffffffffffff, 0xffffffffffffffff ) )
#define DECL64(x) __m256i x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
@@ -403,7 +399,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
if ( len < (lim - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
kc->ptr = ptr + len;
return;
}
@@ -416,7 +412,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
clen = (lim - ptr);
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
@@ -453,7 +449,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
memset_zero_m256i( u.tmp + 1, (j>>3) - 2 );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000,
0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
}
@@ -467,7 +463,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
NOT64( kc->w[20], kc->w[20] );
for ( j = 0; j < m256_len; j++ )
u.tmp[j] = kc->w[j];
memcpy_m256i( dst, u.tmp, m256_len );
memcpy_256( dst, u.tmp, m256_len );
}
void keccak256_4way_init( void *kc )

View File

@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes )
{
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) );
casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
{
// padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_byteswap_epi32( cast_m128i( data ) ) );
mm_byteswap_32( cast_m128i( data ) ) );
}
else
{
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
rnd512( state, zero, zero );
@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
}
#else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
rnd512( state, zero, zero );
@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 2 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
}
#endif

View File

@@ -377,7 +377,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;
/*
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
@@ -385,7 +385,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
#else
memset(wholeMatrix, 0, i);
#endif
*/
uint64_t *ptrWord = wholeMatrix;
//=== Getting the password + salt + basil padded with 10*1 ==========//

View File

@@ -128,34 +128,10 @@ void lyra2re_set_target ( struct work* work, double job_diff )
work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
}
/*
bool lyra2re_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
lyra2re_wholeMatrix = _mm_malloc( i, 64 );
if ( lyra2re_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)lyra2re_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)lyra2re_wholeMatrix, i/16 );
#else
memset( lyra2re_wholeMatrix, 0, i );
#endif
return true;
}
*/
bool register_lyra2re_algo( algo_gate_t* gate )
{
init_lyra2re_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
// gate->miner_thread_init = (void*)&lyra2re_thread_init;
gate->scanhash = (void*)&scanhash_lyra2re;
gate->hash = (void*)&lyra2re_hash;
gate->get_max64 = (void*)&lyra2re_get_max64;

View File

@@ -132,23 +132,13 @@ bool lyra2rev2_thread_init()
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
if ( l2v2_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)l2v2_wholeMatrix, i/32 );
#elif defined (__AVX__)
memset_zero_m128i( (__m128i*)l2v2_wholeMatrix, i/16 );
#else
memset( l2v2_wholeMatrix, 0, i );
#endif
return true;
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
init_lyra2rev2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;

168
algo/lyra2/lyra2z-4way.c Normal file
View File

@@ -0,0 +1,168 @@
#include "lyra2z-gate.h"
#ifdef LYRA2Z_4WAY
#include <memory.h>
#include <mm_malloc.h>
//#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
//#include "avxdefs.h"
// same size, only difference is the name, lyra2 is done serially
__thread uint64_t* lyra2z_4way_matrix;
bool lyra2z_4way_thread_init()
{
return ( lyra2z_4way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
blake256_4way_init( &l2z_4way_blake_mid );
blake256_4way( &l2z_4way_blake_mid, input, 64 );
}
// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void lyra2z_4way_hash( void *state, const void *input )
{
// uint32_t _ALIGN(64) hash[16];
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
// memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
// blake256_4way( &ctx_blake, input + (64*4), 16 );
// blake256_4way_close( &ctx_blake, vhash );
blake256_4way_init( &ctx_blake );
blake256_4way( &ctx_blake, input, 80 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
// memcpy(state, hash, 32);
}
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
// uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
// lyra2z_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2z_4way_hash( hash, vdata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
printf("found 0\n");
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
/* if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
*/
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
printf("found 2\n");
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
/*
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
*/
n += 2;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
/*
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
*/

28
algo/lyra2/lyra2z-gate.c Normal file
View File

@@ -0,0 +1,28 @@
#include "lyra2z-gate.h"
#include "lyra2.h"
void lyra2z_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2z_algo( algo_gate_t* gate )
{
#ifdef LYRA2Z_4WAY
four_way_not_tested();
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
#else
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
return true;
};

33
algo/lyra2/lyra2z-gate.h Normal file
View File

@@ -0,0 +1,33 @@
#ifndef LYRA2Z_GATE_H__
#define LYRA2Z_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY)
#define LYRA2Z_4WAY
#endif
#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8
#if defined(LYRA2Z_4WAY)
void lyra2z_4way_hash( void *state, const void *input );
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_4way_thread_init();
#endif
void lyra2z_hash( void *state, const void *input );
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_thread_init();
#endif

View File

@@ -1,40 +1,49 @@
#include <memory.h>
#include <mm_malloc.h>
#include "algo-gate-api.h"
#include "lyra2z-gate.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"
__thread uint64_t* zcoin_wholeMatrix;
__thread uint64_t* lyra2z_matrix;
static __thread sph_blake256_context zcoin_blake_mid;
void zcoin_midstate( const void* input )
bool lyra2z_thread_init()
{
sph_blake256_init( &zcoin_blake_mid );
sph_blake256( &zcoin_blake_mid, input, 64 );
// const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
const int i = BLOCK_LEN_INT64 * 8 * 8 * 8;
lyra2z_matrix = _mm_malloc( i, 64 );
return lyra2z_matrix;
}
static __thread sph_blake256_context lyra2z_blake_mid;
void lyra2z_midstate( const void* input )
{
sph_blake256_init( &lyra2z_blake_mid );
sph_blake256( &lyra2z_blake_mid, input, 64 );
}
// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void zcoin_hash(void *state, const void *input )
void lyra2z_hash( void *state, const void *input )
{
uint32_t _ALIGN(64) hash[16];
sph_blake256_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &zcoin_blake_mid, sizeof zcoin_blake_mid );
memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof lyra2z_blake_mid );
sph_blake256( &ctx_blake, input + 64, 16 );
sph_blake256_close( &ctx_blake, hash );
LYRA2Z( zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
memcpy(state, hash, 32);
}
int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(64) hash[8];
@@ -52,11 +61,11 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]);
}
zcoin_midstate( endiandata );
lyra2z_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
zcoin_hash( hash, endiandata );
lyra2z_hash( hash, endiandata );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
@@ -73,50 +82,41 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
/*
//int64_t get_max64_0xffffLL() { return 0xffffLL; };
void zcoin_set_target( struct work* work, double job_diff )
void lyra2z_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
/*
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
*/
bool zcoin_thread_init()
bool lyra2z_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
zcoin_wholeMatrix = _mm_malloc( i, 64 );
lyra2z_wholeMatrix = _mm_malloc( i, 64 );
if ( zcoin_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)zcoin_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)zcoin_wholeMatrix, i/16 );
#else
memset( zcoin_wholeMatrix, 0, i );
#endif
return true;
return lyra2z_wholeMatrix;
}
bool register_zcoin_algo( algo_gate_t* gate )
bool register_lyra2z_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&zcoin_thread_init;
gate->scanhash = (void*)&scanhash_zcoin;
gate->hash = (void*)&zcoin_hash;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&zcoin_set_target;
gate->set_target = (void*)&lyra2z_set_target;
// gate->prevent_dupes = (void*)&zcoin_get_work_height;
return true;
};
*/

View File

@@ -64,22 +64,12 @@ bool lyra2z330_thread_init()
int i = (int64_t)ROW_LEN_BYTES * 330; // nRows;
lyra2z330_wholeMatrix = _mm_malloc( i, 64 );
if ( lyra2z330_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)lyra2z330_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)lyra2z330_wholeMatrix, i/16 );
#else
memset( lyra2z330_wholeMatrix, 0, i );
#endif
return true;
return lyra2z330_wholeMatrix;
}
bool register_lyra2z330_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z330_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z330;
gate->hash = (void*)&lyra2z330_hash;

View File

@@ -130,12 +130,12 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
memcpy_m256i( out, state, BLOCK_LEN_M256I );
memcpy_256( out, state, BLOCK_LEN_M256I );
LYRA_ROUND_AVX2( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I;
}
//Squeezes remaining bytes
memcpy_m256i( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
#elif defined (__AVX__)
@@ -148,13 +148,13 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
memcpy_m128i( out, state, BLOCK_LEN_M128I );
memcpy_128( out, state, BLOCK_LEN_M128I );
LYRA_ROUND_AVX( state[0], state[1], state[2], state[3],
state[4], state[5], state[6], state[7] );
out += BLOCK_LEN_M128I;
}
//Squeezes remaining bytes
memcpy_m128i( out, state, ( len_m128i % BLOCK_LEN_M128I ) );
memcpy_128( out, state, ( len_m128i % BLOCK_LEN_M128I ) );
#else

View File

@@ -66,11 +66,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotl256_1x64( s1); \
s2 = mm256_swap128( s2 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotr256_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotr256_1x64( s1 ); \
s2 = mm256_swap128( s2 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotl256_1x64( s3 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
@@ -105,14 +105,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rotl256_1x64( s2, s3 ); \
mm128_swap128( s4, s5 ); \
mm128_rotr256_1x64( s6, s7 ); \
mm_rotl256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotr256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rotr256_1x64( s2, s3 ); \
mm128_swap128( s4, s5 ); \
mm128_rotl256_1x64( s6, s7 );
mm_rotr256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotl256_1x64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -15,7 +15,7 @@
// no improvement with midstate
//static __thread blake512_4way_context ctx_mid;
void nist5hash_4way( void *output, const void *input )
void nist5hash_4way( void *out, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
@@ -35,7 +35,7 @@ void nist5hash_4way( void *output, const void *input )
blake512_4way( &ctx_blake, input, 80 );
blake512_4way_close( &ctx_blake, vhash );
m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
@@ -50,7 +50,7 @@ void nist5hash_4way( void *output, const void *input )
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(const char*)hash3, 512 );
m256_interleave_4x64x( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
@@ -64,12 +64,7 @@ void nist5hash_4way( void *output, const void *input )
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhash );
m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
}
int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -109,7 +104,7 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate
// blake512_4way_init( &ctx_mid );

View File

@@ -2,12 +2,11 @@
bool register_nist5_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
#if defined (NIST5_4WAY)
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_nist5_4way;
gate->hash = (void*)&nist5hash_4way;
#else
gate->optimizations = SSE2_OPT | AES_OPT;
init_nist5_ctx();
gate->scanhash = (void*)&scanhash_nist5;
gate->hash = (void*)&nist5hash;

View File

@@ -20,7 +20,7 @@ void skeinhash_4way( void *state, const void *input )
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
@@ -38,21 +38,20 @@ void skeinhash_4way( void *state, const void *input )
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
memcpy( (char*)state, (char*)hash0, 32 );
memcpy( ((char*)state) + 32, (char*)hash1, 32 );
memcpy( ((char*)state) + 64, (char*)hash2, 32 );
memcpy( ((char*)state) + 96, (char*)hash3, 32 );
memcpy( state, hash0, 32 );
memcpy( state + 32, hash1, 32 );
memcpy( state + 64, hash2, 32 );
memcpy( state + 96, hash3, 32 );
}
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *edata = (uint64_t*)endiandata;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
@@ -63,9 +62,9 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
// data is 80 bytes, 20 u32 or 4 u64.
swab32_array( endiandata, pdata, 20 );
swab32_array( edata, pdata, 20 );
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;

View File

@@ -6,8 +6,8 @@ int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
#if defined (SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#else

View File

@@ -463,7 +463,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
if ( len <= buf_size - ptr )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
sc->ptr = ptr + len;
return;
}
@@ -483,7 +483,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
len -= clen;
@@ -520,11 +520,11 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
READ_STATE_BIG(sc);
memset_zero_m256i( buf + (ptr>>3), (buf_size - ptr) >> 3 );
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
memset_zero_m256i( buf, buf_size >> 3 );
memset_zero_256( buf, buf_size >> 3 );
bcount = 0;
UBI_BIG_4WAY( 510, 8 );
@@ -537,7 +537,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
buf[6] = h6;
buf[7] = h7;
memcpy_m256i( dst, buf, out_len >> 3 );
memcpy_256( dst, buf, out_len >> 3 );
}
static const sph_u64 IV256[] = {

View File

@@ -19,13 +19,13 @@ void skein2hash_4way( void *output, const void *input )
skein512_4way( &ctx, hash, 64 );
skein512_4way_close( &ctx, hash );
m256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
mm256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
}
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint64_t *edata = (uint64_t*)endiandata;
@@ -41,7 +41,7 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( endiandata, pdata, 20 );
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;

View File

@@ -9,12 +9,12 @@ int64_t skein2_get_max64 ()
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = FOUR_WAY_OPT;
#if defined (FOUR_WAY) && defined (__AVX2__)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
four_way_not_tested();
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif

View File

@@ -31,7 +31,7 @@ void tribus_hash_4way(void *state, const void *input)
keccak512_4way( &ctx_keccak, vhash, 64 );
keccak512_4way_close( &ctx_keccak, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// hash echo serially
init_echo( &ctx_echo, 512 );
@@ -92,7 +92,7 @@ int scanhash_tribus_4way(int thr_id, struct work *work, uint32_t max_nonce, uint
}
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate
// doing it one way then then interleaving would be faster but too

View File

@@ -14,15 +14,13 @@ bool tribus_thread_init()
*/
bool register_tribus_algo( algo_gate_t* gate )
{
// gate->miner_thread_init = (void*)&tribus_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x1ffff;
#if defined (TRIBUS_4WAY)
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_tribus_4way;
gate->hash = (void*)&tribus_hash_4way;
#else
gate->miner_thread_init = (void*)&tribus_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_tribus;
gate->hash = (void*)&tribus_hash;
#endif

View File

@@ -140,7 +140,7 @@ HASH ( void *cc, const void *data, size_t len )
clen = SPH_BLEN - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( sc->buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
@@ -195,19 +195,19 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
uint64_t *b= (uint64_t*)sc->buf;
uint64_t *s= (uint64_t*)sc->state;
//uint64_t *b= (uint64_t*)sc->buf;
//uint64_t *s= (uint64_t*)sc->state;
//printf("Vptr 1= %u\n", ptr);
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
#ifdef PW01
sc->buf[ptr>>3] = mm256_vec_epi64( 0x100 >> 8 );
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 );
// sc->buf[ptr++] = 0x100 >> 8;
#else
// need to overwrite exactly one byte
// sc->buf[ptr>>3] = _mm256_set_epi64x( 0, 0, 0, 0x80 );
sc->buf[ptr>>3] = mm256_vec_epi64( 0x80 );
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
// ptr++;
#endif
ptr += 8;
@@ -218,43 +218,43 @@ uint64_t *s= (uint64_t*)sc->state;
if ( ptr > SPH_MAXPAD )
{
memset_zero_m256i( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
RFUN( sc->buf, SPH_VAL );
memset_zero_m256i( sc->buf, SPH_MAXPAD >> 3 );
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
}
else
{
memset_zero_m256i( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
}
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_m256i( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
#elif defined PLW4
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_vec_epi64( sc->count << 3 );
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
mm256_vec_epi64( c->count >> 61 );
memset_zero_m256i( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
_mm256_set1_epi64x( c->count >> 61 );
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
2 * SPH_WLEN );
#else
sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
mm256_vec_epi64( sc->count >> 61 );
_mm256_set1_epi64x( sc->count >> 61 );
#endif // PLW
#endif // LE64
@@ -276,7 +276,7 @@ uint64_t *s= (uint64_t*)sc->state;
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_epi64( sc->val[u] );
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,209 +0,0 @@
/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
/**
* WHIRLPOOL interface.
*
* WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
* version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
* (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
* version, 2003, with a new diffusion matrix, also described as "plain
* WHIRLPOOL"). All three variants are implemented here.
*
* The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
* M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
* NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
*
* The current WHIRLPOOL specification and a reference implementation
* can be found on the WHIRLPOOL web page:
* http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_whirlpool.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_WHIRLPOOL_H__
#define SPH_WHIRLPOOL_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if SPH_64
/**
* Output size (in bits) for WHIRLPOOL.
*/
#define SPH_SIZE_whirlpool 512
/**
* Output size (in bits) for WHIRLPOOL-0.
*/
#define SPH_SIZE_whirlpool0 512
/**
* Output size (in bits) for WHIRLPOOL-1.
*/
#define SPH_SIZE_whirlpool1 512
/**
* This structure is a context for WHIRLPOOL computations: it contains the
* intermediate values and some data from the last entered block. Once
* a WHIRLPOOL computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running WHIRLPOOL computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
sph_u64 state[8];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_whirlpool_context;
/**
* Initialize a WHIRLPOOL context. This process performs no memory allocation.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool_context</code>)
*/
void sph_whirlpool_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* plain WHIRLPOOL algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL context
* @param dst the destination buffer
*/
void sph_whirlpool_close(void *cc, void *dst);
/**
* WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL.
*/
typedef sph_whirlpool_context sph_whirlpool0_context;
#ifdef DOXYGEN_IGNORE
/**
* Initialize a WHIRLPOOL-0 context. This function is identical to
* <code>sph_whirlpool_init()</code>.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool0_context</code>)
*/
void sph_whirlpool0_init(void *cc);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_whirlpool0_init sph_whirlpool_init
#endif
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* WHIRLPOOL-0 algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool0(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL-0 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL-0 context
* @param dst the destination buffer
*/
void sph_whirlpool0_close(void *cc, void *dst);
/**
* WHIRLPOOL-1 uses the same structure than plain WHIRLPOOL.
*/
typedef sph_whirlpool_context sph_whirlpool1_context;
#ifdef DOXYGEN_IGNORE
/**
* Initialize a WHIRLPOOL-1 context. This function is identical to
* <code>sph_whirlpool_init()</code>.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool1_context</code>)
*/
void sph_whirlpool1_init(void *cc);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_whirlpool1_init sph_whirlpool_init
#endif
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* WHIRLPOOL-1 algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool1(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL-1 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL-1 context
* @param dst the destination buffer
*/
void sph_whirlpool1_close(void *cc, void *dst);
#endif
#endif

View File

@@ -41,7 +41,7 @@ void whirlpool_hash_4way( void *state, const void *input )
whirlpool1_4way( &ctx, vhash, 64 );
whirlpool1_4way_close( &ctx, vhash);
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state , hash0, 32 );
memcpy( state+32, hash1, 32 );
@@ -74,7 +74,7 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]);
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// midstate
whirlpool1_4way_init( &whirl_mid );

View File

@@ -3346,7 +3346,7 @@ do { \
#define ROUND0 MUL8(ROUND0_W)
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
#define BYTE(x, n) \
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), mm256_vec_epi64( 0xFF ) )
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
// A very complex, but structured, expression with a mix of scalar
// and vector operations to retrieve specific 64 bit constants from
@@ -3359,7 +3359,7 @@ do { \
// Pack the data in a vector and return it.
#define t_row( inv, row ) \
_mm256_and_si256( \
_mm256_srli_epi64( inv, row << 3 ), mm256_vec_epi64( 0xFF ) )
_mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )
// Extract vector element from "lane" of vector "in[row]" and use it to index
// scalar array of constants "table" and return referenced 64 bit entry.
@@ -3454,7 +3454,7 @@ void
whirlpool_4way_init(void *cc)
{
whirlpool_4way_context *sc = cc;;
memset_zero_m256i( sc->state, 8 );
memset_zero_256( sc->state, 8 );
sc->count = 0;
}
@@ -3470,7 +3470,7 @@ name ## _round( const void *src, __m256i *state ) \
ROUND0; \
for (r = 0; r < 10; r ++) { \
DECL8(tmp); \
ROUND_KSCHED( type ## _T, h, tmp, mm256_vec_epi64( type ## _RC[r] ) ); \
ROUND_KSCHED( type ## _T, h, tmp, _mm256_set1_epi64x( type ## _RC[r] ) ); \
TRANSFER( h, tmp ); \
ROUND_WENC( type ## _T, n, h, tmp ); \
TRANSFER( n, tmp ); \

1188
avxdefs.h

File diff suppressed because it is too large Load Diff

View File

@@ -7,6 +7,8 @@ CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA -DFOUR_WAY" ./configure --with-
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-4way.exe
strip -s cpuminer
mv cpuminer cpuminer-4way
make clean
rm -f config.status
@@ -15,6 +17,8 @@ CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx2.exe
strip -s cpuminer
mv cpuminer cpuminer-aes-avx2
make clean || echo clean
rm -f config.status
@@ -23,6 +27,8 @@ CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx.exe
strip -s cpuminer
mv cpuminer cpuminer-aes-avx
make clean || echo clean
rm -f config.status
@@ -31,6 +37,8 @@ CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe
strip -s cpuminer
mv cpuminer cpuminer-aes-sse42
make clean || echo clean
rm -f config.status
@@ -39,6 +47,8 @@ CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse42.exe
strip -s cpuminer
mv cpuminer cpuminer-sse42
make clean || echo clean
rm -f config.status
@@ -47,6 +57,8 @@ CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse2.exe
strip -s cpuminer
mv cpuminer cpuminer-sse2
make clean || echo done

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.4.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.5.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.4'
PACKAGE_STRING='cpuminer-opt 3.7.4'
PACKAGE_VERSION='3.7.5'
PACKAGE_STRING='cpuminer-opt 3.7.5'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.4 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.7.5 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.4:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.7.5:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.7.4
cpuminer-opt configure 3.7.5
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.4, which was
It was created by cpuminer-opt $as_me 3.7.5, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.7.4'
VERSION='3.7.5'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.7.4, which was
This file was extended by cpuminer-opt $as_me 3.7.5, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.7.4
cpuminer-opt config.status 3.7.5
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.4])
AC_INIT([cpuminer-opt], [3.7.5])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -2860,45 +2860,50 @@ static void show_credits()
{
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
printf(" with AES_NI and AVX extensions.\n");
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n");
printf(" Forked from TPruvot's cpuminer-multi with credits\n");
printf(" to Lucas Jones, elmad, palmd, djm34, pooler, ig0tik3d,\n");
printf(" Wolf0, Jeff Garzik and Optiminer.\n\n");
printf(" with AES_NI and AVX2 and SHA extensions.\n");
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
}
bool check_cpu_capability ()
{
char cpu_brand[0x40];
// there is no CPU related feature specific to 4way, just AVX2 and AES
bool cpu_has_sse2 = has_sse2();
bool cpu_has_aes = has_aes_ni();
bool cpu_has_avx = has_avx1();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_sha = has_sha();
bool sw_has_sse2 = false;
// no need to check if sw has sse2,
// the code won't compile without it.
// bool sw_has_sse2 = false;
bool sw_has_aes = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_sha = false;
bool sw_has_4way = false;
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
bool algo_has_4way = set_incl( FOUR_WAY_OPT, algo_features );
bool use_aes;
bool use_sse2;
bool use_avx;
bool use_avx2;
bool use_sha;
bool use_4way;
bool use_none;
#ifdef __AES__
sw_has_aes = true;
#endif
#ifdef __SSE2__
sw_has_sse2 = true;
#endif
// #ifdef __SSE2__
// sw_has_sse2 = true;
// #endif
#ifdef __AVX__
sw_has_avx = true;;
sw_has_avx = true;
#endif
#ifdef __AVX2__
sw_has_avx2 = true;
@@ -2906,14 +2911,26 @@ bool check_cpu_capability ()
#ifdef __SHA__
sw_has_sha = true;
#endif
#ifdef HASH_4WAY
sw_has_4way = true;
#endif
#if !((__AES__) || (__SSE2__))
printf("Neither __AES__ nor __SSE2__ defined.\n");
#endif
cpu_brand_string( cpu_brand );
printf( "CPU: %s\n", cpu_brand );
printf( "CPU: %s.\n", cpu_brand );
printf("SW built on " __DATE__
#ifdef _MSC_VER
" with VC++ 2013\n");
#elif defined(__GNUC__)
" with GCC");
printf(" %d.%d.%d.\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#else
printf(".\n");
#endif
printf("CPU features:");
if ( cpu_has_sse2 ) printf( " SSE2" );
@@ -2922,48 +2939,76 @@ bool check_cpu_capability ()
if ( cpu_has_avx2 ) printf( " AVX2" );
if ( cpu_has_sha ) printf( " SHA" );
printf("\nSW built on " __DATE__
#ifdef _MSC_VER
" with VC++ 2013\n");
#elif defined(__GNUC__)
" with GCC");
printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#else
printf("\n");
#endif
printf("SW features:");
if ( sw_has_sse2 ) printf( " SSE2" );
printf(".\nSW features: SSE2");
if ( sw_has_aes ) printf( " AES" );
if ( sw_has_avx ) printf( " AVX" );
if ( sw_has_avx2 ) printf( " AVX2" );
if ( sw_has_4way ) printf( " 4WAY" );
if ( sw_has_sha ) printf( " SHA" );
// SSE2 defaults to yes regardless
printf("\nAlgo features: SSE2");
printf(".\nAlgo features:");
if ( algo_has_sse2 ) printf( " SSE2" );
if ( algo_has_aes ) printf( " AES" );
if ( algo_has_avx ) printf( " AVX" );
if ( algo_has_avx2 ) printf( " AVX2" );
if ( algo_has_4way ) printf( " 4WAY" );
if ( algo_has_sha ) printf( " SHA" );
printf("\n");
printf(".\n");
use_sse2 = cpu_has_sse2 && sw_has_sse2;
// Check for CPU and build incompatibilities
if ( !cpu_has_sse2 )
{
printf( "A CPU with SSE2 is required to use cpuminer-opt\n" );
return false;
}
if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
{
if ( sw_has_4way && algo_has_4way )
printf( "A CPU with AES and AVX2 is required to use 4way!\n" );
else if ( algo_has_avx2 )
printf( "A CPU with AES and AVX2 is required!\n" );
return false;
}
if ( sw_has_avx && !( cpu_has_avx && cpu_has_aes ) )
{
printf( "A CPU with AES and AVX2 is required!\n" );
return false;
}
if ( sw_has_aes && algo_has_aes && !cpu_has_aes )
{
printf( "A CPU with AES is required!\n" );
return false;
}
if ( sw_has_sha && algo_has_sha && !cpu_has_sha )
{
printf( "A CPU with SHA is required!\n" );
return false;
}
// Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_4way = cpu_has_avx2 && sw_has_4way && algo_has_4way;
use_none = !( use_sse2 || use_aes || use_avx || use_avx2 || use_sha
|| use_4way );
if ( use_sse2 )
// Display best options
printf( "Start mining with" );
if ( use_none ) printf( " no optimizations" );
else
{
printf( "Start mining with SSE2" );
if ( use_aes ) printf( " AES" );
if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_4way ) printf( " 4WAY" );
if ( use_sha ) printf( " SHA" );
printf( "\n\n" );
}
else
printf( CL_RED "Unsupported CPU, miner will likely crash!\n\n" CL_N );
printf( ".\n\n" );
return true;
}

View File

@@ -12,6 +12,10 @@
#endif
//#endif
#if defined(FOUR_WAY) && defined(__AVX2__)
#define HASH_4WAY
#endif
#ifdef _MSC_VER
#undef USE_ASM /* to fix */
@@ -499,6 +503,7 @@ enum algos {
ALGO_HODL,
ALGO_JHA,
ALGO_KECCAK,
ALGO_KECCAKC,
ALGO_LBRY,
ALGO_LUFFA,
ALGO_LYRA2RE,
@@ -568,6 +573,7 @@ static const char* const algo_names[] = {
"hodl",
"jha",
"keccak",
"keccakc",
"lbry",
"luffa",
"lyra2re",
@@ -690,7 +696,8 @@ Options:\n\
hmq1725 Espers\n\
hodl Hodlcoin\n\
jha jackppot (Jackpotcoin)\n\
keccak Keccak\n\
keccak Maxcoin\n\
keccakc Creative Coin\n\
lbry LBC, LBRY Credits\n\
luffa Luffa\n\
lyra2re lyra2\n\

View File

@@ -1,26 +0,0 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4
strip -s cpuminer