This commit is contained in:
Jay D Dee
2017-12-08 15:39:28 -05:00
parent 4b57ac0eb9
commit af1c940919
53 changed files with 1324 additions and 4790 deletions

View File

@@ -9,18 +9,18 @@
void blakehash_4way(void *state, const void *input)
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
@@ -32,7 +32,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -49,7 +49,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
// we need big endian data...
swab32_array( endiandata, pdata, 20 );
m128_interleave_4x32( vdata, endiandata, endiandata, endiandata,
mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
endiandata, 640 );
uint32_t *noncep = vdata + 76; // 19*4

View File

@@ -13,11 +13,12 @@ bool register_blake_algo( algo_gate_t* gate )
// gate->scanhash = (void*)&scanhash_blake_8way;
// gate->hash = (void*)&blakehash_8way;
#if defined(BLAKE_4WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;
four_way_not_tested();
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_blake;
gate->hash = (void*)&blakehash;
#endif

View File

@@ -536,22 +536,22 @@ do { \
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_epi32( *(buf + 0) ); \
M[0x1] = mm_byteswap_epi32( *(buf + 1) ); \
M[0x2] = mm_byteswap_epi32( *(buf + 2) ); \
M[0x3] = mm_byteswap_epi32( *(buf + 3) ); \
M[0x4] = mm_byteswap_epi32( *(buf + 4) ); \
M[0x5] = mm_byteswap_epi32( *(buf + 5) ); \
M[0x6] = mm_byteswap_epi32( *(buf + 6) ); \
M[0x7] = mm_byteswap_epi32( *(buf + 7) ); \
M[0x8] = mm_byteswap_epi32( *(buf + 8) ); \
M[0x9] = mm_byteswap_epi32( *(buf + 9) ); \
M[0xA] = mm_byteswap_epi32( *(buf + 10) ); \
M[0xB] = mm_byteswap_epi32( *(buf + 11) ); \
M[0xC] = mm_byteswap_epi32( *(buf + 12) ); \
M[0xD] = mm_byteswap_epi32( *(buf + 13) ); \
M[0xE] = mm_byteswap_epi32( *(buf + 14) ); \
M[0xF] = mm_byteswap_epi32( *(buf + 15) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
M[0x3] = mm_byteswap_32( *(buf + 3) ); \
M[0x4] = mm_byteswap_32( *(buf + 4) ); \
M[0x5] = mm_byteswap_32( *(buf + 5) ); \
M[0x6] = mm_byteswap_32( *(buf + 6) ); \
M[0x7] = mm_byteswap_32( *(buf + 7) ); \
M[0x8] = mm_byteswap_32( *(buf + 8) ); \
M[0x9] = mm_byteswap_32( *(buf + 9) ); \
M[0xA] = mm_byteswap_32( *(buf + 10) ); \
M[0xB] = mm_byteswap_32( *(buf + 11) ); \
M[0xC] = mm_byteswap_32( *(buf + 12) ); \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -601,22 +601,22 @@ do { \
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M0 = mm_byteswap_epi32( * buf ); \
M1 = mm_byteswap_epi32( *(buf+1) ); \
M2 = mm_byteswap_epi32( *(buf+2) ); \
M3 = mm_byteswap_epi32( *(buf+3) ); \
M4 = mm_byteswap_epi32( *(buf+4) ); \
M5 = mm_byteswap_epi32( *(buf+5) ); \
M6 = mm_byteswap_epi32( *(buf+6) ); \
M7 = mm_byteswap_epi32( *(buf+7) ); \
M8 = mm_byteswap_epi32( *(buf+8) ); \
M9 = mm_byteswap_epi32( *(buf+9) ); \
MA = mm_byteswap_epi32( *(buf+10) ); \
MB = mm_byteswap_epi32( *(buf+11) ); \
MC = mm_byteswap_epi32( *(buf+12) ); \
MD = mm_byteswap_epi32( *(buf+13) ); \
ME = mm_byteswap_epi32( *(buf+14) ); \
MF = mm_byteswap_epi32( *(buf+15) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -722,22 +722,22 @@ do { \
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_epi64( *(buf+0) ); \
M[0x1] = mm256_byteswap_epi64( *(buf+1) ); \
M[0x2] = mm256_byteswap_epi64( *(buf+2) ); \
M[0x3] = mm256_byteswap_epi64( *(buf+3) ); \
M[0x4] = mm256_byteswap_epi64( *(buf+4) ); \
M[0x5] = mm256_byteswap_epi64( *(buf+5) ); \
M[0x6] = mm256_byteswap_epi64( *(buf+6) ); \
M[0x7] = mm256_byteswap_epi64( *(buf+7) ); \
M[0x8] = mm256_byteswap_epi64( *(buf+8) ); \
M[0x9] = mm256_byteswap_epi64( *(buf+9) ); \
M[0xA] = mm256_byteswap_epi64( *(buf+10) ); \
M[0xB] = mm256_byteswap_epi64( *(buf+11) ); \
M[0xC] = mm256_byteswap_epi64( *(buf+12) ); \
M[0xD] = mm256_byteswap_epi64( *(buf+13) ); \
M[0xE] = mm256_byteswap_epi64( *(buf+14) ); \
M[0xF] = mm256_byteswap_epi64( *(buf+15) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
M[0x3] = mm256_byteswap_64( *(buf+3) ); \
M[0x4] = mm256_byteswap_64( *(buf+4) ); \
M[0x5] = mm256_byteswap_64( *(buf+5) ); \
M[0x6] = mm256_byteswap_64( *(buf+6) ); \
M[0x7] = mm256_byteswap_64( *(buf+7) ); \
M[0x8] = mm256_byteswap_64( *(buf+8) ); \
M[0x9] = mm256_byteswap_64( *(buf+9) ); \
M[0xA] = mm256_byteswap_64( *(buf+10) ); \
M[0xB] = mm256_byteswap_64( *(buf+11) ); \
M[0xC] = mm256_byteswap_64( *(buf+12) ); \
M[0xD] = mm256_byteswap_64( *(buf+13) ); \
M[0xE] = mm256_byteswap_64( *(buf+14) ); \
M[0xF] = mm256_byteswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -787,22 +787,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_byteswap_epi64( *(buf + 0) ); \
M1 = mm256_byteswap_epi64( *(buf + 1) ); \
M2 = mm256_byteswap_epi64( *(buf + 2) ); \
M3 = mm256_byteswap_epi64( *(buf + 3) ); \
M4 = mm256_byteswap_epi64( *(buf + 4) ); \
M5 = mm256_byteswap_epi64( *(buf + 5) ); \
M6 = mm256_byteswap_epi64( *(buf + 6) ); \
M7 = mm256_byteswap_epi64( *(buf + 7) ); \
M8 = mm256_byteswap_epi64( *(buf + 8) ); \
M9 = mm256_byteswap_epi64( *(buf + 9) ); \
MA = mm256_byteswap_epi64( *(buf + 10) ); \
MB = mm256_byteswap_epi64( *(buf + 11) ); \
MC = mm256_byteswap_epi64( *(buf + 12) ); \
MD = mm256_byteswap_epi64( *(buf + 13) ); \
ME = mm256_byteswap_epi64( *(buf + 14) ); \
MF = mm256_byteswap_epi64( *(buf + 15) ); \
M0 = mm256_byteswap_64( *(buf + 0) ); \
M1 = mm256_byteswap_64( *(buf + 1) ); \
M2 = mm256_byteswap_64( *(buf + 2) ); \
M3 = mm256_byteswap_64( *(buf + 3) ); \
M4 = mm256_byteswap_64( *(buf + 4) ); \
M5 = mm256_byteswap_64( *(buf + 5) ); \
M6 = mm256_byteswap_64( *(buf + 6) ); \
M7 = mm256_byteswap_64( *(buf + 7) ); \
M8 = mm256_byteswap_64( *(buf + 8) ); \
M9 = mm256_byteswap_64( *(buf + 9) ); \
MA = mm256_byteswap_64( *(buf + 10) ); \
MB = mm256_byteswap_64( *(buf + 11) ); \
MC = mm256_byteswap_64( *(buf + 12) ); \
MD = mm256_byteswap_64( *(buf + 13) ); \
ME = mm256_byteswap_64( *(buf + 14) ); \
MF = mm256_byteswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -870,7 +870,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
if ( len < buf_size - ptr )
{
memcpy_m128i( buf + (ptr>>2), vdata, len>>2 );
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
@@ -884,7 +884,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_m128i( buf + (ptr>>2), vdata, clen>>2 );
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
@@ -936,32 +936,32 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if ( ptr <= 48 )
{
memset_zero_m128i( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_m128i( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
memset_zero_m128i( u.buf, 56>>2 );
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_epi32( sc->H[k] );
out[k] = mm_byteswap_32( sc->H[k] );
}
#if defined (__AVX2__)
@@ -995,7 +995,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
@@ -1009,7 +1009,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
@@ -1062,44 +1062,44 @@ blake64_4way_close( blake_4way_big_context *sc,
}
if ( ptr <= 104 )
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 ) );
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
*(u.buf+(120>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
memset_zero_m256i( u.buf, 112>>3 );
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 );
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
*(u.buf+(120>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_byteswap_epi64( sc->H[k] );
out[k] = mm256_byteswap_64( sc->H[k] );
}
#endif

View File

@@ -13,17 +13,17 @@ static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
m128_deinterleave_4x32( sin0, sin1, sin2, sin3, (uint32_t*)input, 180*8 );
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
void *tail = input + DECRED_MIDSTATE_LEN;
int tail_len = 180 - DECRED_MIDSTATE_LEN;
@@ -53,7 +53,7 @@ void decred_hash_4way( void *state, const void *input )
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
/*
for ( int i = 0; i < 8; i++ )
if ( hash[i] != hash0[i] )
@@ -79,7 +79,7 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t vdata[45*4] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) endiandata[48];
// uint32_t _ALIGN(64) hash32[8];
uint32_t *pdata = work->data;
@@ -97,7 +97,8 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
// memcpy(endiandata, pdata, 180);
m128_interleave_4x32( vdata, pdata, pdata, pdata, pdata, 180*8 );
// use the old way until new way updated for size.
mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {

View File

@@ -144,7 +144,8 @@ bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
bool register_decred_algo( algo_gate_t* gate )
{
#if defined(DECRED_4WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_decred_4way;
gate->hash = (void*)&decred_hash_4way;
#else
@@ -153,9 +154,6 @@ bool register_decred_algo( algo_gate_t* gate )
gate->hash = (void*)&decred_hash;
#endif
// gate->optimizations = SSE2_OPT;
// gate->scanhash = (void*)&scanhash_decred;
// gate->hash = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->display_extra_data = (void*)&decred_decode_extradata;

View File

@@ -30,13 +30,13 @@ extern void pentablakehash_4way( void *output, const void *input )
blake512_4way_close( &ctx, vhash );
uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
m256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
sph_blake512_context ctx2_blake;
sph_blake512_init(&ctx2_blake);
sph_blake512(&ctx2_blake, sin0, 80);
sph_blake512_close(&ctx2_blake, (void*) hash);
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
uint64_t* hash64 = (uint64_t*)hash;
for( int i = 0; i < 8; i++ )
{
@@ -60,7 +60,7 @@ for( int i = 0; i < 8; i++ )
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
@@ -141,7 +141,7 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
{

View File

@@ -3,13 +3,13 @@
bool register_pentablake_algo( algo_gate_t* gate )
{
#if defined (PENTABLAKE_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_pentablake_4way;
gate->hash = (void*)&pentablakehash_4way;
#else
gate->scanhash = (void*)&scanhash_pentablake;
gate->hash = (void*)&pentablakehash;
#endif
gate->optimizations = FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -95,13 +95,13 @@ extern "C"{
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
x3 = mm256_bitnot( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_bitnot( x2 ) ) ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_bitnot( x1 ), x2 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_bitnot( x3 ) ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
@@ -532,7 +532,7 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
if ( len < (buf_size - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
@@ -546,7 +546,7 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
len -= clen;
@@ -579,7 +579,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
else
numz = 112 - sc->ptr;
memset_zero_m256i( buf+1, (numz>>3) - 1 );
memset_zero_256( buf+1, (numz>>3) - 1 );
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
l1 = SPH_T64(sc->block_count >> 55);
@@ -593,7 +593,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
for ( u=0; u < 8; u++ )
buf[u] = sc->H[u+8];
memcpy_m256i( dst256, buf, 8 );
memcpy_256( dst256, buf, 8 );
}
void

View File

@@ -1,11 +1,12 @@
#if defined(JHA_4WAY)
#include "jha-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
//#include "avxdefs.h"
#if defined(JHA_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
@@ -15,19 +16,19 @@
//static __thread keccak512_4way_context jha_kec_mid
// __attribute__ ((aligned (64)));
void jha_hash_4way( void *output, const void *input )
void jha_hash_4way( void *out, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhasha[8*4] __attribute__ ((aligned (64)));
uint64_t vhashb[8*4] __attribute__ ((aligned (64)));
__m256i mask;
__m256i* vh256 = (__m256i*)vhash;
__m256i* vha256 = (__m256i*)vhasha;
__m256i* vhb256 = (__m256i*)vhashb;
uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
__m256i mask0, mask1;
__m256i* vh = (__m256i*)vhash;
__m256i* vh0 = (__m256i*)vhash0;
__m256i* vh1 = (__m256i*)vhash1;
blake512_4way_context ctx_blake;
hashState_groestl ctx_groestl;
@@ -40,21 +41,29 @@ void jha_hash_4way( void *output, const void *input )
keccak512_4way_close( &ctx_keccak, vhash );
// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
// keccak512_4way( &ctx_keccak, input+64, 16 );
// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
// keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
for ( int round = 0; round < 3; round++ )
{
memset_zero_m256i( vha256, 20 );
memset_zero_m256i( vhb256, 20 );
// memset_zero_256( vh0, 20 );
// memset_zero_256( vh1, 20 );
mask = _mm256_sub_epi64( _mm256_and_si256( vh256[0],
mm256_vec_epi64( 0x1 ) ), mm256_vec_epi64( 0x1 ) );
// positive logic, if maski select vhi
// going from bit to mask reverses logic such that if the test bit is set
// zero will be put in mask0, meaning don't take vh0. mask1 is
// inverted so 1 will be put in mask1 meaning take it.
mask0 = mm256_negate_64(
_mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
mask1 = mm256_not( mask0 );
// mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
// _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
// groestl (serial) v skein
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
@@ -71,58 +80,66 @@ void jha_hash_4way( void *output, const void *input )
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(char*)hash3, 512 );
m256_interleave_4x64( vhasha, hash0, hash1, hash2, hash3, 512 );
mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
// skein
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhashb );
skein512_4way_close( &ctx_skein, vhash1 );
// merge vectored hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_bitnot(mask ) );
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
*/
}
// blake v jh
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, vhash, 64 );
blake512_4way_close( &ctx_blake, vhasha );
blake512_4way_close( &ctx_blake, vhash0 );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhashb );
jh512_4way_close( &ctx_jh, vhash1 );
// merge vectored hash
// merge hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_bitnot(mask ) );
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
*/
}
}
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
// memcpy( output, hash0, 32 );
// memcpy( output+32, hash1, 32 );
// memcpy( output+64, hash2, 32 );
// memcpy( output+96, hash3, 32 );
}
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
@@ -160,7 +177,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate for keccak
// keccak512_4way_init( &jha_kec_mid );

View File

@@ -3,15 +3,16 @@
bool register_jha_algo( algo_gate_t* gate )
{
//#if defined (JHA_4WAY)
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
// gate->scanhash = (void*)&scanhash_jha_4way;
// gate->hash = (void*)&jha_hash_4way;
//#else
gate->optimizations = SSE2_OPT | AES_OPT;
#if defined (JHA_4WAY)
four_way_not_tested();
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_jha_4way;
gate->hash = (void*)&jha_hash_4way;
#else
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_jha;
gate->hash = (void*)&jha_hash;
//#endif
#endif
gate->set_target = (void*)&scrypt_set_target;
return true;
};

View File

@@ -9,19 +9,17 @@
#define JHA_4WAY
#endif
//#if defined JHA_4WAY
//void jha_hash_4way( void *state, const void *input );
#if defined JHA_4WAY
void jha_hash_4way( void *state, const void *input );
//int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
//#else
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void jha_hash( void *state, const void *input );
int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
//#endif
#endif

View File

@@ -1,39 +1,30 @@
#include "keccak-gate.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#ifdef KECCAK_4WAY
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
void keccakhash_4way(void *state, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
uint64_t vhash[4*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way( &ctx, input, 80 );
keccak256_4way_close( &ctx, vhash );
keccak256_4way_init( &ctx );
keccak256_4way( &ctx, input, 80 );
keccak256_4way_close( &ctx, vhash );
m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -52,7 +43,7 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64x( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;

View File

@@ -9,19 +9,38 @@ int64_t keccak_get_max64() { return 0x7ffffLL; }
bool register_keccak_algo( algo_gate_t* gate )
{
gate->optimizations = FOUR_WAY_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->set_target = (void*)&keccak_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
#if defined (KECCAK_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
#endif
return true;
};
void keccakc_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_keccakc_algo( algo_gate_t* gate )
{
gate->optimizations = FOUR_WAY_OPT;
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
gate->set_target = (void*)&keccakc_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
#if defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
#endif
return true;
};

View File

@@ -1,5 +1,5 @@
#ifndef __KECCAK_GATE_H__
#define __KECCAK_GATE_H__
#ifndef KECCAK_GATE_H__
#define KECCAK_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>

View File

@@ -54,10 +54,6 @@ static const sph_u64 RC[] = {
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
#define mm256_neg1 \
(_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
0xffffffffffffffff, 0xffffffffffffffff ) )
#define DECL64(x) __m256i x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
@@ -403,7 +399,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
if ( len < (lim - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
kc->ptr = ptr + len;
return;
}
@@ -416,7 +412,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
clen = (lim - ptr);
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
@@ -453,7 +449,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
memset_zero_m256i( u.tmp + 1, (j>>3) - 2 );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000,
0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
}
@@ -467,7 +463,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
NOT64( kc->w[20], kc->w[20] );
for ( j = 0; j < m256_len; j++ )
u.tmp[j] = kc->w[j];
memcpy_m256i( dst, u.tmp, m256_len );
memcpy_256( dst, u.tmp, m256_len );
}
void keccak256_4way_init( void *kc )

View File

@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes )
{
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) );
casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
{
// padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_byteswap_epi32( cast_m128i( data ) ) );
mm_byteswap_32( cast_m128i( data ) ) );
}
else
{
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
rnd512( state, zero, zero );
@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
}
#else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
rnd512( state, zero, zero );
@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 2 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
}
#endif

View File

@@ -377,7 +377,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;
/*
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
@@ -385,7 +385,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
#else
memset(wholeMatrix, 0, i);
#endif
*/
uint64_t *ptrWord = wholeMatrix;
//=== Getting the password + salt + basil padded with 10*1 ==========//

View File

@@ -128,34 +128,10 @@ void lyra2re_set_target ( struct work* work, double job_diff )
work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
}
/*
bool lyra2re_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
lyra2re_wholeMatrix = _mm_malloc( i, 64 );
if ( lyra2re_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)lyra2re_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)lyra2re_wholeMatrix, i/16 );
#else
memset( lyra2re_wholeMatrix, 0, i );
#endif
return true;
}
*/
bool register_lyra2re_algo( algo_gate_t* gate )
{
init_lyra2re_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
// gate->miner_thread_init = (void*)&lyra2re_thread_init;
gate->scanhash = (void*)&scanhash_lyra2re;
gate->hash = (void*)&lyra2re_hash;
gate->get_max64 = (void*)&lyra2re_get_max64;

View File

@@ -132,23 +132,13 @@ bool lyra2rev2_thread_init()
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
if ( l2v2_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)l2v2_wholeMatrix, i/32 );
#elif defined (__AVX__)
memset_zero_m128i( (__m128i*)l2v2_wholeMatrix, i/16 );
#else
memset( l2v2_wholeMatrix, 0, i );
#endif
return true;
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
init_lyra2rev2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;

168
algo/lyra2/lyra2z-4way.c Normal file
View File

@@ -0,0 +1,168 @@
#include "lyra2z-gate.h"
#ifdef LYRA2Z_4WAY
#include <memory.h>
#include <mm_malloc.h>
//#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
//#include "avxdefs.h"
// same size, only difference is the name, lyra2 is done serially
__thread uint64_t* lyra2z_4way_matrix;
bool lyra2z_4way_thread_init()
{
return ( lyra2z_4way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
blake256_4way_init( &l2z_4way_blake_mid );
blake256_4way( &l2z_4way_blake_mid, input, 64 );
}
// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void lyra2z_4way_hash( void *state, const void *input )
{
// uint32_t _ALIGN(64) hash[16];
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
// memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
// blake256_4way( &ctx_blake, input + (64*4), 16 );
// blake256_4way_close( &ctx_blake, vhash );
blake256_4way_init( &ctx_blake );
blake256_4way( &ctx_blake, input, 80 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
// memcpy(state, hash, 32);
}
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
// uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
// lyra2z_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2z_4way_hash( hash, vdata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
printf("found 0\n");
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
/* if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
*/
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
printf("found 2\n");
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
/*
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
*/
n += 2;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
/*
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
*/

28
algo/lyra2/lyra2z-gate.c Normal file
View File

@@ -0,0 +1,28 @@
#include "lyra2z-gate.h"
#include "lyra2.h"
void lyra2z_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2z_algo( algo_gate_t* gate )
{
#ifdef LYRA2Z_4WAY
four_way_not_tested();
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
#else
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
return true;
};

33
algo/lyra2/lyra2z-gate.h Normal file
View File

@@ -0,0 +1,33 @@
#ifndef LYRA2Z_GATE_H__
#define LYRA2Z_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY)
#define LYRA2Z_4WAY
#endif
#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8
#if defined(LYRA2Z_4WAY)
void lyra2z_4way_hash( void *state, const void *input );
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_4way_thread_init();
#endif
void lyra2z_hash( void *state, const void *input );
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_thread_init();
#endif

View File

@@ -1,40 +1,49 @@
#include <memory.h>
#include <mm_malloc.h>
#include "algo-gate-api.h"
#include "lyra2z-gate.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"
__thread uint64_t* zcoin_wholeMatrix;
__thread uint64_t* lyra2z_matrix;
static __thread sph_blake256_context zcoin_blake_mid;
void zcoin_midstate( const void* input )
bool lyra2z_thread_init()
{
sph_blake256_init( &zcoin_blake_mid );
sph_blake256( &zcoin_blake_mid, input, 64 );
// const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
const int i = BLOCK_LEN_INT64 * 8 * 8 * 8;
lyra2z_matrix = _mm_malloc( i, 64 );
return lyra2z_matrix;
}
static __thread sph_blake256_context lyra2z_blake_mid;
void lyra2z_midstate( const void* input )
{
sph_blake256_init( &lyra2z_blake_mid );
sph_blake256( &lyra2z_blake_mid, input, 64 );
}
// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void zcoin_hash(void *state, const void *input )
void lyra2z_hash( void *state, const void *input )
{
uint32_t _ALIGN(64) hash[16];
sph_blake256_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &zcoin_blake_mid, sizeof zcoin_blake_mid );
memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof lyra2z_blake_mid );
sph_blake256( &ctx_blake, input + 64, 16 );
sph_blake256_close( &ctx_blake, hash );
LYRA2Z( zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
memcpy(state, hash, 32);
}
int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(64) hash[8];
@@ -52,11 +61,11 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]);
}
zcoin_midstate( endiandata );
lyra2z_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
zcoin_hash( hash, endiandata );
lyra2z_hash( hash, endiandata );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
@@ -73,50 +82,41 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
/*
//int64_t get_max64_0xffffLL() { return 0xffffLL; };
void zcoin_set_target( struct work* work, double job_diff )
void lyra2z_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
/*
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
*/
bool zcoin_thread_init()
bool lyra2z_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
zcoin_wholeMatrix = _mm_malloc( i, 64 );
lyra2z_wholeMatrix = _mm_malloc( i, 64 );
if ( zcoin_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)zcoin_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)zcoin_wholeMatrix, i/16 );
#else
memset( zcoin_wholeMatrix, 0, i );
#endif
return true;
return lyra2z_wholeMatrix;
}
bool register_zcoin_algo( algo_gate_t* gate )
bool register_lyra2z_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&zcoin_thread_init;
gate->scanhash = (void*)&scanhash_zcoin;
gate->hash = (void*)&zcoin_hash;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&zcoin_set_target;
gate->set_target = (void*)&lyra2z_set_target;
// gate->prevent_dupes = (void*)&zcoin_get_work_height;
return true;
};
*/

View File

@@ -64,22 +64,12 @@ bool lyra2z330_thread_init()
int i = (int64_t)ROW_LEN_BYTES * 330; // nRows;
lyra2z330_wholeMatrix = _mm_malloc( i, 64 );
if ( lyra2z330_wholeMatrix == NULL )
return false;
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)lyra2z330_wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)lyra2z330_wholeMatrix, i/16 );
#else
memset( lyra2z330_wholeMatrix, 0, i );
#endif
return true;
return lyra2z330_wholeMatrix;
}
bool register_lyra2z330_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z330_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z330;
gate->hash = (void*)&lyra2z330_hash;

View File

@@ -130,12 +130,12 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
memcpy_m256i( out, state, BLOCK_LEN_M256I );
memcpy_256( out, state, BLOCK_LEN_M256I );
LYRA_ROUND_AVX2( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I;
}
//Squeezes remaining bytes
memcpy_m256i( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
#elif defined (__AVX__)
@@ -148,13 +148,13 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
memcpy_m128i( out, state, BLOCK_LEN_M128I );
memcpy_128( out, state, BLOCK_LEN_M128I );
LYRA_ROUND_AVX( state[0], state[1], state[2], state[3],
state[4], state[5], state[6], state[7] );
out += BLOCK_LEN_M128I;
}
//Squeezes remaining bytes
memcpy_m128i( out, state, ( len_m128i % BLOCK_LEN_M128I ) );
memcpy_128( out, state, ( len_m128i % BLOCK_LEN_M128I ) );
#else

View File

@@ -66,11 +66,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotl256_1x64( s1); \
s2 = mm256_swap128( s2 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotr256_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotr256_1x64( s1 ); \
s2 = mm256_swap128( s2 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotl256_1x64( s3 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
@@ -105,14 +105,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rotl256_1x64( s2, s3 ); \
mm128_swap128( s4, s5 ); \
mm128_rotr256_1x64( s6, s7 ); \
mm_rotl256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotr256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rotr256_1x64( s2, s3 ); \
mm128_swap128( s4, s5 ); \
mm128_rotl256_1x64( s6, s7 );
mm_rotr256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotl256_1x64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -15,7 +15,7 @@
// no improvement with midstate
//static __thread blake512_4way_context ctx_mid;
void nist5hash_4way( void *output, const void *input )
void nist5hash_4way( void *out, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
@@ -35,7 +35,7 @@ void nist5hash_4way( void *output, const void *input )
blake512_4way( &ctx_blake, input, 80 );
blake512_4way_close( &ctx_blake, vhash );
m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
@@ -50,7 +50,7 @@ void nist5hash_4way( void *output, const void *input )
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(const char*)hash3, 512 );
m256_interleave_4x64x( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
@@ -64,12 +64,7 @@ void nist5hash_4way( void *output, const void *input )
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhash );
m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
}
int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -109,7 +104,7 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate
// blake512_4way_init( &ctx_mid );

View File

@@ -2,12 +2,11 @@
bool register_nist5_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
#if defined (NIST5_4WAY)
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_nist5_4way;
gate->hash = (void*)&nist5hash_4way;
#else
gate->optimizations = SSE2_OPT | AES_OPT;
init_nist5_ctx();
gate->scanhash = (void*)&scanhash_nist5;
gate->hash = (void*)&nist5hash;

View File

@@ -20,7 +20,7 @@ void skeinhash_4way( void *state, const void *input )
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
@@ -38,21 +38,20 @@ void skeinhash_4way( void *state, const void *input )
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
memcpy( (char*)state, (char*)hash0, 32 );
memcpy( ((char*)state) + 32, (char*)hash1, 32 );
memcpy( ((char*)state) + 64, (char*)hash2, 32 );
memcpy( ((char*)state) + 96, (char*)hash3, 32 );
memcpy( state, hash0, 32 );
memcpy( state + 32, hash1, 32 );
memcpy( state + 64, hash2, 32 );
memcpy( state + 96, hash3, 32 );
}
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *edata = (uint64_t*)endiandata;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
@@ -63,9 +62,9 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
// data is 80 bytes, 20 u32 or 4 u64.
swab32_array( endiandata, pdata, 20 );
swab32_array( edata, pdata, 20 );
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;

View File

@@ -6,8 +6,8 @@ int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
#if defined (SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#else

View File

@@ -463,7 +463,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
if ( len <= buf_size - ptr )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
sc->ptr = ptr + len;
return;
}
@@ -483,7 +483,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
len -= clen;
@@ -520,11 +520,11 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
READ_STATE_BIG(sc);
memset_zero_m256i( buf + (ptr>>3), (buf_size - ptr) >> 3 );
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
memset_zero_m256i( buf, buf_size >> 3 );
memset_zero_256( buf, buf_size >> 3 );
bcount = 0;
UBI_BIG_4WAY( 510, 8 );
@@ -537,7 +537,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
buf[6] = h6;
buf[7] = h7;
memcpy_m256i( dst, buf, out_len >> 3 );
memcpy_256( dst, buf, out_len >> 3 );
}
static const sph_u64 IV256[] = {

View File

@@ -19,13 +19,13 @@ void skein2hash_4way( void *output, const void *input )
skein512_4way( &ctx, hash, 64 );
skein512_4way_close( &ctx, hash );
m256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
mm256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
}
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint64_t *edata = (uint64_t*)endiandata;
@@ -41,7 +41,7 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( endiandata, pdata, 20 );
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;

View File

@@ -9,12 +9,12 @@ int64_t skein2_get_max64 ()
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = FOUR_WAY_OPT;
#if defined (FOUR_WAY) && defined (__AVX2__)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
four_way_not_tested();
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif

View File

@@ -31,7 +31,7 @@ void tribus_hash_4way(void *state, const void *input)
keccak512_4way( &ctx_keccak, vhash, 64 );
keccak512_4way_close( &ctx_keccak, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// hash echo serially
init_echo( &ctx_echo, 512 );
@@ -92,7 +92,7 @@ int scanhash_tribus_4way(int thr_id, struct work *work, uint32_t max_nonce, uint
}
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate
// doing it one way then then interleaving would be faster but too

View File

@@ -14,15 +14,13 @@ bool tribus_thread_init()
*/
bool register_tribus_algo( algo_gate_t* gate )
{
// gate->miner_thread_init = (void*)&tribus_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x1ffff;
#if defined (TRIBUS_4WAY)
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_tribus_4way;
gate->hash = (void*)&tribus_hash_4way;
#else
gate->miner_thread_init = (void*)&tribus_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_tribus;
gate->hash = (void*)&tribus_hash;
#endif

View File

@@ -140,7 +140,7 @@ HASH ( void *cc, const void *data, size_t len )
clen = SPH_BLEN - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( sc->buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
@@ -195,19 +195,19 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
uint64_t *b= (uint64_t*)sc->buf;
uint64_t *s= (uint64_t*)sc->state;
//uint64_t *b= (uint64_t*)sc->buf;
//uint64_t *s= (uint64_t*)sc->state;
//printf("Vptr 1= %u\n", ptr);
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
#ifdef PW01
sc->buf[ptr>>3] = mm256_vec_epi64( 0x100 >> 8 );
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 );
// sc->buf[ptr++] = 0x100 >> 8;
#else
// need to overwrite exactly one byte
// sc->buf[ptr>>3] = _mm256_set_epi64x( 0, 0, 0, 0x80 );
sc->buf[ptr>>3] = mm256_vec_epi64( 0x80 );
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
// ptr++;
#endif
ptr += 8;
@@ -218,43 +218,43 @@ uint64_t *s= (uint64_t*)sc->state;
if ( ptr > SPH_MAXPAD )
{
memset_zero_m256i( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
RFUN( sc->buf, SPH_VAL );
memset_zero_m256i( sc->buf, SPH_MAXPAD >> 3 );
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
}
else
{
memset_zero_m256i( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
}
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_m256i( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
#elif defined PLW4
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_vec_epi64( sc->count << 3 );
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
mm256_vec_epi64( c->count >> 61 );
memset_zero_m256i( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
_mm256_set1_epi64x( c->count >> 61 );
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
2 * SPH_WLEN );
#else
sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
mm256_vec_epi64( sc->count >> 61 );
_mm256_set1_epi64x( sc->count >> 61 );
#endif // PLW
#endif // LE64
@@ -276,7 +276,7 @@ uint64_t *s= (uint64_t*)sc->state;
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_epi64( sc->val[u] );
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,209 +0,0 @@
/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
/**
* WHIRLPOOL interface.
*
* WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
* version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
* (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
* version, 2003, with a new diffusion matrix, also described as "plain
* WHIRLPOOL"). All three variants are implemented here.
*
* The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
* M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
* NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
*
* The current WHIRLPOOL specification and a reference implementation
* can be found on the WHIRLPOOL web page:
* http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_whirlpool.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_WHIRLPOOL_H__
#define SPH_WHIRLPOOL_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if SPH_64
/**
* Output size (in bits) for WHIRLPOOL.
*/
#define SPH_SIZE_whirlpool 512
/**
* Output size (in bits) for WHIRLPOOL-0.
*/
#define SPH_SIZE_whirlpool0 512
/**
* Output size (in bits) for WHIRLPOOL-1.
*/
#define SPH_SIZE_whirlpool1 512
/**
* This structure is a context for WHIRLPOOL computations: it contains the
* intermediate values and some data from the last entered block. Once
* a WHIRLPOOL computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running WHIRLPOOL computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
sph_u64 state[8];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_whirlpool_context;
/**
* Initialize a WHIRLPOOL context. This process performs no memory allocation.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool_context</code>)
*/
void sph_whirlpool_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* plain WHIRLPOOL algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL context
* @param dst the destination buffer
*/
void sph_whirlpool_close(void *cc, void *dst);
/**
* WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL.
*/
typedef sph_whirlpool_context sph_whirlpool0_context;
#ifdef DOXYGEN_IGNORE
/**
* Initialize a WHIRLPOOL-0 context. This function is identical to
* <code>sph_whirlpool_init()</code>.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool0_context</code>)
*/
void sph_whirlpool0_init(void *cc);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_whirlpool0_init sph_whirlpool_init
#endif
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* WHIRLPOOL-0 algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool0(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL-0 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL-0 context
* @param dst the destination buffer
*/
void sph_whirlpool0_close(void *cc, void *dst);
/**
* WHIRLPOOL-1 uses the same structure than plain WHIRLPOOL.
*/
typedef sph_whirlpool_context sph_whirlpool1_context;
#ifdef DOXYGEN_IGNORE
/**
* Initialize a WHIRLPOOL-1 context. This function is identical to
* <code>sph_whirlpool_init()</code>.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool1_context</code>)
*/
void sph_whirlpool1_init(void *cc);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_whirlpool1_init sph_whirlpool_init
#endif
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* WHIRLPOOL-1 algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool1(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL-1 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL-1 context
* @param dst the destination buffer
*/
void sph_whirlpool1_close(void *cc, void *dst);
#endif
#endif

View File

@@ -41,7 +41,7 @@ void whirlpool_hash_4way( void *state, const void *input )
whirlpool1_4way( &ctx, vhash, 64 );
whirlpool1_4way_close( &ctx, vhash);
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state , hash0, 32 );
memcpy( state+32, hash1, 32 );
@@ -74,7 +74,7 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]);
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// midstate
whirlpool1_4way_init( &whirl_mid );

View File

@@ -3346,7 +3346,7 @@ do { \
#define ROUND0 MUL8(ROUND0_W)
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
#define BYTE(x, n) \
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), mm256_vec_epi64( 0xFF ) )
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
// A very complex, but structured, expression with a mix of scalar
// and vector operations to retrieve specific 64 bit constants from
@@ -3359,7 +3359,7 @@ do { \
// Pack the data in a vector and return it.
#define t_row( inv, row ) \
_mm256_and_si256( \
_mm256_srli_epi64( inv, row << 3 ), mm256_vec_epi64( 0xFF ) )
_mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )
// Extract vector element from "lane" of vector "in[row]" and use it to index
// scalar array of constants "table" and return referenced 64 bit entry.
@@ -3454,7 +3454,7 @@ void
whirlpool_4way_init(void *cc)
{
whirlpool_4way_context *sc = cc;;
memset_zero_m256i( sc->state, 8 );
memset_zero_256( sc->state, 8 );
sc->count = 0;
}
@@ -3470,7 +3470,7 @@ name ## _round( const void *src, __m256i *state ) \
ROUND0; \
for (r = 0; r < 10; r ++) { \
DECL8(tmp); \
ROUND_KSCHED( type ## _T, h, tmp, mm256_vec_epi64( type ## _RC[r] ) ); \
ROUND_KSCHED( type ## _T, h, tmp, _mm256_set1_epi64x( type ## _RC[r] ) ); \
TRANSFER( h, tmp ); \
ROUND_WENC( type ## _T, n, h, tmp ); \
TRANSFER( n, tmp ); \